In [None]:
# Required Imports
from elasticsearch import Elasticsearch 
from parser import parse_docs
import string

In [None]:
# Path to the stop word list file
stopword_path = '/Users/reelataher/hw1-Reela-Taher/IR_data/AP_DATA/stoplist.txt'  

# Read the whitespace-separated stop words, drop duplicates via a set, and keep
# the result as a list (it is later embedded in the index settings dictionary)
with open(stopword_path) as stoplist_file:
  stop_words = list(set(stoplist_file.read().split()))

In [None]:
# Client for the Elasticsearch node listening on localhost:9200
es = Elasticsearch(['http://localhost:9200/'])

index_name = "ap89"

# Custom stop filter using the predefined stop words loaded from file
english_stop_filter = {
  "type": "stop",
  "stopwords": stop_words
}

# Custom analyzer: standard tokenizer, then lowercasing, then the stop filter above
stopped_analyzer = {
  "type": "custom",
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "english_stop"
  ]
}

# Mapping of the document body: full-text field analyzed with the custom analyzer
text_field = {
  "type": "text",
  "analyzer": "stopped",
  "index_options": "positions"
}

# Index settings and mappings, assembled from the parts defined above
ap89_index = {
  "settings": {
    "analysis": {
      "filter": {"english_stop": english_stop_filter},
      "analyzer": {"stopped": stopped_analyzer}
    }
  },
  "mappings": {
    "properties": {"text": text_field}
  }
}

In [None]:
# Parse documents 
docs = parse_docs() 

# Create the index only when it does not exist yet, so that re-running this cell
# (e.g. Restart Kernel -> Run All) does not fail because the index is already there.
# NOTE: an existing index keeps the settings it was created with; delete it
# manually if ap89_index has changed.
if not es.indices.exists(index=index_name):
  es.indices.create(index=index_name, body=ap89_index)

In [None]:
# Initialize the term vectors dictionary
term_vectors = {} 

# Set copy of the stop words: membership tests on the list scan every entry per token
stop_word_lookup = set(stop_words)

# Translation table that deletes ASCII punctuation; built once for all documents
punctuation_table = str.maketrans('', '', string.punctuation)

# Index documents
for doc in docs:
  doc_id = doc['DOCNO']

  # Make all text lowercase and remove punctuation within text
  text = doc['text'].lower().translate(punctuation_table)

  # Filter out stop words and join back to string
  doc['text'] = ' '.join(word for word in text.split() if word not in stop_word_lookup)

  # Index the document first: term vectors can only be returned for a document
  # that already exists in the index
  es.index(index=index_name, body=doc, id=doc_id)

  # Single-document term vectors request (mtermvectors is the multi-document
  # API and does not take a single `id`)
  response = es.termvectors(index=index_name, id=doc_id, fields=['text'])

  # Store term vectors from each document into dictionary.
  # Only the per-field data is kept, tagged with the document number, so that
  # later cells can read entry['DOCNO'] and entry['text']['terms'].
  # A document whose text yields no tokens has no 'text' entry in the response;
  # it is stored with an empty term map instead.
  term_vectors[doc_id] = {
    'DOCNO': doc_id,
    'text': response.get('term_vectors', {}).get('text', {'terms': {}})
  }

In [None]:
# Initialize dictionary to store term frequencies for each term in each document
term_frequency = {}

# Calculate term frequencies for each term in each document.
# The dictionary key is used as the document id; stored entries are not
# guaranteed to carry a 'DOCNO' key of their own.
for doc_id, vector in term_vectors.items():

    # A multi-document response wraps its results in a 'docs' list; unwrap the
    # single result it holds
    if 'docs' in vector:
        vector = vector['docs'][0] if vector['docs'] else {}

    # A raw term vectors response nests the per-field data under 'term_vectors';
    # an already-trimmed entry has the field names at the top level
    fields = vector.get('term_vectors', vector)

    # A document without any indexed tokens has no 'text' entry: treat as empty
    terms = fields.get('text', {}).get('terms', {})

    # Map each term of the current document to its frequency
    term_frequency[doc_id] = {term: info['term_freq'] for term, info in terms.items()}

In [None]:
# Total corpus length: a document's length is the sum of its term frequencies
total_dl = 0
for doc_term_counts in term_frequency.values():
    total_dl += sum(doc_term_counts.values())

# Average document length over the number of parsed documents
avg_dl = total_dl / len(docs)

In [None]:
# Okapi TF: term frequency damped by the document length relative to the corpus average
def okapi_tf_score(tf, length_doc, avg_corpus_length):
    """Return the Okapi TF score of a term in a document.

    Computes tf / (tf + 0.5 + 1.5 * (length_doc / avg_corpus_length)).

    Args:
        tf: frequency of the term in the document.
        length_doc: length of the document.
        avg_corpus_length: average document length in the corpus.

    Raises:
        ZeroDivisionError: if avg_corpus_length is 0.
    """
    relative_length = length_doc / avg_corpus_length
    denominator = tf + 0.5 + 1.5 * relative_length
    return tf / denominator