In [80]:
# Index the AP89 collection in Elasticsearch, then read and run all queries

# Required Imports
import os
import math
import string
from elasticsearch import Elasticsearch 
from nltk.stem import PorterStemmer

# Elasticsearch client; every index, search and term-vector call below goes through it
es = Elasticsearch(['http://localhost:9200/'])

# Name of the index that the cells below create, fill and query
index_name = "ap89"

# Porter stemmer used to strip the suffixes from an English word and obtain its stem
parser = PorterStemmer()

# Path to the directory containing the collection files read by parse_docs
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere
directory_path = '/Users/reelataher/hw1-Reela-Taher/IR_data/AP_DATA/ap89_collection'

In [3]:
# Location of the stop list file
stopword_path = '/Users/reelataher/hw1-Reela-Taher/IR_data/AP_DATA/stoplist.txt'

# Read the whitespace-separated stop words, drop duplicates through a set,
# and keep the result as a list
with open(stopword_path) as stop_file:
  stop_words = list(set(stop_file.read().split()))

In [4]:
def parse_docs(directory=None):
    """Parse every file of a collection directory into a list of documents.

    Each file is read line by line. A document starts at a line beginning with
    '<DOC>' and is stored when a line beginning with '</DOC>' is reached. The
    document ID comes from the '<DOCNO>' line; the body is the content of all
    '<TEXT>' ... '</TEXT>' sections of the document joined with a space.

    Args:
        directory: Directory holding the collection files. Defaults to the
            notebook-level ``directory_path``.

    Returns:
        A list of dicts with the keys 'DOCNO' and 'text', ordered by file name
        and then by position inside each file.
    """
    # Fall back to the notebook-level collection path when none is given
    if directory is None:
        directory = directory_path

    # Initialize a list to store all extracted documents
    documents = []

    # Sort the names so the document order is reproducible between runs
    for filename in sorted(os.listdir(directory)):

        # Construct the full path of the file
        file_path = os.path.join(directory, filename)

        # Sub-directories cannot be opened as text files, so skip them
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='ISO-8859-1') as file:

            # State of the document currently being read
            single_doc = {'DOCNO': '', 'text': ''}
            texts = []
            inside_text = False
            text_lines = []

            for line in file:

                # Beginning of a document: start from a clean state
                if line.startswith('<DOC>'):
                    single_doc = {'DOCNO': '', 'text': ''}
                    texts = []
                    inside_text = False

                # Document ID line
                elif line.startswith('<DOCNO>'):
                    single_doc['DOCNO'] = line.split('<DOCNO>')[1].split('</DOCNO>')[0].strip()

                # Beginning of a body section
                elif line.startswith('<TEXT>'):
                    inside_text = True
                    text_lines = []

                # End of a body section: keep what was collected
                elif line.startswith('</TEXT>'):
                    inside_text = False
                    texts.append(''.join(text_lines).strip())

                # Body line; collected in a list to avoid repeated string copies
                elif inside_text:
                    text_lines.append(line + ' ')

                # End of the document: store it
                elif line.startswith('</DOC>'):
                    single_doc['text'] = ' '.join(texts)
                    documents.append(single_doc)

    # Returns a list of all parsed documents
    return documents

In [5]:
# Index settings and mappings handed to es.indices.create.
# The "text" field is analyzed with the custom "stopped" analyzer defined here:
# standard tokenizer, then lowercasing, then removal of the words in stop_words.
ap89_index = {
  "settings": { # Custom analysis components of the index
    "analysis": {
      "filter": {
        "english_stop": {
          "type": "stop",
          "stopwords": stop_words # Custom stop filter using predefined stopwords 
        }
      },
      "analyzer": { 
        "stopped": {
           "type": "custom",
           "tokenizer": "standard",
           "filter": [
             "lowercase",
             "english_stop"
           ]
        } 
      }
    }
  },
  "mappings": { # Field definitions of the indexed documents
    "properties": {
      "text": {
        "type": "text",
        "term_vector": "yes",
        "analyzer": "stopped", 
        "index_options": "positions"  
      }
    }
  }
}

In [6]:
# Parse documents
docs = parse_docs()

# Create the index only when it does not exist yet, so this cell can be re-run:
# Elasticsearch rejects a create request for an index that already exists
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=ap89_index)

In [None]:
# Index documents

# Built once: table that deletes every punctuation character, and a set of the
# stop words so that each membership test is a hash lookup
punct_table = str.maketrans('', '', string.punctuation)
stop_word_set = set(stop_words)

for doc in docs:

  # Lowercase, strip punctuation and split on whitespace
  tokens = doc['text'].lower().translate(punct_table).split()

  # Drop stop words, then stem what is left
  stemmed_words = [parser.stem(word) for word in tokens if word not in stop_word_set]

  # Index a copy carrying the processed text; the parsed document is left
  # untouched so re-running this cell does not stem already-stemmed text
  indexed_doc = {**doc, 'text': ' '.join(stemmed_words)}

  es.index(index=index_name, body=indexed_doc, id=doc['DOCNO'])

In [210]:
# Path to the query file itself (not a directory); it is read by read_query
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere
query_path = '/Users/reelataher/hw1-Reela-Taher/IR_data/AP_DATA/query_desc.51-100.short.txt'

In [106]:
def process_query(query, stopwords=None, stemmer=None):
    """Normalize a raw query string.

    The query is lowercased, stripped of punctuation, split on any run of
    whitespace, filtered against the stop words and stemmed word by word.

    Args:
        query: Raw query text.
        stopwords: Iterable of words to drop. Defaults to the notebook-level
            ``stop_words``.
        stemmer: Object with a ``stem(word)`` method. Defaults to the
            notebook-level ``parser``.

    Returns:
        The processed terms joined by single spaces ('' when nothing is left).
    """
    if stopwords is None:
        stopwords = stop_words
    if stemmer is None:
        stemmer = parser

    # A set makes every stop word test a hash lookup
    stopword_set = set(stopwords)

    # Lowercase and remove punctuation
    cleaned = query.lower().translate(str.maketrans('', '', string.punctuation))

    # split() without an argument ignores repeated spaces, tabs and newlines,
    # so no empty tokens end up in the processed query
    words = cleaned.split()

    # Remove stop words and stem the remaining words
    stemmed = [stemmer.stem(word) for word in words if word not in stopword_set]

    # Join back to string
    return " ".join(stemmed)

In [108]:
def read_query(filename):
    """Read the query file and return the processed queries.

    Every usable line has the form '<number>. <query text>'. The text after
    the first period is passed through process_query.

    Args:
        filename: Path of the query file.

    Returns:
        A dict mapping the query number (as a string) to the processed query.
        Lines that are blank, contain no period, or have nothing before the
        first period are skipped.
    """
    # Initialize empty dictionary to store queries
    queries = {}

    # Read file line by line
    with open(filename, 'r') as file:

        for line in file:

            # partition never raises: without a period the separator is ''
            number_part, separator, text_part = line.partition('.')
            query_number = number_part.strip()

            # Skip blank and malformed lines instead of failing on them
            if not separator or not query_number:
                continue

            # Store processed query text with query number as key
            queries[query_number] = process_query(text_part.strip())

    # Return dictionary of queries
    return queries

In [11]:
def es_search(query):
    """Run a match query for `query` on the 'text' field, asking for 1000 hits.

    Returns the list found under hits -> hits in the response, or an empty
    list when the response has no such entry.
    """
    match_body = {'query': {'match': {'text': query}}}
    response = es.search(index=index_name, body=match_body, size=1000)

    hits_section = response.get('hits', {})
    return hits_section.get('hits', [])

In [12]:
def get_vectors(doc_id):
    """Request the term vectors of one document through the mtermvectors API.

    Term statistics and field statistics are both requested. The raw response
    is returned unchanged.
    """
    statistics = {"term_statistics": True, "field_statistics": True}
    request_body = {"ids": [doc_id], "parameters": statistics}

    return es.mtermvectors(index=index_name, body=request_body)

In [13]:
# Collection statistics gathered from the term vectors returned by get_vectors:
#   doc_lengths       DOCNO -> sum of the term frequencies of the document
#   term_frequencies  DOCNO -> {term: frequency of the term in the document}
#   total_length      sum of all recorded document lengths
#   total_docs        number of documents that returned term vectors
# The loop variables (doc, vectors_data, term_info, doc_length) stay defined
# at notebook level after the loop has finished.
doc_lengths = {}
term_frequencies = {}
total_length = 0
total_docs = 0 

# One term-vector request is sent per parsed document
for doc in docs:

    # Get term vectors for this document
    vectors_data = get_vectors(doc['DOCNO'])

    # Skip documents whose response has no term vectors for the 'text' field
    if 'term_vectors' not in vectors_data['docs'][0] or 'text' not in vectors_data['docs'][0]['term_vectors']:
        continue

    # Access the term vectors
    term_info = vectors_data['docs'][0]['term_vectors']['text']['terms']

    # Calculate the document length as the sum of its term frequencies
    doc_length = sum(info['term_freq'] for info in term_info.values())

    # Add to total length
    total_length += doc_length

    # Increment count
    total_docs += 1

    # Store the document length
    doc_lengths[doc['DOCNO']] = doc_length

    # Store term frequencies for each term in the document
    term_frequencies[doc['DOCNO']] = {term: info['term_freq'] for term, info in term_info.items()}

# Calculate average length of documents
# NOTE(review): raises ZeroDivisionError when no document returned term vectors
average_length = total_length / total_docs

In [211]:
# Read queries from the specified file (query number -> processed query text)
queries = read_query(query_path)

# Document frequency of every term: the number of documents in
# term_frequencies whose term dictionary contains the term
doc_frequencies = {}
unique_terms = set() 

# Iterate through documents and their term frequencies
for doc, terms in term_frequencies.items():
    
    # Iterate through terms in the document
    for term in terms:
        # Update document frequency for the term
        doc_frequencies[term] = doc_frequencies.get(term, 0) + 1
        unique_terms.add(term)

# Store the number of unique terms in the collection
V = len(unique_terms)

# Precompute IDF for all terms in the queries (term -> idf)
query_idfs = {}

# Iterate through queries
for query_number, query_text in queries.items():
    
    # Split the query text into individual terms
    query_terms = query_text.split()

    # Iterate through terms in the query
    for term in query_terms:
        
        # Get the document frequency (df) for the term
        df = doc_frequencies.get(term, 0)

        # Skip if document frequency is 0 (term not present in the corpus)
        if df == 0:
            continue

        # IDF is the natural log of (number of parsed documents / df)
        idf = math.log(len(docs) / df)

        # Update the query IDF dictionary with the computed IDF for the term

        query_idfs[term] = idf

In [94]:
def es_built_score(doc):
  """Return the value stored under '_score' in the given hit."""
  builtin_score = doc['_score']
  return builtin_score

def okapi_tf_score(doc_id):
  """Return the Okapi TF score of a document for the current query.

  For every term of the notebook-level ``query_terms`` the value
  tf / (tf + 0.5 + 1.5 * len(d) / avg_len) is computed and the values are
  summed, where tf is the frequency of the term in the document.

  Args:
      doc_id: DOCNO of the document; must be a key of ``term_frequencies``
          and ``doc_lengths``.

  Returns:
      The summed Okapi TF score.
  """
  doc_terms = term_frequencies[doc_id]

  # Use the length of the document being scored, not whatever value a
  # notebook-level doc_length variable happens to hold
  length_ratio = doc_lengths[doc_id] / average_length

  # Initialize total TF score
  tf_score = 0

  # Loop through each query term
  for term in query_terms:

    # Get the term frequency for this term (0 when the term is absent)
    term_freq = doc_terms.get(term, 0)

    # Add the TF score for this term to the total
    tf_score += term_freq / (term_freq + 0.5 + 1.5 * length_ratio)

  # Return the total summed TF score
  return tf_score

def tfidf_score(doc_id):
    """Return the TF-IDF score of a document for the current query.

    The score is the sum, over the notebook-level ``query_terms``, of the
    Okapi TF of the term in the document multiplied by the IDF of that term
    taken from ``query_idfs`` (0 when the term has no IDF entry).

    Args:
        doc_id: DOCNO of the document; must be a key of ``term_frequencies``
            and ``doc_lengths``.

    Returns:
        The TF-IDF score as a float.
    """
    doc_terms = term_frequencies[doc_id]
    length_ratio = doc_lengths[doc_id] / average_length

    tfidf_doc_score = 0.0

    # Loop through each query term
    for term in query_terms:

        # Okapi TF of this single term, so that each term is weighted by its
        # own IDF instead of the whole TF sum being multiplied by every IDF
        term_freq = doc_terms.get(term, 0)
        okapi_tf = term_freq / (term_freq + 0.5 + 1.5 * length_ratio)

        # Get the IDF for the term, using 0 if not present
        tfidf_doc_score += okapi_tf * query_idfs.get(term, 0)

    return tfidf_doc_score

# BM25 constants read by bm25_score:
#   k1 appears in the document term-frequency component
#   k2 appears in the query term-frequency component
#   b  weights the document length / average length ratio
k1 = 1.2
k2 = 100
b = 0.75

def bm25_score(doc_id):
    """Return the BM25 score of a document for the current query.

    For every distinct term of the notebook-level ``query_terms`` the product
    of three components is added up:
      * log((D + 0.5) / (df + 0.5)), with D = len(docs)
      * (tf_d + k1 * tf_d) / (tf_d + k1 * ((1 - b) + b * len(d) / avg_len))
      * (tf_q + k2 * tf_q) / (tf_q + k2)
    where tf_d / tf_q are the term frequencies in the document / the query.

    Args:
        doc_id: DOCNO of the document; must be a key of ``term_frequencies``
            and ``doc_lengths``.

    Returns:
        The BM25 score.
    """
    doc_terms = term_frequencies[doc_id]
    doc_length = doc_lengths[doc_id]

    # Length normalisation only depends on the document, so compute it once
    length_norm = k1 * ((1 - b) + b * (doc_length / average_length))

    total_score = 0

    # Visit each distinct term once: repetitions inside the query are already
    # accounted for by tf_q, so looping over duplicates would count them twice.
    # dict.fromkeys keeps the first-seen order of the terms.
    for term in dict.fromkeys(query_terms):

        # Get term frequencies
        tf_d = doc_terms.get(term, 0)
        tf_q = query_terms.count(term)

        # Document frequency (df) for the term
        df = doc_frequencies.get(term, 0)

        # Calculate BM25 formula components
        idf_component = math.log((len(docs) + 0.5) / (df + 0.5))
        tf_d_component = (tf_d + k1 * tf_d) / (tf_d + length_norm)
        tf_q_component = (tf_q + k2 * tf_q) / (tf_q + k2)

        # Accumulate BM25 score
        total_score += idf_component * tf_d_component * tf_q_component

    return total_score

def lml_score(doc_id):
    """Return the unigram LM score with Laplace smoothing for a document.

    For every term of the notebook-level ``query_terms`` that occurs in the
    document, log((tf + 1) / (len(d) + V)) is added; for every term that does
    not occur, a fixed penalty of -1000 is added instead.

    NOTE(review): the penalty replaces the smoothed probability for absent
    terms - confirm this is the intended behaviour.
    """
    length = doc_lengths[doc_id]
    total = 0

    for query_term in query_terms:
        count = term_frequencies[doc_id].get(query_term, 0)

        # Smoothed probability of the term in this document
        smoothed = (count + 1) / (length + V)

        # Absent terms get the hard penalty, present terms their log probability
        total += math.log(smoothed) if count != 0 else -1000

    return total

# Jelinek-Mercer weight read by lm_jm_score: the document model is multiplied
# by lambda_jm and the background model by (1 - lambda_jm)
lambda_jm = 0.9 

# Collection frequency per term, filled lazily by _collection_frequency.
# Re-run this cell after rebuilding term_frequencies to start with an empty cache.
_collection_freq_cache = {}


def _collection_frequency(term):
    """Return how often `term` occurs across all documents in term_frequencies."""
    if term not in _collection_freq_cache:
        _collection_freq_cache[term] = sum(
            doc_terms.get(term, 0) for doc_terms in term_frequencies.values()
        )
    return _collection_freq_cache[term]


def lm_jm_score(doc_id):
    """Return the unigram LM score with Jelinek-Mercer smoothing for a document.

    For every term of the notebook-level ``query_terms`` the probability
        lambda_jm * tf / len(d) + (1 - lambda_jm) * cf / total_length
    is computed, where tf is the frequency of the term in the document and cf
    its frequency in the whole collection. The logs of these probabilities are
    summed; a probability of exactly 0 contributes a penalty of -1000.

    Args:
        doc_id: DOCNO of the document; must be a key of ``term_frequencies``
            and ``doc_lengths``.

    Returns:
        The summed log probability.
    """
    score = 0

    doc_length = doc_lengths[doc_id]
    doc_terms = term_frequencies[doc_id]

    # Iterate through query terms
    for term in query_terms:

        # Get term frequency in this document
        tf_w_d = doc_terms.get(term, 0)

        # Background model: term_frequencies is keyed by DOCNO, so the term's
        # collection frequency has to be summed over the per-document counts
        bg_model = _collection_frequency(term) / total_length
        p_jm = lambda_jm * (tf_w_d / doc_length) + (1 - lambda_jm) * bg_model

        # A term unseen in the whole collection has probability 0: penalize it
        if p_jm == 0:
            score += -1000
        else:
            # Accumulate log probability
            score += math.log(p_jm)

    return score


In [22]:
# Maps each scoring model name to the function that computes its score.
# 'es_built' maps to a function that takes a hit; the other entries map to
# functions that take a document number.
scoring_function = {
  'es_built': es_built_score,
  'okapi': okapi_tf_score,
  'tfidf': tfidf_score, 
  'BM25': bm25_score,
  'LML' : lml_score,
  'LMJM' : lm_jm_score
}

In [23]:
def process_results(query_number, hits, scoring_model):
  """Score the hits of one query and format them as ranked output lines.

  Args:
      query_number: Identifier written in the first column of every line.
      hits: Hits as returned by es_search; each must hold the document number
          under ['_source']['DOCNO'].
      scoring_model: Key of the notebook-level ``scoring_function`` mapping
          ('es_built', 'okapi', 'tfidf', 'BM25', 'LML' or 'LMJM').

  Returns:
      A list of lines '<query_number> Q0 <docno> <rank> <score> Exp\\n',
      ordered by descending score, with rank 1 given to the highest score.
      An empty list when there are no hits.

  Raises:
      ValueError: If there are hits and scoring_model is not a known model.
  """
  # Nothing to score
  if not hits:
    return []

  # Fail with a clear message instead of an unbound-variable error
  if scoring_model not in scoring_function:
    raise ValueError(f"Unknown scoring model: {scoring_model!r}")
  scorer = scoring_function[scoring_model]

  scored = []
  for hit in hits:
    docno = hit['_source']['DOCNO']

    # The built-in model reads the score from the hit itself; every other
    # model looks the document up by its number
    score = scorer(hit) if scoring_model == "es_built" else scorer(docno)
    scored.append((docno, score))

  # Rank by the computed score, not by the order Elasticsearch returned the
  # hits in; the sort is stable, so ties keep their retrieval order
  scored.sort(key=lambda pair: pair[1], reverse=True)

  return [
    f"{query_number} Q0 {docno} {rank} {score} Exp\n"
    for rank, (docno, score) in enumerate(scored, start=1)
  ]

In [18]:
# Directory in which output_txt creates the result files
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere
output_path = '/Users/reelataher/hw1-Reela-Taher/IR_data/AP_DATA'

def output_txt(filename, results):
    """Write `results` to '<output_path>/<filename>.txt', replacing any old file.

    `results` may be a string or an iterable of strings; its pieces are
    concatenated without a separator before being written.
    """
    target = '/'.join((output_path, filename + '.txt'))
    content = ''.join(results)
    with open(target, 'w') as out_file:
        out_file.write(content)

In [204]:
# Collects the output lines of every query for the ES built-in run
es_string = ''

# Run every query and keep its lines in descending score order
for query_number, query_text in queries.items():

    # Retrieve the hits for this query
    hits = es_search(query_text)

    # Turn the hits into output lines using the "es_built" model
    es_results = process_results(query_number, hits, "es_built")

    # Queries without results add nothing to the output
    if not es_results:
        continue

    # The score is the second to last field of each line
    es_results.sort(key=lambda line: float(line.split()[-2]), reverse=True)
    es_string = es_string + ''.join(es_results)

# Write ES Built-in output file
output_txt("es_results", es_string)


In [205]:
# Collects the output lines of every query for the Okapi TF run
okapi_string = ''

# Run every query and keep its lines in descending score order
for query_number, query_text in queries.items():

    # The scoring functions read the current query from this notebook-level name
    query_terms = query_text.split()

    # Retrieve the hits for this query
    hits = es_search(query_text)

    # Turn the hits into output lines using the "okapi" model
    okapi_results = process_results(query_number, hits, "okapi")

    # Queries without results add nothing to the output
    if not okapi_results:
        continue

    # The score is the second to last field of each line
    okapi_results.sort(key=lambda line: float(line.split()[-2]), reverse=True)
    okapi_string = okapi_string + ''.join(okapi_results)

# Write Okapi output file
output_txt("okapi_results", okapi_string)

In [206]:
# Collects the output lines of every query for the TF-IDF run
tfidf_string = ''

# Run every query and keep its lines in descending score order
for query_number, query_text in queries.items():

    # The scoring functions read the current query from this notebook-level name
    query_terms = query_text.split()

    # Retrieve the hits for this query
    hits = es_search(query_text)

    # Turn the hits into output lines using the "tfidf" model
    tfidf_results = process_results(query_number, hits, "tfidf")

    # Queries without results add nothing to the output
    if not tfidf_results:
        continue

    # The score is the second to last field of each line
    tfidf_results.sort(key=lambda line: float(line.split()[-2]), reverse=True)
    tfidf_string = tfidf_string + ''.join(tfidf_results)

# Write TF-IDF output file
output_txt("tfidf_results", tfidf_string)

In [207]:
# Collects the output lines of every query for the BM25 run
bm25_string = ''

# Reload the queries from the query file
queries = read_query(query_path)

# Run every query and keep its lines in descending score order
for query_number, query_text in queries.items():

    # The scoring functions read the current query from this notebook-level name
    query_terms = query_text.split()

    # Retrieve the hits for this query
    hits = es_search(query_text)

    # Turn the hits into output lines using the "BM25" model
    bm25_results = process_results(query_number, hits, "BM25")

    # Queries without results add nothing to the output
    if not bm25_results:
        continue

    # The score is the second to last field of each line
    bm25_results.sort(key=lambda line: float(line.split()[-2]), reverse=True)
    bm25_string = bm25_string + ''.join(bm25_results)

# Write BM25 output file
output_txt("bm25_results", bm25_string)

In [208]:
# Collects the output lines of every query for the LM Laplace run
lm_laplace_string = ''

# Run every query and keep its lines in descending score order
for query_number, query_text in queries.items():

    # The scoring functions read the current query from this notebook-level name
    query_terms = query_text.split()

    # Retrieve the hits for this query
    hits = es_search(query_text)

    # Turn the hits into output lines using the "LML" model
    lm_laplace_results = process_results(query_number, hits, "LML")

    # Queries without results add nothing to the output
    if not lm_laplace_results:
        continue

    # The score is the second to last field of each line
    lm_laplace_results.sort(key=lambda line: float(line.split()[-2]), reverse=True)
    lm_laplace_string = lm_laplace_string + ''.join(lm_laplace_results)

# Write LM Laplace output file
output_txt("lml_results", lm_laplace_string)

In [209]:
# Collects the output lines of every query for the LM Jelinek-Mercer run
lm_jm_string = ''

# Run every query and keep its lines in descending score order
for query_number, query_text in queries.items():

  # The scoring functions read the current query from this notebook-level name
  query_terms = query_text.split()

  # Retrieve the hits for this query
  hits = es_search(query_text)

  # Turn the hits into output lines using the "LMJM" model
  lm_jm_results = process_results(query_number, hits, "LMJM")

  # Queries without results add nothing to the output
  if not lm_jm_results:
    continue

  # The score is the second to last field of each line
  lm_jm_results.sort(key=lambda line: float(line.split()[-2]), reverse=True)
  lm_jm_string = lm_jm_string + ''.join(lm_jm_results)

# Write JM output file
output_txt("lmjm_results", lm_jm_string)