In [1]:
# ES built-in to Read and Run All Queries

# Required Imports
from elasticsearch import Elasticsearch
import string

# Client object configured for the Elasticsearch node at http://localhost:9200/
es = Elasticsearch(['http://localhost:9200/'])

# Name of the index that es_search() sends its queries to
index_name = "ap89"

In [2]:
# Location of the whitespace-separated stop-word list
stopword_path = '/Users/reelataher/hw1-Reela-Taher/IR_data/AP_DATA/stoplist.txt'

# Read the whole file, drop duplicate entries, and keep the result as a list
with open(stopword_path) as stoplist_file:
  stop_words = list(set(stoplist_file.read().split()))

In [3]:
# Path to the query file itself (not a directory); read_query() parses it line by line
query_path = '/Users/reelataher/hw1-Reela-Taher/IR_data/AP_DATA/query_desc.51-100.short.txt'

In [4]:
# Function to preprocess queries
def process_query(query, stopwords=None):
  """Normalize raw query text into a space-separated string of content terms.

  The text is lowercased, ASCII punctuation is deleted, the remainder is
  tokenized on any run of whitespace, and stop words are dropped.

  Args:
    query: Raw query text.
    stopwords: Optional iterable of words to remove. When omitted, the
      module-level ``stop_words`` collection is used.

  Returns:
    The surviving tokens joined by single spaces (an empty string when
    nothing survives).
  """
  # Build the lookup once per call so each membership test is a hash lookup
  # rather than a linear scan of the stop-word list
  blocked = frozenset(stop_words if stopwords is None else stopwords)

  # Lowercase, then delete (not replace) punctuation: "u.s." becomes "us"
  cleaned = query.lower().translate(str.maketrans('', '', string.punctuation))

  # split() with no argument breaks on tabs/newlines as well as spaces and
  # never yields empty tokens, unlike split(' ')
  tokens = cleaned.split()

  return " ".join(token for token in tokens if token not in blocked)

In [5]:
# Function to read and return queries from txt file
def read_query(filename):
    """Read a query file and return its queries keyed by query number.

    Each non-blank line is expected to have the form ``<number>. <text>``:
    the part before the first period is the query number and the rest is the
    query text, which is passed through ``process_query`` before being stored.
    Lines that contain no period, or have nothing before it, are skipped
    rather than raising ``IndexError``.

    Args:
        filename: Path of the query file to read.

    Returns:
        dict mapping each query number (as a string) to the value returned
        by ``process_query`` for that line's text.
    """
    queries = {}

    with open(filename, 'r') as file:
        for line in file:
            # partition() always returns three parts, so a line without a
            # period yields an empty separator instead of a short list
            number_part, separator, text_part = line.partition('.')
            query_number = number_part.strip()

            # Covers blank lines as well as malformed ones
            if not separator or not query_number:
                continue

            # Store processed query text with query number as key
            queries[query_number] = process_query(text_part.strip())

    return queries

In [6]:
# Search Elasticsearch index using match query 
def es_search(query, size=1000):
    """Run a match query against the ``text`` field and return the raw hits.

    Uses the module-level ``es`` client and ``index_name``.

    Args:
        query: Query string to match.
        size: Maximum number of hits to request; defaults to 1000.

    Returns:
        The list stored under ``hits.hits`` in the response, or an empty
        list when the response lacks those keys.
    """
    request_body = {'query': {'match': {'text': query}}}

    # Send match query to Elasticsearch
    result = es.search(index=index_name, body=request_body, size=size)

    # Missing 'hits' keys fall back to an empty list instead of raising
    return result.get('hits', {}).get('hits', [])

In [7]:
# Process results from ES with specified format
def process_results(query_number, hits):
  """Format hits as "<query_number> Q0 <docno> <rank> <score> Exp" lines.

  Ranks start at 1 and follow the order of ``hits``. The document number is
  read from ``hit['_source']['DOCNO']``; a hit without a ``_score`` key is
  written with score 0. A falsy ``hits`` (None or empty) yields an empty list.

  Returns:
    List of newline-terminated output lines, one per hit.
  """
  return [
    f"{query_number} Q0 {entry['_source']['DOCNO']} {position} {entry.get('_score', 0)} Exp\n"
    for position, entry in enumerate(hits or [], start=1)
  ]

In [8]:
# Directory that output_txt() writes its result files into
output_path = '/Users/reelataher/hw1-Reela-Taher/Deliverables'

# Writes the given results string to "<output_path>/<filename>.txt"
def output_txt(filename, results):
    """Write ``results`` to ``<output_path>/<filename>.txt``.

    The file is opened in 'w' mode, so an existing file of the same name is
    overwritten.
    """
    destination = '/'.join((output_path, filename + '.txt'))
    with open(destination, 'w') as handle:
        handle.write(results)

In [11]:
# Load and preprocess every query from the query file
queries = read_query(query_path)

# Collect the formatted output lines for all queries, in query order
result_lines = []

for query_number, query_text in queries.items():

    # Retrieve the hits for this query
    hits = es_search(query_text)

    # Turn the hits into ranked output lines and keep them
    query_results = process_results(query_number, hits)
    result_lines.extend(query_results)

# Single string holding the complete run
result = ''.join(result_lines)

# Write the run to "<output_path>/query_ES_builtin.txt"
output_file = "query_ES_builtin"
output_txt(output_file, result)