## Test Environment for completing task 3

In [1]:
from elasticsearch import Elasticsearch, exceptions
from tokenizer import porter_processing

# Connect to the Elasticsearch node on localhost:9200 over plain HTTP and
# report whether it answers a ping.
es = Elasticsearch([{"host":"localhost","port":9200, "scheme": "http"}])
print(f"Elasticsearch instantiated? {es.ping()}")

Elasticsearch instantiated? True


### Getting mtermvectors data

In [2]:
import json

# Request term vectors, including corpus-wide term statistics, for a single
# sample document so the response layout can be studied.
response = es.mtermvectors(
    index="ap89_final",
    term_statistics=True,
    ids=["AP891023-0169"],
    fields="_content",
)

# Keep a pretty-printed copy of the raw response on disk.
file_path = 'mtermvectors_response.json'
with open(file_path, 'w') as out_file:
    json.dump(response, out_file, indent=2)

### Processing the query

In [3]:
from tokenizer import porter_processing

# Run the topic text through the project's porter_processing helper
# (presumably so the query terms match the stemmed tokens stored in `_content`).
query_processed = porter_processing("identify individuals or organizations which produce fiber optics equipment.")

# Plain full-text match of the processed query against the `_content` field.
query = {"query": {"match": {"_content": query_processed}}}

response = es.search(index="ap89_final", body=query)

# Keep a pretty-printed copy of the raw search response on disk.
file_path = 'queryresp_mod.json'
with open(file_path, 'w') as out_file:
    json.dump(response, out_file, indent=2)

print(response)
# To look at individual documents instead of the whole response, iterate over
# response['hits']['hits'] and read each hit's '_source'.

Entered tokenizer
Exit tokenizer
{'took': 9, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 10000, 'relation': 'gte'}, 'max_score': 24.236298, 'hits': [{'_index': 'ap89_final', '_type': '_doc', '_id': 'AP890706-0122', '_score': 24.236298, '_source': {'_content': 'follow project exampl equal urgent east west europ transeuropean highspe railway common european program new solarenergi technolog equip process store nuclear wast enhanc safeti nuclear power station addit fiber optic channel transmit inform alleuropean satellit tv system intern thermonuclear experiment reactor'}}, {'_index': 'ap89_final', '_type': '_doc', '_id': 'AP890913-0208', '_score': 22.392181, '_source': {'_content': 'commerc secretari robert mosbach call wednesday structur chang japan economi fewer trade barrier allow unit state compet japan benefit japanes consum plight japanes consum pay highest price world hous food result directli countri close m

  response = es.search(index="ap89_final", body=query)


In [4]:
# Getting a list of all the documents
import re

# Location of the doclist file used when the caller does not supply one.
DEFAULT_DOCLIST_PATH = "/home/burpcat/Documents/assignments/ir/hw1-burpcat/IR_data/AP_DATA/doclist_new_0609.txt"


def documentid_fetcher(file_path=DEFAULT_DOCLIST_PATH):
    """Return the document ids listed in a doclist file.

    Every non-blank line is split on whitespace and its second field is
    taken as the document id.

    Args:
        file_path: Path of the doclist file. Defaults to the location the
            notebook was originally written against, so existing
            zero-argument calls keep working.

    Returns:
        list[str]: Document ids in file order.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank line has fewer than two fields.
    """
    document_list = []

    with open(file_path, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only lines carry no id; skip them
                # instead of crashing on fields[1].
                continue
            document_list.append(fields[1])

    return document_list

In [6]:
# Load every document id named in the doclist file; the batched term-vector
# request below is issued for exactly these ids.
document_list = documentid_fetcher()

def get_term_vectors_batch(index_name, document_list, batch_size=50, client=None):
    """Fetch term vectors for many documents in fixed-size batches.

    One ``mtermvectors`` request is issued per slice of ``batch_size`` ids,
    asking for the ``_content`` field with term statistics enabled.

    Args:
        index_name: Name of the index to query.
        document_list: Sequence of document ids.
        batch_size: Maximum number of ids per request; must be >= 1.
        client: Object exposing ``mtermvectors(...)``. When omitted, the
            notebook-level ``es`` connection is used, which keeps existing
            three-argument calls working unchanged.

    Returns:
        list: The raw response of each request, in request order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (previously 0 failed
            inside ``range`` and negative values silently returned nothing).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if client is None:
        client = es  # fall back to the notebook-wide connection

    responses = []
    for start in range(0, len(document_list), batch_size):
        batch = document_list[start:start + batch_size]
        responses.append(
            client.mtermvectors(
                index=index_name,
                term_statistics=True,
                ids=batch,
                fields=["_content"],
            )
        )
    return responses

# Fetch term vectors for every id in document_list from the "ap89_final"
# index, 50 ids per mtermvectors request.
all_responses = get_term_vectors_batch("ap89_final", document_list, batch_size=50)

In [12]:
# Corpus size, measured as the number of ids read from the doclist file.
number_of_documents = len(document_list)
# Lookup tables filled in place by extracting_termvectors().
documents_and_vectors = {}          # document id -> {term: per-term stats dict}
documents_and_documentlength = {}   # document id -> sum of term_freq over its terms
term_to_docfrequency = {}           # term -> doc_freq reported by Elasticsearch
termtotalfrequency = {}             # term -> ttf reported by Elasticsearch
terms_list = set()                  # every distinct term seen in any document


def extracting_termvectors(all_responses):
    """Populate the module-level lookup tables from mtermvectors responses.

    Every entry of every response's ``docs`` list is processed (a batched
    request returns one entry per requested id). For each document the
    ``_content`` term vector is stored, its length is computed as the sum of
    the per-term ``term_freq`` values, and the per-term ``doc_freq`` / ``ttf``
    statistics are recorded. Documents without a ``_content`` term vector are
    stored with an empty vector and length 0.

    Args:
        all_responses: Iterable of mtermvectors responses, each a mapping
            with a ``docs`` list.

    Returns:
        None. The module-level tables are mutated in place.

    Raises:
        KeyError: If a term entry lacks ``doc_freq``, ``ttf`` or ``term_freq``
            (e.g. the request was made without term statistics).
    """
    for response in all_responses:
        for doc in response['docs']:
            document_id = doc["_id"]
            # Entries may come back without a "term_vectors" key; treat that
            # the same as a document with no `_content` vector.
            field_vectors = doc.get("term_vectors", {})

            if "_content" not in field_vectors:
                documents_and_vectors[document_id] = {}
                # Item assignment (not rebinding) so the module-level table is updated.
                documents_and_documentlength[document_id] = 0
                continue

            terms = field_vectors["_content"]["terms"]

            for term, stats in terms.items():
                term_to_docfrequency[term] = stats['doc_freq']
                termtotalfrequency[term] = stats['ttf']

            documents_and_vectors[document_id] = terms
            documents_and_documentlength[document_id] = sum(
                stats['term_freq'] for stats in terms.values()
            )

            # Record every term of the document, not just the last one iterated.
            terms_list.update(terms)


In [13]:
# Fill the module-level lookup tables from the fetched mtermvectors responses.
extracting_termvectors(all_responses)

In [18]:
# Corpus-level totals derived from the lookup tables.
# (An average document length would be total_doc_length / number_of_documents.)
total_doc_length = sum(documents_and_documentlength.values())
vocab_size = len(terms_list)

In [19]:
# Return document frequency for a term

def documentfrequency(term):
    """Look up the recorded document frequency of ``term``.

    Returns the value stored in ``term_to_docfrequency``, or 0 when the term
    has not been recorded.
    """
    return term_to_docfrequency.get(term, 0)

def termfrequency(term, document_id):
    """Return how often ``term`` occurs in the document ``document_id``.

    Reads the ``term_freq`` entry from the document's stored term vector and
    yields 0 when the term is absent from that vector. An unknown
    ``document_id`` raises ``KeyError``.
    """
    vector = documents_and_vectors[document_id]
    return vector[term]['term_freq'] if term in vector else 0
    
def termfrequency_in_query(term, query):
    """Count how many times ``term`` occurs as a whole token in ``query``.

    Args:
        term: The token to count.
        query: Either a whitespace-delimited string of tokens or a sequence
            of tokens (anything exposing ``count``).

    Returns:
        int: Number of tokens equal to ``term``.
    """
    if isinstance(query, str):
        # str.count matches substrings ("optic" would also hit "optics"),
        # so compare against whole whitespace-delimited tokens instead.
        return query.split().count(term)
    return query.count(term)