In [2]:
import json
import math
from collections import defaultdict


# Locations of the precomputed JSON artefacts, relative to the working directory.
inverted_index_file_path = './IRDataset/JsonFiles/inverted_index.json'
idf_file_path = './IRDataset/JsonFiles/idf_values.json'
doc_id_mapper_path = './IRDataset/JsonFiles/docId_mapper.json'
doc_vectors_file_path = './IRDataset/JsonFiles/doc_vectors.json'


def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded Python object.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it open() falls back to the platform
    # locale encoding, which can mis-decode non-ASCII JSON on some systems.
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


# NOTE(review): schema of the inverted index is not visible here - confirm against the indexer.
inverted_index = _load_json(inverted_index_file_path)

# Looked up per query term with .get(term, 0) when the query vector is built.
idf_values = _load_json(idf_file_path)

# NOTE(review): presumably the same docId -> metadata mapping that is read again
# from disk in the details-extraction step - confirm.
doc_id_mapper = _load_json(doc_id_mapper_path)

# Iterated as doc_id -> {term: weight} when documents are scored against the query.
doc_vectors = _load_json(doc_vectors_file_path)

In [3]:
import spacy
from collections import Counter

# Load the "en_core_web_sm" spaCy pipeline once at module level; process_query()
# calls this shared object for every query instead of reloading it.
nlp = spacy.load("en_core_web_sm")


def process_query(query):
    """Turn a raw query string into a list of lemmas.

    The query is lower-cased and stripped of surrounding whitespace, then run
    through the module-level ``nlp`` pipeline. Stop-word tokens and tokens that
    are not purely alphabetic are discarded; the lemma of every remaining
    token is returned in order of appearance.
    """
    normalized = query.lower().strip()

    kept_lemmas = []
    for tok in nlp(normalized):
        # Skip stop words first, then anything containing non-alphabetic characters.
        if tok.is_stop or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_)

    return kept_lemmas


def build_query_vector(query_tokens, idf_values):
    """Build a sparse TF-IDF vector for a tokenised query.

    Term frequency is max-normalised (a term's count divided by the highest
    count of any term in the query) and multiplied by the term's IDF.

    Args:
        query_tokens: Iterable of hashable tokens. It is consumed exactly once,
            so generators and other one-shot iterables are accepted.
        idf_values: Mapping of term -> IDF weight. Terms missing from the
            mapping are given an IDF of 0 and therefore a weight of 0.

    Returns:
        dict mapping each distinct term to its weight, keyed in order of first
        appearance in the query. Empty when there are no tokens.
    """
    # Count once up front; rebuilding the Counter per term made the loop quadratic.
    term_counts = Counter(query_tokens)
    # default=1 keeps max() from raising on an empty query.
    max_term_freq = max(term_counts.values(), default=1)

    return {
        term: (count / max_term_freq) * idf_values.get(term, 0)
        for term, count in term_counts.items()
    }

def cosine_similarity(query_vector, doc_vector):
    """Return the cosine similarity between two sparse term -> weight mappings.

    Args:
        query_vector: dict mapping term -> numeric weight for the query.
        doc_vector: dict mapping term -> numeric weight for a document.

    Returns:
        float: dot(query, doc) / (|query| * |doc|). The result is 0.0 when the
        vectors share no weighted term or when either vector has zero norm.
    """
    # Walk the query terms and probe the document dict. This avoids copying
    # every document key into a set on each call and fixes the summation order.
    dot_product = 0.0
    for term, query_weight in query_vector.items():
        if term in doc_vector:
            dot_product += query_weight * doc_vector[term]

    # No overlap means similarity 0 whatever the norms are, so skip the
    # O(len(doc_vector)) norm computation for non-matching documents.
    if dot_product == 0:
        return 0.0

    query_norm = math.sqrt(sum(weight ** 2 for weight in query_vector.values()))
    doc_norm = math.sqrt(sum(weight ** 2 for weight in doc_vector.values()))

    # Squares of very small weights can underflow to 0; guard the division.
    if query_norm == 0 or doc_norm == 0:
        return 0.0

    return dot_product / (query_norm * doc_norm)

In [4]:



user_query = "Latest Technology updates in Amazon"
processed_query = process_query(user_query)
query_vector = build_query_vector(processed_query, idf_values)

# Score every document vector against the query vector.
document_scores = {
    doc_id: cosine_similarity(query_vector, doc_vector)
    for doc_id, doc_vector in doc_vectors.items()
}

# Highest similarity first. sorted() is stable, so documents with equal scores
# keep the order in which they appear in doc_vectors.
ranked_docs = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)

# Keep at most ten hits, and only documents with a positive score. Without the
# filter, a query matching fewer than ten documents has its result list padded
# with unrelated zero-score documents.
top_10_document_ids = [doc_id for doc_id, score in ranked_docs[:10] if score > 0]

results_file_path = './IRDataset/JsonFiles/query_results.json'
with open(results_file_path, 'w', encoding='utf-8') as results_file:
    json.dump(top_10_document_ids, results_file, indent=2)

print(f'Top 10 Document IDs stored in {results_file_path}')


Top 10 Document IDs stored in ./IRDataset/JsonFiles/query_results.json


In [5]:

import json

def extract_details_from_mapper(query_results_file, doc_mapper_file, output_file, query=""):
    """Look up display details for ranked document ids and write them to JSON.

    Args:
        query_results_file: Path to a JSON file holding a list of document ids.
        doc_mapper_file: Path to a JSON file mapping str(doc id) to an object
            that may carry ``URL``, ``Title`` and ``First_10_Words`` entries.
        output_file: Path of the JSON file to write.
        query: Query text stored under the ``"query"`` key of the output.
            Defaults to an empty string.

    The output file has the shape ``{"query": <query>, "details": {...}}``,
    where ``details`` maps str(doc id) to its URL, Title and First_10_Words
    (``None`` for any field the mapper entry lacks). Ids with no mapper entry
    are skipped and a warning is printed. Returns None.
    """
    # Read and write explicitly as UTF-8 so decoding never depends on the
    # platform's locale encoding.
    with open(query_results_file, 'r', encoding='utf-8') as results_file:
        doc_ids = json.load(results_file)

    with open(doc_mapper_file, 'r', encoding='utf-8') as mapper_file:
        doc_mapper = json.load(mapper_file)

    wanted_fields = ("URL", "Title", "First_10_Words")
    details = {}

    for doc_id in doc_ids:
        # JSON object keys are always strings, so normalise the id before the lookup.
        key = str(doc_id)
        if key not in doc_mapper:
            print(f"Warning: No entry found for docId {doc_id} in the docId_mapper.json file.")
            continue

        doc_info = doc_mapper[key]
        details[key] = {field: doc_info.get(field) for field in wanted_fields}

    output_data = {"query": query, "details": details}
    with open(output_file, 'w', encoding='utf-8') as output_json:
        json.dump(output_data, output_json, indent=2)


# Input: the ranked document ids written by the query step.
query_results_file_path = './IRDataset/JsonFiles/query_results.json'
# Input: the docId -> document details mapping.
doc_mapper_file_path = './IRDataset/JsonFiles/docId_mapper.json'
# Output: the details collected for the ranked documents.
output_file_path = './IRDataset/JsonFiles/output.json'

extract_details_from_mapper(
    query_results_file=query_results_file_path,
    doc_mapper_file=doc_mapper_file_path,
    output_file=output_file_path,
)
