# Query Processing 🌼

In [1]:
# Spell checking
# pip install pyspellchecker 

from spellchecker import SpellChecker
from nltk.tokenize import word_tokenize
from typing import List  # Import the List type from the typing module
import re #to remove stop words
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

class QueryProcessing:
    """Static helpers that turn a raw query into a list of index terms.

    The pipeline implemented by :meth:`query_processing` is: strip non-word
    characters, lowercase, tokenize, spell-correct, stem, lemmatize, then
    drop stop words and non-alphanumeric tokens.
    """

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Shared SpellChecker, created on first use instead of once per call so
    # that processing many queries does not rebuild the checker every time.
    _spell_checker = None

    @staticmethod
    def _get_spell_checker():
        """Return the shared SpellChecker, creating it on first use."""
        if QueryProcessing._spell_checker is None:
            QueryProcessing._spell_checker = SpellChecker()
        return QueryProcessing._spell_checker

    @staticmethod
    def correct_sentence_spelling(tokens: List[str]) -> List[str]:
        """Replace tokens unknown to the spell checker with its correction.

        Args:
            tokens: Tokens to check. The list is modified in place.

        Returns:
            The same list object, with every unknown token replaced by the
            checker's suggestion. A token is left untouched when the checker
            has no suggestion for it (``correction`` returned ``None``).
        """
        spell = QueryProcessing._get_spell_checker()
        misspelled = spell.unknown(tokens)
        if not misspelled:
            return tokens

        for i, token in enumerate(tokens):
            if token in misspelled:
                corrected = spell.correction(token)
                if corrected is not None:
                    tokens[i] = corrected
        return tokens

    @staticmethod
    def query_processing(query_text) -> List[str]:
        """Normalize *query_text* and return its processed tokens.

        Args:
            query_text: The raw query. It is converted with ``str()`` first,
                so non-string values are accepted.

        Returns:
            The list of stemmed and lemmatized tokens that are alphanumeric
            and are not English stop words.
        """
        # Normalization: non-word characters become spaces, runs of
        # whitespace collapse to a single space, everything is lowercased.
        query_text = re.sub(r'\W', ' ', str(query_text))
        query_text = re.sub(r'\s+', ' ', query_text)
        query_text = query_text.lower()

        # Tokenize into words and fix spelling mistakes.
        words = word_tokenize(query_text)
        words = QueryProcessing.correct_sentence_spelling(words)

        # Stemming followed by lemmatization of the stemmed form.
        # NOTE(review): stop words are filtered only after stemming, so a
        # stop word whose stem differs from the word itself is not removed.
        # The order is kept because the stored TF-IDF model was presumably
        # built with the same pipeline -- confirm before changing it.
        stemmed = [QueryProcessing.stemmer.stem(word) for word in words]
        lemmatized = [QueryProcessing.lemmatizer.lemmatize(word) for word in stemmed]

        # The NLTK stop-word file id is lowercase ('english'); 'English' only
        # resolves on case-insensitive file systems. Build the set once per
        # call rather than re-reading the corpus list for every token.
        stop_words = set(stopwords.words('english'))

        # Drop stop words and any token that is not purely alphanumeric
        # (for example tokens containing '_', which r'\W' does not strip).
        return [
            word for word in lemmatized
            if word not in stop_words and word.isalnum()
        ]



# Matching & Ranking 🌼

In [35]:
import pickle
from QueryProcessing import QueryProcessing
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

class MatchingRanking:
    """Rank the documents of the recreation collection against a query.

    The collection CSV is read once, when the class body is executed. The
    pickled TF-IDF matrix and vectorizer are loaded on the first call to
    :meth:`matching_and_ranking` and reused by later calls.
    """

    # Convert the documents into a list of strings
    df = pd.read_csv('recreation-collection.csv', encoding='latin-1')
    document_strings = list(df['doc'])

    # Document ids, in the same row order as document_strings.
    document_IDs = list(df['num'])

    # Lazily loaded artifacts shared by every query (see _load_artifacts).
    _tfidf_matrix = None
    _vectorizer = None

    @staticmethod
    def _load_artifacts():
        """Return ``(tfidf_matrix, vectorizer)``, unpickling them on first use.

        Loading once instead of on every query avoids re-reading both files
        from disk for each ranked query.
        """
        if MatchingRanking._tfidf_matrix is None or MatchingRanking._vectorizer is None:
            # SECURITY: pickle.load executes arbitrary code embedded in the
            # file; only load artifacts produced by this project.
            with open('tfidf_matrix.bin', 'rb') as file:
                MatchingRanking._tfidf_matrix = pickle.load(file)
            with open('model.pkl', 'rb') as file:
                MatchingRanking._vectorizer = pickle.load(file)
        return MatchingRanking._tfidf_matrix, MatchingRanking._vectorizer

    @staticmethod
    def matching_and_ranking(query_text, k=10):
        """Return the ids of the *k* documents most similar to *query_text*.

        Args:
            query_text: The raw query; it is preprocessed with
                ``QueryProcessing.query_processing`` before vectorization.
            k: Maximum number of document ids to return (default 10, the
                value that used to be hard-coded).

        Returns:
            A list of at most *k* entries of ``document_IDs``, ordered by
            decreasing cosine similarity to the query.

        Raises:
            ValueError: If *k* is negative.
        """
        if k < 0:
            raise ValueError(f"k must be non-negative, got {k}")

        tfidf_matrix, vectorizer = MatchingRanking._load_artifacts()

        # Preprocess the query; query_processing is a static method, so no
        # QueryProcessing instance is needed.
        query_tokens = QueryProcessing.query_processing(query_text)

        # The vectorizer expects a string, so join the tokens back together.
        query_string = ' '.join(query_tokens)

        # Convert the query document to a TF-IDF vector
        query_vector = vectorizer.transform([query_string])

        # Calculate cosine similarity between the query vector and document vectors
        cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

        # argsort is ascending, so reverse it and keep the k best positions.
        top_indices = np.argsort(cosine_similarities)[::-1][:k]

        # Translate matrix row positions into document ids.
        return [MatchingRanking.document_IDs[idx] for idx in top_indices]

# Call the matching_and_ranking() method to execute the code
# MatchingRanking.matching_and_ranking("real world we dont use most of it")
    
# First Data
# qrels = Evaluation.load_qrels("C:\\Users\\DELL\\Desktop\\antique DataSet\\antique-test.qrel")
# queries = Evaluation.load_queries("C:\\Users\\DELL\\Desktop\\antique DataSet\\antique-test-queries.txt")

# ranked_doc_ids = {}
# for query_id, query_text in queries.items():
#     ranked_doc_ids[query_id] = MatchingRanking.matching_and_ranking(query_text)
    
# Second Data: load the recreation relevance judgments and queries.
qrels_recreation = Evaluation.load_qrels_recreation(
    "C:\\Users\\Asus\\Desktop\\project 2-6\\project 2-6\\qas.search.jsonl"
)
queries_recreation = Evaluation.load_queries_recreation(
    "C:\\Users\\Asus\\Desktop\\project 2-6\\project 2-6\\questions.search.txt"
)

# Map every query id to the ranked document ids returned for its text.
ranked_doc_ids_recreation = {}
for query_id in queries_recreation:
    query_text = queries_recreation[query_id]
    ranked_doc_ids_recreation[query_id] = MatchingRanking.matching_and_ranking(query_text)
    

# Evaluation 🌼

In [44]:
from MatchingRanking import MatchingRanking
import json

class Evaluation:
    """Loaders for queries / relevance judgments and ranking metrics.

    Shared data shapes used by the metric methods:

    * ``qrels``: ``{query_id: {doc_id: relevance}}``
    * ``queries``: ``{query_id: query_text}``
    * ``ranked_doc_ids``: ``{query_id: [doc_id, ...]}`` in rank order

    A document counts as relevant when its judged relevance is ``>= 1``.
    Queries without an entry in ``qrels`` are reported with a printed
    message and left out of every average.
    """

    @staticmethod
    def load_qrels(file_path):
        """Load whitespace-separated judgments into a nested dict.

        Each non-blank line must contain exactly four fields:
        ``query_id <ignored> doc_id relevance``.

        Args:
            file_path: Path of the qrels file.

        Returns:
            ``{query_id: {doc_id: int(relevance)}}`` with string ids.

        Raises:
            ValueError: If a non-blank line does not have four fields or the
                relevance is not an integer.
        """
        qrels = {}
        with open(file_path, 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                query_id, _, doc_id, relevance = fields
                qrels.setdefault(query_id, {})[doc_id] = int(relevance)
        return qrels

    @staticmethod
    def load_queries(file_path):
        """Load ``query_id<TAB>query_text`` lines into a dict.

        Blank lines are skipped. Only the first tab separates the id from the
        text, so a tab inside the query text is kept as part of the text.

        Args:
            file_path: Path of the queries file.

        Returns:
            ``{query_id: query_text}``.

        Raises:
            ValueError: If a non-blank line contains no tab.
        """
        queries = {}
        with open(file_path, 'r') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue
                query_id, query_text = stripped.split('\t', 1)
                queries[query_id] = query_text
        return queries

    @staticmethod
    def load_qrels_recreation(file_path):
        """Load judgments from a JSON-lines file.

        Every line that parses as JSON must provide ``qid`` and
        ``answer_pids``; each listed pid is recorded with relevance 1. Lines
        that are not valid JSON are skipped. A record whose ``answer_pids``
        is empty contributes nothing, so that query ends up without
        judgments instead of inheriting a document id from an earlier line.

        Args:
            file_path: Path of the JSON-lines file.

        Returns:
            ``{str(qid): {pid: 1}}``; pids keep the type they have in the
            JSON data.
        """
        qrels = {}
        with open(file_path, 'r') as f:
            for line in f:
                try:
                    data = json.loads(line.strip())
                except json.JSONDecodeError:
                    # Handle potential non-JSON lines
                    continue

                query_id = str(data['qid'])
                for doc_id in data['answer_pids']:
                    # Assuming relevance is always 1 in this format
                    qrels.setdefault(query_id, {})[doc_id] = 1
        return qrels

    @staticmethod
    def load_queries_recreation(file_path):
        """Load recreation queries; the format is the one of ``load_queries``."""
        return Evaluation.load_queries(file_path)

    @staticmethod
    def _judged_query_ids(qrels, queries):
        """Yield the ids in *queries* that have judgments, reporting the rest."""
        for query_id in queries:
            if query_id not in qrels:
                print(f"No relevance judgments found for query {query_id}.")
                continue
            yield query_id

    @staticmethod
    def _is_relevant(judged, doc_id):
        """Return True when *doc_id* is judged with relevance >= 1."""
        return judged.get(doc_id, 0) >= 1

    @staticmethod
    def _count_relevant_in_top_k(judged, ranked, k):
        """Count the relevant documents among the first *k* items of *ranked*."""
        # zip with range(k) stops after k items without requiring slicing.
        return sum(
            1 for _, doc_id in zip(range(k), ranked)
            if Evaluation._is_relevant(judged, doc_id)
        )

    @staticmethod
    def calculate_precision_at_k(qrels, queries, ranked_doc_ids, k=10):
        """Return precision@k averaged over the judged queries.

        Per query the value is (relevant documents in the top k) / k, even
        when fewer than k documents were ranked.

        Returns:
            The mean precision@k, or 0.0 when no query has judgments.

        Raises:
            KeyError: If a judged query is missing from ``ranked_doc_ids``.
            ZeroDivisionError: If ``k`` is 0.
        """
        total = 0
        num_queries = 0

        for query_id in Evaluation._judged_query_ids(qrels, queries):
            hits = Evaluation._count_relevant_in_top_k(
                qrels[query_id], ranked_doc_ids[query_id], k
            )
            total += hits / k
            num_queries += 1

        return total / num_queries if num_queries else 0.0

    @staticmethod
    def calculate_recall_at_k(qrels, queries, ranked_doc_ids, k=10, relevance_threshold=8):
        """Return the averaged ratio (relevant docs in top k) / relevance_threshold.

        NOTE(review): the denominator is the fixed ``relevance_threshold``
        argument, not the number of relevant documents judged for the query,
        so this is not the textbook recall@k. The behavior is kept so that
        previously reported numbers stay comparable -- confirm the intent.

        Returns:
            The mean of the per-query ratios, or 0.0 when no query has
            judgments.

        Raises:
            KeyError: If a judged query is missing from ``ranked_doc_ids``.
            ZeroDivisionError: If ``relevance_threshold`` is 0.
        """
        total = 0
        num_queries = 0

        for query_id in Evaluation._judged_query_ids(qrels, queries):
            hits = Evaluation._count_relevant_in_top_k(
                qrels[query_id], ranked_doc_ids[query_id], k
            )
            total += hits / relevance_threshold
            num_queries += 1

        return total / num_queries if num_queries else 0.0

    @staticmethod
    def calculate_mean_average_precision(qrels, queries, ranked_doc_ids):
        """Return the mean of the per-query average precision values.

        For a query, precision is sampled at every rank holding a relevant
        document and averaged over the number of relevant documents found in
        the ranking.

        NOTE(review): queries whose ranking contains no relevant document are
        left out of the mean (they do not count as 0), and the per-query
        normalizer is the number of relevant documents retrieved rather than
        the number judged relevant. Both are kept as they were -- confirm
        whether the standard MAP definition is wanted instead.

        Returns:
            The mean average precision, or 0.0 when no query contributed a
            value (previously this case raised ZeroDivisionError).

        Raises:
            KeyError: If a judged query is missing from ``ranked_doc_ids``.
        """
        average_precisions = []

        for query_id in Evaluation._judged_query_ids(qrels, queries):
            judged = qrels[query_id]
            relevant_docs = 0
            precision_values = []

            for rank, doc_id in enumerate(ranked_doc_ids[query_id], start=1):
                if Evaluation._is_relevant(judged, doc_id):
                    relevant_docs += 1
                    precision_values.append(relevant_docs / rank)

            if relevant_docs > 0:
                average_precisions.append(sum(precision_values) / relevant_docs)

        if not average_precisions:
            return 0.0
        return sum(average_precisions) / len(average_precisions)

    @staticmethod
    def calculate_mean_reciprocal_rank(qrels, queries, ranked_doc_ids):
        """Return the mean reciprocal rank of the first relevant document.

        A judged query whose ranking contains no relevant document
        contributes 0.0.

        Returns:
            The mean reciprocal rank, or 0.0 when no query has judgments
            (previously this case raised ZeroDivisionError).

        Raises:
            KeyError: If a judged query is missing from ``ranked_doc_ids``.
        """
        reciprocal_ranks = []

        for query_id in Evaluation._judged_query_ids(qrels, queries):
            judged = qrels[query_id]
            reciprocal_rank = 0.0

            for rank, doc_id in enumerate(ranked_doc_ids[query_id], start=1):
                if Evaluation._is_relevant(judged, doc_id):
                    reciprocal_rank = 1 / rank
                    break

            reciprocal_ranks.append(reciprocal_rank)

        if not reciprocal_ranks:
            return 0.0
        return sum(reciprocal_ranks) / len(reciprocal_ranks)
    



## First Data

In [18]:
# Score the antique ("First Data") run with the four Evaluation metrics.
# NOTE(review): qrels, queries and ranked_doc_ids are only assigned by the
# "First Data" loading code that is commented out in the Matching & Ranking
# cell, so that code must have been run for this cell to work.
# Each metric call may print "No relevance judgments found ..." lines before
# its summary line, so the statement order determines the output order.

# Precision over the top 10 ranked documents, averaged over judged queries.
avg_p_at_10 = Evaluation.calculate_precision_at_k(qrels, queries, ranked_doc_ids, k=10)
print(f"Average Precision@10 antique: {avg_p_at_10:.4f}")

# Uses the default relevance_threshold (8) as the per-query denominator.
avg_recall_at_10 = Evaluation.calculate_recall_at_k(qrels, queries, ranked_doc_ids, k=10)
print(f"Average Recall@10 antique: {avg_recall_at_10:.4f}")

# Mean of the per-query average precision over the full ranked lists.
map_score = Evaluation.calculate_mean_average_precision(qrels, queries, ranked_doc_ids)
print(f"Mean Average Precision (MAP) antique: {map_score:.4f}")

# Mean of 1 / rank of the first relevant document (0.0 when none is found).
mrr_score = Evaluation.calculate_mean_reciprocal_rank(qrels, queries, ranked_doc_ids)
print(f"Mean Reciprocal Rank (MRR) antique: {mrr_score:.4f}")

Average Precision@10 antique: 0.5845
Average Recall@10 antique: 0.5675
Mean Average Precision (MAP) antique: 0.6855
Mean Reciprocal Rank (MRR) antique: 0.6465


## Second Data

In [45]:
# Score the recreation ("Second Data") run with the four Evaluation metrics,
# using the qrels_recreation, queries_recreation and ranked_doc_ids_recreation
# values built in the Matching & Ranking cell.
# Each metric call may print "No relevance judgments found ..." lines before
# its summary line, so the statement order determines the output order.

# Precision over the top 10 ranked documents, averaged over judged queries.
avg_p_at_10_recreation = Evaluation.calculate_precision_at_k(qrels_recreation, queries_recreation, ranked_doc_ids_recreation, k=10)
print(f"Average Precision@10 recreation: {avg_p_at_10_recreation:.4f}")

# Uses the default relevance_threshold (8) as the per-query denominator.
avg_recall_at_10_recreation = Evaluation.calculate_recall_at_k(qrels_recreation, queries_recreation, ranked_doc_ids_recreation, k=10)
print(f"Average Recall@10 recreation: {avg_recall_at_10_recreation:.4f}")

# Mean of the per-query average precision over the full ranked lists.
map_score_recreation = Evaluation.calculate_mean_average_precision(qrels_recreation, queries_recreation, ranked_doc_ids_recreation)
print(f"Mean Average Precision (MAP) recreation: {map_score_recreation:.4f}")

# Mean of 1 / rank of the first relevant document (0.0 when none is found).
mrr_score_recreation = Evaluation.calculate_mean_reciprocal_rank(qrels_recreation, queries_recreation, ranked_doc_ids_recreation)
print(f"Mean Reciprocal Rank (MRR) recreation: {mrr_score_recreation:.4f}")

Average Precision@10 recreation: 0.4833
Average Recall@10 recreation: 0.4609
Mean Average Precision (MAP) recreation: 0.4709
Mean Reciprocal Rank (MRR) recreation: 0.1753
