# Task -1:


In [12]:
import re
from collections import Counter

import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Read the document collection and the query set from the local Query_Doc folder.
docs_csv, queries_csv = "./Query_Doc/docs.csv", "./Query_Doc/queries.csv"
df = pd.read_csv(docs_csv)
queries = pd.read_csv(queries_csv)

# Preview the first few document rows.
df.head()

Unnamed: 0.1,Unnamed: 0,doc_id,doc_text
0,0,1,What is the step by step guide to invest in sh...
1,1,2,What is the step by step guide to invest in sh...
2,2,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
3,3,4,What would happen if the Indian government sto...
4,4,5,How can I increase the speed of my internet co...


In [14]:
# Preview the first few query rows.
queries.head()

Unnamed: 0.1,Unnamed: 0,query_id,query_text
0,0,4584,How can ask questions using photos?
1,1,6588,What is Atal Pension Yojana? What are its bene...
2,2,10113,Where is starch digested? How is it digested?
3,3,7957,What is a conjecture? What are some examples?
4,4,5498,What can India do to support the people suffer...


In [15]:
# Notebook-wide spaCy pipeline created via spacy.blank for English.
# NOTE: `nlp` is rebound further down when the spell-check pipeline is loaded.
nlp = spacy.blank("en")

### 1. Preprocessing of the docs and queries - removing the characters other than alphanumerics or whitespaces

In [16]:
def purify_docs(data):
    """Replace every character that is not an ASCII letter, a digit or whitespace with a space.

    Parameters
    ----------
    data : str
        Raw document or query text.

    Returns
    -------
    str
        Text of the same length in which each disallowed character has become
        a single space (so runs of punctuation turn into runs of spaces).
    """
    return re.sub(r'[^a-zA-Z0-9\s]', ' ', data)

# Add a cleaned-text column ('pure') to both tables by passing each raw text through purify_docs.
for frame, text_column in ((df, 'doc_text'), (queries, 'query_text')):
    frame['pure'] = frame[text_column].apply(purify_docs)

# Show the first cleaned rows of the documents, then of the queries.
for frame in (df, queries):
    print(frame['pure'].head())

0    What is the step by step guide to invest in sh...
1    What is the step by step guide to invest in sh...
2    What is the story of Kohinoor  Koh i Noor  Dia...
3    What would happen if the Indian government sto...
4    How can I increase the speed of my internet co...
Name: pure, dtype: object
0                  How can ask questions using photos 
1    What is Atal Pension Yojana  What are its bene...
2        Where is starch digested  How is it digested 
3        What is a conjecture  What are some examples 
4    What can India do to support the people suffer...
Name: pure, dtype: object


### 2. We need to correct the spellings in both queries and documents. For each query that was corrected, we need to display the original and the corrected query on two separate lines

In [None]:
## This is problematic!!
# NOTE(review): the concern above is not spelled out. Presumably it refers to the cost of
# running the spell checker over every row and/or to what `outcome_spellCheck` returns for
# sentences that need no correction -- TODO confirm.
import contextualSpellCheck

# Rebinds the notebook-wide `nlp` (set earlier via spacy.blank("en")) to the
# 'en_core_web_sm' pipeline and registers contextualSpellCheck on it.
nlp = spacy.load('en_core_web_sm')
contextualSpellCheck.add_to_pipe(nlp)

def rectify_sentences(sentence):
    """Return ``sentence`` with its spelling corrected by the notebook's ``nlp`` pipeline.

    Parameters
    ----------
    sentence : str
        Cleaned query or document text.

    Returns
    -------
    str
        The corrected sentence, or ``sentence`` unchanged when the spell
        checker produced no corrected text.
    """
    doc = nlp(sentence)
    corrected = doc._.outcome_spellCheck
    # contextualSpellCheck leaves `outcome_spellCheck` at its empty-string default when it
    # corrects nothing. Returning that would blank out every correctly spelled sentence and
    # make it compare as "changed", so fall back to the input text instead.
    return corrected if corrected else sentence

# Spell-correct every cleaned query and document; results are stored in a 'rectified' column.
queries['rectified'] = queries['pure'].apply(rectify_sentences)
df['rectified'] = df['pure'].apply(rectify_sentences)

# For each query whose corrected text differs from its cleaned text, print both versions
# on separate lines.
for original_query, corrected_query in zip(queries['pure'], queries['rectified']):
    if original_query == corrected_query:
        continue
    print(f"Original Query: {original_query}")
    print(f"Corrected Query: {corrected_query}")
    print("\n")

In [None]:
# Rebind `nlp` to a pipeline from spacy.blank for tokenisation, replacing the
# spell-check pipeline. (The original `spacy.black` was a typo and raised AttributeError.)
nlp = spacy.blank("en")

def derive_tokens(sentence):
    """Tokenise ``sentence`` with the notebook's ``nlp`` pipeline.

    Parameters
    ----------
    sentence : str
        Text to tokenise.

    Returns
    -------
    list of str
        The ``text`` of every token, in document order.
    """
    return [tok.text for tok in nlp(sentence)]

# Tokenise the spell-corrected documents (one list of token strings per row).
df['tokens'] = df['rectified'].apply(derive_tokens)

# Document-frequency bounds: a token is kept only if it occurs in at least `lower_limit`
# documents and in at most `upper_limit` (a fraction) of all documents.
lower_limit = 5
upper_limit = 0.85

# Document frequency = number of documents containing the token. Each document contributes
# a token at most once (hence `set(tokens)`), and a single Counter pass replaces the
# quadratic `list.count` call per vocabulary entry.
doc_freq = Counter(token for tokens in df['tokens'] for token in set(tokens))
vocab = set(doc_freq)

max_doc_count = len(df) * upper_limit
# A set keeps the per-token membership tests below O(1).
filtered_tokens = {token for token in vocab if lower_limit <= doc_freq[token] <= max_doc_count}
df['tokens'] = df['tokens'].apply(lambda tokens: [token for token in tokens if token in filtered_tokens])

# Create TF-IDF vectors for documents.
# The vectorizer is fitted on the *filtered* tokens (re-joined into strings); fitting on the
# unfiltered 'rectified' text would silently ignore the frequency filter computed above.
vectorizer = TfidfVectorizer()
tfidf_matrix_docs = vectorizer.fit_transform(df['tokens'].apply(' '.join))

# Queries go through the same tokenise -> filter -> vectorise steps, re-using the vocabulary
# and IDF weights learnt from the documents (transform, not fit_transform).
queries['tokens'] = queries['rectified'].apply(derive_tokens)
queries['tokens'] = queries['tokens'].apply(lambda tokens: [token for token in tokens if token in filtered_tokens])

tfidf_matrix_queries = vectorizer.transform(queries['tokens'].apply(' '.join))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Rank the documents for every query by TF-IDF cosine similarity and print the best matches.
# Requires `tfidf_matrix_queries` and `tfidf_matrix_docs` from the TF-IDF cell.

# One row of scores per query and one column per document.
cosine_similarities = cosine_similarity(tfidf_matrix_queries, tfidf_matrix_docs)

# Sort once (best match first) and slice; the top 5 are by construction the first five
# entries of the top 10. The values are row positions in `df`, not doc_ids.
ranked_doc_positions = cosine_similarities.argsort(axis=1)[:, ::-1]
top_5_similar_docs_indices = ranked_doc_positions[:, :5]
top_10_similar_docs_indices = ranked_doc_positions[:, :10]

# Positional lookups, so the output does not depend on the DataFrame index labels.
doc_ids = df['doc_id'].to_numpy()
doc_texts = df['doc_text'].to_numpy()

# enumerate() yields the matrix row for each query even if `queries` has a non-default index.
for query_position, query_text in enumerate(queries['rectified']):
    print(f"\nQuery: {query_text}")

    for k, top_k_indices in ((5, top_5_similar_docs_indices), (10, top_10_similar_docs_indices)):
        print(f"\nTop {k} Most Similar Documents:")
        for i, doc_index in enumerate(top_k_indices[query_position], start=1):
            doc_text = doc_texts[doc_index]
            similarity_score = cosine_similarities[query_position][doc_index]
            # Report the real doc_id instead of assuming doc_id == row position + 1.
            print(f"{i}. Document {doc_ids[doc_index]} (Similarity Score: {similarity_score:.4f}): {doc_text}")

In [None]:
# Average Precision@k (k = 1, 5, 10) over all queries.
# Requires `top_10_similar_docs_indices` from the ranking cell; the top-1 and top-5 lists are
# taken as prefixes of it, so no separate top_1/top_5 arrays are needed.
# NOTE(review): `qdrel_df` (columns 'query_id' and 'doc_id') is not created in any visible
# cell of this notebook -- it must be loaded before this cell is run.

k_values = (1, 5, 10)
precision_sums = {k: 0.0 for k in k_values}

# The ranking arrays hold row positions in `df`; the relevance judgements hold doc_ids.
# Translate positions to doc_ids before intersecting, otherwise the comparison is meaningless.
doc_ids = df['doc_id'].to_numpy()

# Relevant doc_ids grouped per query once, instead of filtering qdrel_df for every query.
relevant_by_query = qdrel_df.groupby('query_id')['doc_id'].apply(set).to_dict()

# enumerate() gives the row of the ranking arrays for each query regardless of index labels.
for query_position, query_id in enumerate(queries['query_id']):
    relevant_docs = relevant_by_query.get(query_id, set())
    ranked_doc_ids = doc_ids[top_10_similar_docs_indices[query_position]]

    for k in k_values:
        hits = len(relevant_docs.intersection(ranked_doc_ids[:k]))
        precision_sums[k] += hits / k

# Calculate average Precision@k over all queries (0.0 when there are no queries).
total_queries = len(queries)
average_precision_at_1 = precision_sums[1] / total_queries if total_queries else 0.0
average_precision_at_5 = precision_sums[5] / total_queries if total_queries else 0.0
average_precision_at_10 = precision_sums[10] / total_queries if total_queries else 0.0

# Print the results
print(f"Average Precision@1: {average_precision_at_1:.4f}")
print(f"Average Precision@5: {average_precision_at_5:.4f}")
print(f"Average Precision@10: {average_precision_at_10:.4f}")