# Task -1:


In [21]:
import spacy
import re
import pandas as pd

In [22]:
df = pd.read_csv("./Query_Doc/docs.csv")
queries = pd.read_csv("./Query_Doc/queries.csv")
qdrel = pd.read_csv("./Query_Doc/qdrel.csv")

df.head()

Unnamed: 0.1,Unnamed: 0,doc_id,doc_text
0,0,1,What is the step by step guide to invest in sh...
1,1,2,What is the step by step guide to invest in sh...
2,2,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
3,3,4,What would happen if the Indian government sto...
4,4,5,How can I increase the speed of my internet co...


In [23]:
queries.head()

Unnamed: 0.1,Unnamed: 0,query_id,query_text
0,0,4584,How can ask questions using photos?
1,1,6588,What is Atal Pension Yojana? What are its bene...
2,2,10113,Where is starch digested? How is it digested?
3,3,7957,What is a conjecture? What are some examples?
4,4,5498,What can India do to support the people suffer...


In [24]:
qdrel.head()

Unnamed: 0.1,Unnamed: 0,query_id,doc_id
0,0,318,317
1,1,378,377
2,2,379,380
3,3,399,2606
4,4,399,2607


In [25]:
nlp = spacy.blank("en")

### 1. Preprocessing of the docs and queries - removing the characters other than alphanumerics or whitespaces

In [26]:
def purify_docs(data):
    purified_doc = re.sub(r'[^a-zA-Z0-9\s]', ' ', data)
    return purified_doc

df['pure'] = (df['doc_text']).apply(purify_docs)
queries['pure'] = (queries['query_text']).apply(purify_docs)

# Display the DataFrame
print(df['pure'].head())
print(queries['pure'].head())

0    What is the step by step guide to invest in sh...
1    What is the step by step guide to invest in sh...
2    What is the story of Kohinoor  Koh i Noor  Dia...
3    What would happen if the Indian government sto...
4    How can I increase the speed of my internet co...
Name: pure, dtype: object
0                  How can ask questions using photos 
1    What is Atal Pension Yojana  What are its bene...
2        Where is starch digested  How is it digested 
3        What is a conjecture  What are some examples 
4    What can India do to support the people suffer...
Name: pure, dtype: object


### 2. We need to correct the spellings in both queries and documents. For each query, which got corrected, we need to display the original and the corrected query on two spearate lines

In [None]:
## This is problematic!!
import contextualSpellCheck

nlp = spacy.load('en_core_web_sm')
contextualSpellCheck.add_to_pipe(nlp)

def rectify_sentences(sentence):
    doc = nlp(sentence)
    return doc._.outcome_spellCheck

queries['rectified'] = queries['pure'].apply(rectify_sentences)
## df['rectified'] = df['pure'].apply(rectify_sentences)

for index, row in queries.iterrows():
    original_query = row['pure']
    corrected_query = row['rectified']
    if original_query != corrected_query:
        print(f"Original Query: {original_query}")
        print(f"Corrected Query: {corrected_query}")
        print("\n")

In [27]:
nlp = spacy.blank("en")

def derive_tokens(sentence):
    doc = nlp(sentence)
    tokensList = []
    
    for token in doc:
        tokensList.append(token.text)
        
    return tokensList

# Would have to change this after the spell checking 
df['tokens'] = df['pure'].apply(derive_tokens)


lower_limit = 5
upper_limit = 0.85

collection = []

for tokens in df['tokens']:
    for token in tokens:
        collection.append(token)
        
set_vocabulary = set(collection)

all_frequencies = {token: collection.count(token) for token in set_vocabulary}

filtered_tokens = [token for token in set_vocabulary if lower_limit <= all_frequencies[token] <= len(df) * upper_limit]
df['tokens'] = df['tokens'].apply(lambda tokens: [token for token in tokens if token in filtered_tokens])

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix_docs = vectorizer.fit_transform(df['pure'])

queries['tokens'] = queries['pure'].apply(derive_tokens)
queries['tokens'] = queries['tokens'].apply(lambda tokens: [token for token in tokens if token in filtered_tokens])

tfidf_matrix_queries = vectorizer.transform(queries['pure'])

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarities = cosine_similarity(tfidf_matrix_queries, tfidf_matrix_docs)

top_1_similar_docs_indices = cosine_similarities.argsort(axis=1)[:, -1:][:, ::-1]
top_5_similar_docs_indices = cosine_similarities.argsort(axis=1)[:, -5:][:, ::-1]
top_10_similar_docs_indices = cosine_similarities.argsort(axis=1)[:, -10:][:, ::-1]

for index, row in queries.iterrows():
    query_text = row['pure']
    print(f"\nQuery: {query_text}")

    top_5_docs_indices = top_5_similar_docs_indices[index]
    print("\nTop 5 Most Similar Documents:")
    for i, doc_index in enumerate(top_5_docs_indices, start=1):
        doc_text = df.iloc[doc_index]['doc_text']
        similarity_score = cosine_similarities[index][doc_index]
        print(f"{i}. Document {doc_index + 1} (Similarity Score: {similarity_score:.4f}): {doc_text}")

    # Retrieve the top 10 most similar documents
    top_10_docs_indices = top_10_similar_docs_indices[index]
    print("\nTop 10 Most Similar Documents:")
    for i, doc_index in enumerate(top_10_docs_indices, start=1):
        doc_text = df.iloc[doc_index]['doc_text']
        similarity_score = cosine_similarities[index][doc_index]
        print(f"{i}. Document {doc_index + 1} (Similarity Score: {similarity_score:.4f}): {doc_text}")


Query: How can ask questions using photos 

Top 5 Most Similar Documents:
1. Document 1329 (Similarity Score: 0.4481): What are some of the best photos?
2. Document 1721 (Similarity Score: 0.4353): What are the best interview questions to ask?
3. Document 45 (Similarity Score: 0.4126): What are the questions should not ask on Quora?
4. Document 4276 (Similarity Score: 0.4077): Why do people have to ask Quora for questions?
5. Document 4440 (Similarity Score: 0.4072): How do I ask questions with pictures on "Quora"?

Top 10 Most Similar Documents:
1. Document 1329 (Similarity Score: 0.4481): What are some of the best photos?
2. Document 1721 (Similarity Score: 0.4353): What are the best interview questions to ask?
3. Document 45 (Similarity Score: 0.4126): What are the questions should not ask on Quora?
4. Document 4276 (Similarity Score: 0.4077): Why do people have to ask Quora for questions?
5. Document 4440 (Similarity Score: 0.4072): How do I ask questions with pictures on "Quora"?

In [34]:
precision_at_1_sum = 0.0
precision_at_5_sum = 0.0
precision_at_10_sum = 0.0

for index, row in queries.iterrows():
    relevant_docs = set(qdrel[qdrel['query_id'] == row['query_id']]['doc_id'])
    
    top_1_docs = set(top_1_similar_docs_indices[index])
    precision_at_1 = len(relevant_docs.intersection(top_1_docs)) / 1
    precision_at_1_sum += precision_at_1

    top_5_docs = set(top_5_similar_docs_indices[index])
    precision_at_5 = len(relevant_docs.intersection(top_5_docs)) / 5
    precision_at_5_sum += precision_at_5

    top_10_docs = set(top_10_similar_docs_indices[index])
    precision_at_10 = len(relevant_docs.intersection(top_10_docs)) / 10
    precision_at_10_sum += precision_at_10

total_queries = len(queries)
average_precision_at_1 = precision_at_1_sum / total_queries
average_precision_at_5 = precision_at_5_sum / total_queries
average_precision_at_10 = precision_at_10_sum / total_queries

# Print the results
print(f"Average Precision@1: {average_precision_at_1:.4f}")
print(f"Average Precision@5: {average_precision_at_5:.4f}")
print(f"Average Precision@10: {average_precision_at_10:.4f}")

Average Precision@1: 0.0000
Average Precision@5: 0.0000
Average Precision@10: 0.0000
