# Task - 1:


In [None]:
import re
from collections import Counter

import pandas as pd
import spacy

In [None]:
# Load the document collection, the queries, and the query/document pairs
# that are later used as relevance judgements.
df = pd.read_csv("./Query_Doc/docs.csv")
queries = pd.read_csv("./Query_Doc/queries.csv")
qdrel = pd.read_csv("./Query_Doc/qdrel.csv")

# Preview the first rows of the documents frame.
df.head()

In [None]:
# Preview the first rows of the queries frame.
queries.head()

In [103]:
# Preview the query_id / doc_id pairs stored in the relevance frame.
qdrel.head()

Unnamed: 0.1,Unnamed: 0,query_id,doc_id
0,0,318,317
1,1,378,377
2,2,379,380
3,3,399,2606
4,4,399,2607


In [104]:
# Blank English pipeline.
# NOTE(review): `nlp` is rebound with spacy.load("en_core_web_sm") in a later
# cell before anything calls it, so this object appears to be unused - confirm
# and drop if so.
nlp = spacy.blank("en")

### 1. Preprocessing of the docs and queries - replacing every character other than alphanumerics or whitespace with a space

In [105]:
def purify_docs(data):
    """Replace every character that is not an ASCII letter, digit or whitespace with a space.

    Args:
        data: Text to clean.

    Returns:
        str: ``data`` with each disallowed character substituted by one space,
        so the length of the text is unchanged.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub(' ', data)

# Store the cleaned text in a 'pure' column next to the raw text of each frame.
for frame, text_column in ((df, 'doc_text'), (queries, 'query_text')):
    frame['pure'] = frame[text_column].apply(purify_docs)

# Preview the cleaned documents first, then the cleaned queries.
for frame in (df, queries):
    print(frame['pure'].head())

0    What is the step by step guide to invest in sh...
1    What is the step by step guide to invest in sh...
2    What is the story of Kohinoor  Koh i Noor  Dia...
3    What would happen if the Indian government sto...
4    How can I increase the speed of my internet co...
Name: pure, dtype: object
0                  How can ask questions using photos 
1    What is Atal Pension Yojana  What are its bene...
2        Where is starch digested  How is it digested 
3        What is a conjecture  What are some examples 
4    What can India do to support the people suffer...
Name: pure, dtype: object


### 2. We need to correct the spellings in both queries and documents. For each query that was corrected, we need to display the original and the corrected query on two separate lines

In [119]:
# Load the small English pipeline; `derive_tokens` below reads it as a global.
nlp = spacy.load("en_core_web_sm")

def derive_tokens(sentence):
    """Split a sentence into the texts of its spaCy tokens.

    Only the tokenizer of the global ``nlp`` pipeline is run (``make_doc``).
    The surface text of a token does not depend on the tagger, parser or NER
    components, so skipping them gives the same list at a fraction of the cost.

    Args:
        sentence: Text to tokenize.

    Returns:
        list[str]: ``token.text`` for every token, in order of appearance.
    """
    return [token.text for token in nlp.make_doc(sentence)]

# Tokenize every cleaned document.
df['tokens'] = df['pure'].apply(derive_tokens)
print(df['tokens'])

# Flatten the per-document token lists into one corpus-wide list.
collection = [token for tokens in df['tokens'] for token in tokens]

generate_vocabulary = set(collection)

# Count every token in a single pass. Calling collection.count() once per
# vocabulary entry rescans the whole corpus each time (O(V * N)).
all_frequencies = Counter(collection)

# Keep tokens that occur at least 5 times and at most 0.85 * len(df) times.
# NOTE(review): the upper bound compares a total occurrence count with a number
# of documents - confirm whether document frequency was intended.
filtered_tokens = [token for token in generate_vocabulary if 5 <= all_frequencies[token] <= len(df) * 0.85]
# Set copy of the kept tokens so the membership tests below are O(1).
kept_tokens = set(filtered_tokens)

df['tokens'] = df['tokens'].apply(lambda tokens: [token for token in tokens if token in kept_tokens])
df['sentences'] = df['tokens'].apply(lambda x: ' '.join(x))


# Fit the TF-IDF vocabulary and weights on the documents only.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix_docs = vectorizer.fit_transform(df['sentences'])

# Queries go through the same tokenization and token filter as the documents.
queries['tokens'] = queries['pure'].apply(derive_tokens)
queries['tokens'] = queries['tokens'].apply(lambda tokens: [token for token in tokens if token in kept_tokens])
queries['sentences'] = queries['tokens'].apply(lambda x: ' '.join(x))


# Project the queries with the vectorizer fitted on the documents.
tfidf_matrix_queries = vectorizer.transform(queries['sentences'])

0       [What, is, the, step, by, step, guide, to, inv...
1       [What, is, the, step, by, step, guide, to, inv...
2       [What, is, the, story, of, Kohinoor,  , Koh, i...
3       [What, would, happen, if, the, Indian, governm...
4       [How, can, I, increase, the, speed, of, my, in...
                              ...                        
9995    [How, will, the, introduction, of, a, GST,  , ...
9996    [Is, a, GST, directly, or, indirectly, going, ...
9997    [What, are, abiotic, factors, and, how, do, th...
9998    [How, do, biotic, and, abiotic, interaction, w...
9999          [What, is, the, most, successful, religion]
Name: tokens, Length: 10000, dtype: object


In [120]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the query TF-IDF matrix (first argument) and the
# document TF-IDF matrix (second argument).
cos_pairwise_relations = cosine_similarity(tfidf_matrix_queries, tfidf_matrix_docs)

def obtain_similar_docs(n):
    """Return, per row of ``cos_pairwise_relations``, the column indices of the ``n`` largest scores.

    Reads the global ``cos_pairwise_relations`` matrix.

    Args:
        n: Number of column indices to keep for each row.

    Returns:
        2-D integer array with one row per row of ``cos_pairwise_relations``
        and up to ``n`` columns, ordered from highest to lowest score.
    """
    # Reverse the ascending argsort first, then take the leading n columns.
    # Slicing with [:, -n:] before reversing would return *every* column for
    # n == 0, because -0 == 0.
    return cos_pairwise_relations.argsort(axis=1)[:, ::-1][:, :n]

def print_similar_docs(n, docs_indices):
    """Print the doc_id and raw text of each listed document.

    Args:
        n: Number shown in the "Top n" heading; it does not limit how many
            entries of ``docs_indices`` are printed.
        docs_indices: Iterable of positional row indices into the global ``df``.
    """
    print(f"\nTop {n} Similar Docs: ")
    for position in docs_indices:
        matched_row = df.iloc[position]
        print(f"ID: {matched_row['doc_id']} : {matched_row['doc_text']}")

# Ranked document positions for every query at each cut-off. These arrays are
# also read by the precision computation in the following cell.
top_1_similar_docs_indices = obtain_similar_docs(1)
top_5_similar_docs_indices = obtain_similar_docs(5)
top_10_similar_docs_indices = obtain_similar_docs(10)

rankings_by_cutoff = {
    1: top_1_similar_docs_indices,
    5: top_5_similar_docs_indices,
    10: top_10_similar_docs_indices,
}

# The ranking arrays are plain numpy arrays, so address them by the position
# of the query rather than by the DataFrame index label.
for position, query in enumerate(queries['query_text']):
    print(f"\nGiven Query: {query}")

    for cutoff, ranking in rankings_by_cutoff.items():
        print_similar_docs(cutoff, ranking[position])


Given Query: How can ask questions using photos?

Top 1 Similar Docs: 
ID: 1377 : What are some of the best photos?

Top 5 Similar Docs: 
ID: 1377 : What are some of the best photos?
ID: 1782 : What are the best interview questions to ask?
ID: 9951 : Is there any way to automatically like Instagram photos with hashtags, using software?
ID: 45 : What are the questions should not ask on Quora?
ID: 4412 : Why do people have to ask Quora for questions?

Top 10 Similar Docs: 
ID: 1377 : What are some of the best photos?
ID: 1782 : What are the best interview questions to ask?
ID: 9951 : Is there any way to automatically like Instagram photos with hashtags, using software?
ID: 45 : What are the questions should not ask on Quora?
ID: 4412 : Why do people have to ask Quora for questions?
ID: 4583 : How do I ask questions with pictures on "Quora"?
ID: 2603 : What are the best questions to ask a girl while chatting?
ID: 9179 : How can I make my questions anonymous?
ID: 2366 : What are good ques

ID: 9112 : What is Quora, and what is the relevence?
ID: 4156 : Getting Started on Quora: What is Quora?
ID: 1923 : How does Anil Ambani benefit from Rafel fighter jet deal?
ID: 3118 : How will Donald Trump benefit India?
ID: 1597 : What is the benefit of going Walking every morning?
ID: 1206 : Why does Saturn benefit a house it sits in slowly?
ID: 2657 : What is the best Advantage of using Quora?
ID: 5846 : What is the purpose of Quora?
ID: 2473 : What is something good about fascism that we would benefit from applying to our liberal democracies?

Given Query: Why do I feel alone sometimes?

Top 1 Similar Docs: 
ID: 6935 : Why do I always feel alone?

Top 5 Similar Docs: 
ID: 6935 : Why do I always feel alone?
ID: 6954 : Why do I feel anxious?
ID: 1043 : Why do I not feel remorse?
ID: 7068 : How do you become comfortable being alone?
ID: 7067 : How can I become comfortable being alone?

Top 10 Similar Docs: 
ID: 6935 : Why do I always feel alone?
ID: 6954 : Why do I feel anxious?
ID: 

ID: 8879 : How certain are you that Hillary Clinton will win the 2016 election?
ID: 408 : What are the reasons that people dislike Hillary Clinton?
ID: 6258 : Why should one vote for Hillary Clinton in the 2016 presidential election?

Top 10 Similar Docs: 
ID: 859 : What will Hillary Clinton do now?
ID: 3547 : Do you think Hillary Clinton’s pneumonia will affect the election?
ID: 8879 : How certain are you that Hillary Clinton will win the 2016 election?
ID: 408 : What are the reasons that people dislike Hillary Clinton?
ID: 6258 : Why should one vote for Hillary Clinton in the 2016 presidential election?
ID: 8848 : What will be Hillary Clinton's India policy if she wins the election?
ID: 9621 : What are Hillary Clinton’s qualifications to be President?
ID: 2784 : Are there any Republicans voting for Hillary Clinton?
ID: 407 : Why do people hate Hillary Clinton?
ID: 5550 : How will Hillary Clinton beat Donald Trump?

Given Query: What are the requirements of modulation?

Top 1 Similar 

In [121]:
# Mean Precision@k over all queries for k = 1, 5 and 10.
rankings_by_cutoff = {
    1: top_1_similar_docs_indices,
    5: top_5_similar_docs_indices,
    10: top_10_similar_docs_indices,
}
precision_sums = {cutoff: 0.0 for cutoff in rankings_by_cutoff}

# The ranking arrays are plain numpy arrays, so address them by the position
# of the query rather than by the DataFrame index label.
for position, query_id in enumerate(queries['query_id']):
    # doc_ids listed in qdrel for this query.
    relevant_docs = set(qdrel.loc[qdrel['query_id'] == query_id, 'doc_id'])

    for cutoff, ranking in rankings_by_cutoff.items():
        # Translate the retrieved row positions of `df` into doc_id values.
        retrieved_ids = set(df['doc_id'].iloc[ranking[position]])
        precision_sums[cutoff] += len(relevant_docs & retrieved_ids) / cutoff

precision_at_1_sum = precision_sums[1]
precision_at_5_sum = precision_sums[5]
precision_at_10_sum = precision_sums[10]

total_queries = len(queries)
average_precision_at_1 = precision_at_1_sum / total_queries
average_precision_at_5 = precision_at_5_sum / total_queries
average_precision_at_10 = precision_at_10_sum / total_queries

print(f"Average Precision@1: {average_precision_at_1:.4f}")
print(f"Average Precision@5: {average_precision_at_5:.4f}")
print(f"Average Precision@10: {average_precision_at_10:.4f}")


Average Precision@1: 0.5900
Average Precision@5: 0.1880
Average Precision@10: 0.1000


## Task - 2 (Only Stemming) :

In [138]:
from nltk.stem.lancaster import LancasterStemmer
# Lancaster stemmer instance; `derive_tokens` below reads it as a global.
st = LancasterStemmer()

# Load the small English pipeline used by `derive_tokens` below.
nlp = spacy.load("en_core_web_sm")
 

def derive_tokens(sentence):
    """Tokenize a sentence and stem every token with the global stemmer ``st``.

    Only the tokenizer of the global ``nlp`` pipeline is run (``make_doc``).
    The stemmer works on ``token.text`` alone, which does not depend on the
    tagger, parser or NER components, so skipping them gives the same list at
    a fraction of the cost.

    Args:
        sentence: Text to tokenize.

    Returns:
        list[str]: ``st.stem(token.text)`` for every token, in order of appearance.
    """
    return [st.stem(token.text) for token in nlp.make_doc(sentence)]

# Tokenize (and stem) every cleaned document.
df['tokens'] = df['pure'].apply(derive_tokens)

# Flatten the per-document token lists into one corpus-wide list.
collection = [token for tokens in df['tokens'] for token in tokens]

generate_vocabulary = set(collection)

# Count every token in a single pass. Calling collection.count() once per
# vocabulary entry rescans the whole corpus each time (O(V * N)).
all_frequencies = Counter(collection)

# Keep tokens that occur at least 5 times and at most 0.85 * len(df) times.
# NOTE(review): the upper bound compares a total occurrence count with a number
# of documents - confirm whether document frequency was intended.
filtered_tokens = [token for token in generate_vocabulary if 5 <= all_frequencies[token] <= len(df) * 0.85]
# Set copy of the kept tokens so the membership tests below are O(1).
kept_tokens = set(filtered_tokens)

df['tokens'] = df['tokens'].apply(lambda tokens: [token for token in tokens if token in kept_tokens])
df['sentences'] = df['tokens'].apply(lambda x: ' '.join(x))


# Fit the TF-IDF vocabulary and weights on the documents only.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix_docs = vectorizer.fit_transform(df['sentences'])

# Queries go through the same tokenization and token filter as the documents.
queries['tokens'] = queries['pure'].apply(derive_tokens)
queries['tokens'] = queries['tokens'].apply(lambda tokens: [token for token in tokens if token in kept_tokens])
queries['sentences'] = queries['tokens'].apply(lambda x: ' '.join(x))


# Project the queries with the vectorizer fitted on the documents.
tfidf_matrix_queries = vectorizer.transform(queries['sentences'])

from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the query matrix and the document matrix.
cos_pairwise_relations = cosine_similarity(tfidf_matrix_queries, tfidf_matrix_docs)

def obtain_similar_docs(n):
    """Return, per row of ``cos_pairwise_relations``, the column indices of the ``n`` largest scores.

    Reads the global ``cos_pairwise_relations`` matrix.

    Args:
        n: Number of column indices to keep for each row.

    Returns:
        2-D integer array with one row per row of ``cos_pairwise_relations``
        and up to ``n`` columns, ordered from highest to lowest score.
    """
    # Reverse the ascending argsort first, then take the leading n columns.
    # Slicing with [:, -n:] before reversing would return *every* column for
    # n == 0, because -0 == 0.
    return cos_pairwise_relations.argsort(axis=1)[:, ::-1][:, :n]

# Ranked document positions for every query at each cut-off.
top_1_similar_docs_indices = obtain_similar_docs(1)
top_5_similar_docs_indices = obtain_similar_docs(5)
top_10_similar_docs_indices = obtain_similar_docs(10)

# Mean Precision@k over all queries for k = 1, 5 and 10.
rankings_by_cutoff = {
    1: top_1_similar_docs_indices,
    5: top_5_similar_docs_indices,
    10: top_10_similar_docs_indices,
}
precision_sums = {cutoff: 0.0 for cutoff in rankings_by_cutoff}

# The ranking arrays are plain numpy arrays, so address them by the position
# of the query rather than by the DataFrame index label.
for position, query_id in enumerate(queries['query_id']):
    # doc_ids listed in qdrel for this query.
    relevant_docs = set(qdrel.loc[qdrel['query_id'] == query_id, 'doc_id'])

    for cutoff, ranking in rankings_by_cutoff.items():
        # Translate the retrieved row positions of `df` into doc_id values.
        retrieved_ids = set(df['doc_id'].iloc[ranking[position]])
        precision_sums[cutoff] += len(relevant_docs & retrieved_ids) / cutoff

precision_at_1_sum = precision_sums[1]
precision_at_5_sum = precision_sums[5]
precision_at_10_sum = precision_sums[10]

total_queries = len(queries)
average_precision_at_1 = precision_at_1_sum / total_queries
average_precision_at_5 = precision_at_5_sum / total_queries
average_precision_at_10 = precision_at_10_sum / total_queries

print(f"Average Precision@1: {average_precision_at_1:.4f}")
print(f"Average Precision@5: {average_precision_at_5:.4f}")
print(f"Average Precision@10: {average_precision_at_10:.4f}")

Average Precision@1: 0.7300
Average Precision@5: 0.2060
Average Precision@10: 0.1130


## With Lemmatization

In [140]:
# Load the small English pipeline; `derive_tokens` below reads it as a global
# and takes each token's lemma from it.
nlp = spacy.load("en_core_web_sm")
 

def derive_tokens(sentence):
    """Run the global ``nlp`` pipeline on a sentence and collect each token's lemma.

    Args:
        sentence: Text to process.

    Returns:
        list[str]: ``token.lemma_`` for every token, in order of appearance.
    """
    return [token.lemma_ for token in nlp(sentence)]

# Tokenize (and lemmatize) every cleaned document.
df['tokens'] = df['pure'].apply(derive_tokens)

# Flatten the per-document token lists into one corpus-wide list.
collection = [token for tokens in df['tokens'] for token in tokens]

generate_vocabulary = set(collection)

# Count every token in a single pass. Calling collection.count() once per
# vocabulary entry rescans the whole corpus each time (O(V * N)).
all_frequencies = Counter(collection)

# Keep tokens that occur at least 5 times and at most 0.85 * len(df) times.
# NOTE(review): the upper bound compares a total occurrence count with a number
# of documents - confirm whether document frequency was intended.
filtered_tokens = [token for token in generate_vocabulary if 5 <= all_frequencies[token] <= len(df) * 0.85]
# Set copy of the kept tokens so the membership tests below are O(1).
kept_tokens = set(filtered_tokens)

df['tokens'] = df['tokens'].apply(lambda tokens: [token for token in tokens if token in kept_tokens])
df['sentences'] = df['tokens'].apply(lambda x: ' '.join(x))


# Fit the TF-IDF vocabulary and weights on the documents only.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix_docs = vectorizer.fit_transform(df['sentences'])

# Queries go through the same tokenization and token filter as the documents.
queries['tokens'] = queries['pure'].apply(derive_tokens)
queries['tokens'] = queries['tokens'].apply(lambda tokens: [token for token in tokens if token in kept_tokens])
queries['sentences'] = queries['tokens'].apply(lambda x: ' '.join(x))


# Project the queries with the vectorizer fitted on the documents.
tfidf_matrix_queries = vectorizer.transform(queries['sentences'])

from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the query matrix and the document matrix.
cos_pairwise_relations = cosine_similarity(tfidf_matrix_queries, tfidf_matrix_docs)

def obtain_similar_docs(n):
    """Return, per row of ``cos_pairwise_relations``, the column indices of the ``n`` largest scores.

    Reads the global ``cos_pairwise_relations`` matrix.

    Args:
        n: Number of column indices to keep for each row.

    Returns:
        2-D integer array with one row per row of ``cos_pairwise_relations``
        and up to ``n`` columns, ordered from highest to lowest score.
    """
    # Reverse the ascending argsort first, then take the leading n columns.
    # Slicing with [:, -n:] before reversing would return *every* column for
    # n == 0, because -0 == 0.
    return cos_pairwise_relations.argsort(axis=1)[:, ::-1][:, :n]

# Ranked document positions for every query at each cut-off.
top_1_similar_docs_indices = obtain_similar_docs(1)
top_5_similar_docs_indices = obtain_similar_docs(5)
top_10_similar_docs_indices = obtain_similar_docs(10)

# Mean Precision@k over all queries for k = 1, 5 and 10.
rankings_by_cutoff = {
    1: top_1_similar_docs_indices,
    5: top_5_similar_docs_indices,
    10: top_10_similar_docs_indices,
}
precision_sums = {cutoff: 0.0 for cutoff in rankings_by_cutoff}

# The ranking arrays are plain numpy arrays, so address them by the position
# of the query rather than by the DataFrame index label.
for position, query_id in enumerate(queries['query_id']):
    # doc_ids listed in qdrel for this query.
    relevant_docs = set(qdrel.loc[qdrel['query_id'] == query_id, 'doc_id'])

    for cutoff, ranking in rankings_by_cutoff.items():
        # Translate the retrieved row positions of `df` into doc_id values.
        retrieved_ids = set(df['doc_id'].iloc[ranking[position]])
        precision_sums[cutoff] += len(relevant_docs & retrieved_ids) / cutoff

precision_at_1_sum = precision_sums[1]
precision_at_5_sum = precision_sums[5]
precision_at_10_sum = precision_sums[10]

total_queries = len(queries)
average_precision_at_1 = precision_at_1_sum / total_queries
average_precision_at_5 = precision_at_5_sum / total_queries
average_precision_at_10 = precision_at_10_sum / total_queries

print(f"Average Precision@1: {average_precision_at_1:.4f}")
print(f"Average Precision@5: {average_precision_at_5:.4f}")
print(f"Average Precision@10: {average_precision_at_10:.4f}")

Average Precision@1: 0.6900
Average Precision@5: 0.1920
Average Precision@10: 0.1070


## Task - 3