In [1]:
import spacy
import contextualSpellCheck
import pandas as pd
from tqdm.auto import tqdm, trange
from utils import *
import os
import numpy as np


In [2]:
import pickle


def save(filename, *args):
    """Pickle a selection of global variables into one file.

    Every entry of ``args`` is the *name* (a string) of a global variable.
    The names and their current values are collected into a single
    dictionary, which is then written to ``filename`` with ``pickle``.

    A name that is not a global raises ``KeyError``; this happens while the
    dictionary is being built, i.e. before ``filename`` is opened.
    """
    namespace = globals()
    # name -> current value, in the order the names were passed
    snapshot = {name: namespace[name] for name in args}
    with open(filename, 'wb') as handle:
        pickle.dump(snapshot, handle)


def load(filename):
    """Restore global variables from a pickle file holding a dictionary.

    The file is expected to contain a ``{name: value}`` dictionary; each
    pair is assigned into the global namespace, overwriting any existing
    variable of the same name. Nothing is returned.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.
    """
    with open(filename, 'rb') as handle:
        restored = pickle.load(handle)
    namespace = globals()
    for name, value in restored.items():
        namespace[name] = value

In [None]:
# Register tqdm with pandas so that `progress_apply` (used in the
# preprocessing cell below) is available.
tqdm.pandas()
# Load spaCy's small English pipeline; the cells below reuse this `nlp` object.
nlp = spacy.load('en_core_web_sm')
# Attach the contextual spell checker as an extra pipeline component.
# It is removed again once preprocessing is done.
contextualSpellCheck.add_to_pipe(nlp)

Read the docs and queries

In [None]:
if not (os.path.exists('cache/docs.pkl') and os.path.exists('cache/queries.pkl')):
    # No complete cache yet: build `docs` and `queries` from the raw CSVs.
    # The first CSV column is used as the index.
    docs = pd.read_csv('Query_Doc/docs.csv', index_col=0)
    queries = pd.read_csv('Query_Doc/queries.csv', index_col=0)

    # qdrel.csv (headers: query_id, doc_id) pairs queries with relevant docs.
    # Collapse it to one row per query holding the list of relevant doc ids,
    # then attach that list to `queries` as the 'relevant_docs' column.
    qdrel = pd.read_csv('Query_Doc/qdrel.csv', index_col=0)
    qdrel = (
        qdrel
        .groupby('query_id')['doc_id']
        .apply(list)
        .reset_index(name='relevant_docs')
    )
    queries = queries.merge(qdrel, on='query_id')

    # Text clean-up via `preprocess` (from utils), with a progress bar
    docs['doc_text'] = docs['doc_text'].progress_apply(preprocess)
    queries['query_text'] = queries['query_text'].progress_apply(preprocess)

    # Spell correction on the parsed texts. The boolean passed to
    # `correct_spellings` is presumably a verbosity flag -- confirm in utils.
    docs['doc_text'] = [
        correct_spellings(parsed, False)
        for parsed in nlp.pipe(tqdm(docs['doc_text']))
    ]
    queries['query_text'] = [
        correct_spellings(parsed, True)
        for parsed in nlp.pipe(tqdm(queries['query_text']))
    ]

    # Persist both frames so the next run can skip all of the above
    os.makedirs('cache', exist_ok=True)

    save('cache/docs.pkl', 'docs')
    save('cache/queries.pkl', 'queries')
else:
    # Both cache files exist: restore the preprocessed frames directly
    load('cache/docs.pkl')
    load('cache/queries.pkl')
# END OF PREPROCESSING

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Original : What is Atal Pension Yojana What are its benefits
Corrected: What is the Pension? What are its benefits

Original : Where is starch digested How is it digested
Corrected: Where is it eaten How is it here

Original : How do introverts enjoy life
Corrected: How do they enjoy life

Original : Kindly tell me whole process of admission at vits Vellore for biotechi m a bio student in 12I dont have math there
Corrected: Kindly tell me whole process of admission at the Vellore for i m a bio student in I donot have math there

Original : How does Quora look to a moderator
Corrected: How does Quora look to a man

Original : Why does phase shift take place in the output of the common emitter amplifier when compared to the input signal
Corrected: Why does phase shift take place in the output of the common linear amplifier when compared to the input signal

Original : Why do people say Dhanush South Indian actor is ugly I dont think so
Corrected: Why do people say Dhanush South Indian ac

In [None]:
# Quick sanity check of the two frames: shapes first ...
print(f"Docs   : {docs.shape}")
print(f"Queries: {queries.shape}")

# ... then the first rows of each, followed by a blank separator line
print(docs.head(), queries.head(), sep='\n')
print()

# ... and finally the column labels
print(docs.columns, queries.columns, sep='\n')

Docs   : (10000, 2)
Queries: (100, 3)
   doc_id                                           doc_text
0       1  What is the step by step guide to invest in sh...
1       2  What is the step by step guide to invest in sh...
2       3     What is the story of Kohinoor KohiNoor Diamond
3       4  What would happen if the Indian government sto...
4       5  How can I increase the speed of my internet co...
   query_id                                         query_text relevant_docs
0      4584                 How can ask questions using photos        [4583]
1      6588         What is the Pension? What are its benefits        [6587]
2     10113                   Where is it eaten How is it here       [10114]
3      7957        What is a conjecture What are some examples        [7956]
4      5498  What can India do to support the people suffer...        [5497]

Index(['doc_id', 'doc_text'], dtype='object')
Index(['query_id', 'query_text', 'relevant_docs'], dtype='object')


In [None]:
# Remove the spell checker from the pipeline: the spell-corrected text is
# already stored in `docs` / `queries`, so the cells below only need the
# base pipeline.
# The check makes this cell safe to re-run; without it a second execution
# fails because the component is no longer in the pipeline.
if nlp.has_pipe('contextual spellchecker'):
    nlp.remove_pipe('contextual spellchecker')

('contextual spellchecker',
 <contextualSpellCheck.contextualSpellCheck.ContextualSpellCheck at 0x211f0c83850>)

Task 1:

In [None]:
# Task 1: represent every text by its spaCy token texts, joined by a single
# space.
doc_tokens = []
for parsed_doc in nlp.pipe(tqdm(docs['doc_text'])):
    doc_tokens.append(' '.join([tok.text for tok in parsed_doc]))

query_tokens = []
for parsed_query in nlp.pipe(tqdm(queries['query_text'])):
    query_tokens.append(' '.join([tok.text for tok in parsed_query]))

# Vectorise both collections together, then report vocabulary size and scores
docVectors, queryVectors, vocab = get_vectors(doc_tokens, query_tokens)
print("Vocab size  :", len(vocab))
print_scores(docs, queries, docVectors, queryVectors)

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Vocab size  : 2273
Precision@1 : 0.5400
Precision@5 : 0.7700
Precision@10: 0.8155


Task 2.1:

In [None]:
import nltk
stemmer = nltk.stem.PorterStemmer()


# Task 2.1: same as the token representation, but every token text is
# replaced by its Porter stem before the text is re-joined.
doc_stems = [
    ' '.join(map(stemmer.stem, (tok.text for tok in parsed)))
    for parsed in nlp.pipe(tqdm(docs['doc_text']))
]

query_stems = [
    ' '.join(map(stemmer.stem, (tok.text for tok in parsed)))
    for parsed in nlp.pipe(tqdm(queries['query_text']))
]

# Vectorise both collections together, then report vocabulary size and scores
docVectors, queryVectors, vocab = get_vectors(doc_stems, query_stems)
print("Vocab size  :", len(vocab))
print_scores(docs, queries, docVectors, queryVectors)

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Vocab size  : 2011
Precision@1 : 0.6200
Precision@5 : 0.8005
Precision@10: 0.8415


Task 2.2:

In [None]:
# Task 2.2: represent every text by the spaCy lemma of each token, joined by
# a single space.
doc_lemmas = [
    ' '.join(map(lambda tok: tok.lemma_, parsed))
    for parsed in nlp.pipe(tqdm(docs['doc_text']))
]

query_lemmas = [
    ' '.join(map(lambda tok: tok.lemma_, parsed))
    for parsed in nlp.pipe(tqdm(queries['query_text']))
]

# Vectorise both collections together, then report vocabulary size and scores
docVectors, queryVectors, vocab = get_vectors(doc_lemmas, query_lemmas)
print("Vocab size  :", len(vocab))
print_scores(docs, queries, docVectors, queryVectors)

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Vocab size  : 1997
Precision@1 : 0.6400
Precision@5 : 0.7955
Precision@10: 0.8328


Task 3:

In [None]:
# Task 3: build the NER/POS-annotated text for every doc and query with
# `get_ner_pos` (from utils), then vectorise both collections together.
doc_ner_pos = list(map(get_ner_pos, nlp.pipe(tqdm(docs['doc_text']))))
query_ner_pos = list(map(get_ner_pos, nlp.pipe(tqdm(queries['query_text']))))

docVectors, queryVectors, vocab = get_vectors(doc_ner_pos, query_ner_pos)

# Vocabulary columns grouped by the tag suffix of their term. The suffixes
# are mutually exclusive, so a column lands in at most one of the lists.
named_idx = [col for term, col in vocab.items()
             if term.endswith(('+NAMED_ENTITY', '+PROPN'))]
noun_idx = [col for term, col in vocab.items()
            if term.endswith('+NOUN')]

# Re-weight the vectors in place: named-entity / proper-noun columns x4,
# common-noun columns x2. Docs are processed first, then queries.
for matrix in (docVectors, queryVectors):
    for row in trange(matrix.shape[0]):
        for weight, columns in ((4, named_idx), (2, noun_idx)):
            for col in columns:
                matrix[row, col] *= weight
# END re-weighting

print("Vocab size  :", len(vocab))
print_scores(docs, queries, docVectors, queryVectors)

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Vocab size  : 2204
Precision@1 : 0.5500
Precision@5 : 0.7980
Precision@10: 0.8215


Task 4:

In [None]:
import nltk
stemmer = nltk.stem.PorterStemmer()

# Task 4: combine both normalisations -- take the spaCy lemma of each token
# and run the Porter stemmer on that lemma, then re-join with single spaces.
doc_special = [
    ' '.join([stemmer.stem(lemma) for lemma in (tok.lemma_ for tok in parsed)])
    for parsed in nlp.pipe(tqdm(docs['doc_text']))
]

query_special = [
    ' '.join([stemmer.stem(lemma) for lemma in (tok.lemma_ for tok in parsed)])
    for parsed in nlp.pipe(tqdm(queries['query_text']))
]


  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Repeat the noisy vectorisation NUM_ITERS times and average the scores.
NUM_ITERS = 100
RANDOM_SEED = 42
# Cut-offs at which precision is evaluated. The width of `precisions`, the
# evaluation loop and the summary lines are all derived from this tuple, so
# changing the cut-offs only requires editing it here.
K_VALUES = (1, 5, 10)

vocabs = np.zeros(NUM_ITERS)                         # vocabulary size per run
precisions = np.zeros((NUM_ITERS, len(K_VALUES)))    # one column per cut-off

# Seed NumPy's global generator once, before the first run.
# NOTE(review): assumes get_vectors draws its noise from np.random -- confirm
# in utils.
np.random.seed(RANDOM_SEED)

for i in trange(NUM_ITERS):
    # Add Gaussian Noise
    docVectors, queryVectors, vocab = get_vectors(
        doc_special, query_special, add_noise=True)

    vocabs[i] = len(vocab)
    cosine_similarities = cosine_similarity(queryVectors, docVectors)

    for j, k in enumerate(K_VALUES):
        precisions[i, j] = calculate_precision_at_k(docs, queries,
                                                    cosine_similarities, k)
    # END for j, k

    print(precisions[i])
# END for i

print(f"Mean Vocab Size   : {vocabs.mean():.1f}")
for j, k in enumerate(K_VALUES):
    # `{k:<3}` left-aligns k in 3 characters so the colons stay lined up
    print(f"Mean Precision@{k:<3}: {precisions[:, j].mean():.4f}")

  0%|          | 0/100 [00:00<?, ?it/s]

[0.62    0.8055  0.84275]
[0.59    0.8225  0.84275]
[0.61    0.8225  0.84375]
[0.6    0.8105 0.8515]
[0.61    0.8305  0.84275]
[0.62   0.8135 0.8415]
[0.59    0.8205  0.84375]
[0.63    0.8285  0.84275]
[0.62    0.8105  0.86275]
[0.62    0.8135  0.84275]
[0.64    0.8075  0.84275]
[0.63   0.8135 0.8515]
[0.61    0.8255  0.84275]
[0.61   0.8205 0.8415]
[0.61    0.8105  0.85275]
[0.61    0.8125  0.84275]
[0.61    0.8105  0.84275]
[0.6    0.8085 0.8515]
[0.61    0.8385  0.85275]
[0.6     0.8305  0.85275]
[0.63    0.8185  0.84275]
[0.62   0.8235 0.8515]
[0.6     0.8005  0.84275]
[0.61    0.8305  0.85275]
[0.62    0.8255  0.85275]
[0.6     0.8185  0.85275]
[0.61    0.8085  0.85275]
[0.6    0.8325 0.8515]
[0.61    0.8175  0.84275]
[0.59    0.8155  0.85275]
[0.6     0.8105  0.85275]
[0.58    0.8185  0.84375]
[0.6     0.8235  0.85275]
[0.61    0.8305  0.86275]
[0.6     0.8305  0.85275]
[0.64    0.8185  0.85275]
[0.63    0.8205  0.85275]
[0.63    0.8285  0.84275]
[0.59    0.8205  0.84275]
[0.62  