In [14]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/yandex-personalized-web-search-challenge/terms_russian.pdf
/kaggle/input/yandex-personalized-web-search-challenge/random-baseline.gz
/kaggle/input/yandex-personalized-web-search-challenge/dramatica-comic-book-2004.pdf
/kaggle/input/yandex-personalized-web-search-challenge/test.gz
/kaggle/input/yandex-personalized-web-search-challenge/terms_english.pdf
/kaggle/input/yandex-personalized-web-search-challenge/non-personalised-baseline.gz
/kaggle/input/yandex-personalized-web-search-challenge/train.gz
/kaggle/input/ms-marco-queries/msmarco-doctrain-queries.tsv


In [28]:
import requests

from tqdm import tqdm
# Register tqdm's pandas integration, which adds progress-bar variants
# such as `progress_apply` to pandas objects.
tqdm.pandas()

# 1. Importing Files

In [15]:
# Load the MS MARCO training queries: a tab-separated file read without a header row,
# with the columns named (qid, query) and the frame indexed by qid.
queries_df = pd.read_csv(
    '../input/ms-marco-queries/msmarco-doctrain-queries.tsv',
    sep='\t',
    names=['qid', 'query'],
).set_index('qid')

# Peek at the first and the last ten queries.
display(queries_df.head(10), queries_df.tail(10))

Unnamed: 0_level_0,query
qid,Unnamed: 1_level_1
1185869,)what was the immediate impact of the success ...
1185868,_________ justice is designed to repair the ha...
1183785,elegxo meaning
645590,what does physical medicine do
186154,feeding rice cereal how many times per day
457407,most dependable affordable cars
441383,lithophile definition
683408,what is a flail chest
484187,put yourself on child support in texas
666321,what happens in a wrist sprain


Unnamed: 0_level_0,query
qid,Unnamed: 1_level_1
234427,how fast should a power chair go
285891,how many hours do you need for an aa
1164802,what causes dry cough?
87046,causes of irritated mouth tissue
562255,what are nephridia?
19285,anterolisthesis definition
558837,what are fishing flies
559149,what are fsh levels during perimenopause
706678,what is a yowie
405466,is carbonic acid soluble


In [30]:
# Print the content of the current directory, then the content of the input directory.
for folder in ('./', '../input'):
    print(os.listdir(folder))

['__notebook_source__.ipynb']
['yandex-personalized-web-search-challenge', 'ms-marco-queries']


# 2. EDA

In [16]:
# First, collect every row that has a missing value in at least one column.
null_rows = queries_df[queries_df.isna().any(axis='columns')]
display(null_rows)

# Cool, the result is empty: there aren't any null rows.

Unnamed: 0_level_0,query
qid,Unnamed: 1_level_1


In [17]:
# Now let's find the average length of a query, measured in characters

def return_length(n):
    """Return ``len(n)``.

    Applied to each query string below to obtain its length in characters.
    """
    size = len(n)
    return size

# Length of every query, summed over the whole dataset.
total_len = queries_df['query'].apply(return_length).sum()
# Mean length per query.
average_query_length = total_len / len(queries_df)
print(total_len, average_query_length, sep='\n')

# Roughly 33 characters per query seems reasonable.

12163703
33.14243092206543


In [18]:
# Average number of words per query

def number_of_words(n):
    """Return the number of whitespace-separated words in the string `n`.

    Uses ``str.split()`` with no argument, which splits on runs of
    whitespace and discards leading/trailing whitespace. Splitting on a
    single ' ' instead would count an empty string for every doubled,
    leading or trailing space, and would report 1 word for an empty query.

    Parameters
    ----------
    n : str
        The query text.

    Returns
    -------
    int
        Number of words; 0 for an empty or whitespace-only string.
    """
    return len(n.split())

# Word count of every query, summed over the whole dataset.
total_word_count = queries_df['query'].apply(number_of_words).sum()
# Mean number of words per query.
average_word_count = total_word_count / len(queries_df)
print(total_word_count, average_word_count, sep='\n')

# An average of about 6 words seems about right for a query

2185955
5.956069676006027


In [27]:
# Let's now build a vocabulary (word -> number of occurrences) from the ~2.1M words used in the queries

def build_vocab(sentences, verbose =  True):
    """Count how often each whitespace-separated token occurs in `sentences`.

    Parameters
    ----------
    sentences : iterable of str
        The texts to tokenize.
    verbose : bool, default True
        When true, iterate through a tqdm progress bar; when false, iterate
        over `sentences` directly with no bar.

    Returns
    -------
    dict
        Maps each token to its occurrence count. Keys appear in the order
        in which the tokens were first seen.
    """
    # Only wrap in tqdm when a progress bar is actually wanted.
    iterator = tqdm(sentences) if verbose else sentences
    vocab = {}
    for sentence in iterator:
        # split() with no argument collapses runs of whitespace, so doubled,
        # leading or trailing spaces do not create an empty-string token.
        for word in sentence.split():
            vocab[word] = vocab.get(word, 0) + 1
    return vocab

# Feed the raw query strings to the vocabulary builder.
sentences = queries_df['query'].values
vocab = build_vocab(sentences)

# Show the first five (token, count) entries of the vocabulary.
print(dict(list(vocab.items())[:5]))

# As we can see, some tokens are not clean words (e.g. ')what' carries stray punctuation). I'll have to handle that later

100%|██████████| 367013/367013 [00:01<00:00, 277032.36it/s]

{')what': 1, 'was': 10493, 'the': 77087, 'immediate': 27, 'impact': 153}





In [35]:
# Let's now sort the vocab by count, most frequent first, so that rare tokens
# (where misspellings are expected) end up at the tail
sorted_vocab = list(vocab.items())
sorted_vocab.sort(key=lambda entry: entry[1], reverse=True)

# Let's first see the top 5 most common words in the text, and their counts
print(sorted_vocab[:5])

# Let's now have a look at the last 50 terms in the sorted list (they will most probably be misspellings)
print(sorted_vocab[-50:])


# WOAH!! This is surprising: the last items of the sorted list aren't actually misspellings; many of them are just words ending with a question mark or bracket

[('what', 147773), ('is', 133329), ('the', 77087), ('of', 59991), ('a', 55371)]
[('abbati', 1), ('ridgedale', 1), ('diffuses', 1), ('metts', 1), ('pedipalp', 1), ('synulox', 1), ('pureblood', 1), ('atd', 1), ('westheimer', 1), ('tears?', 1), ('creo', 1), ('air,', 1), ('bulbasaur', 1), ('squirtle', 1), ('thrilling', 1), ('omelettes', 1), ('inherited?', 1), ('(how', 1), ('it)', 1), ("parent's", 1), ('electrocuted', 1), ('impinging', 1), ('industry-leading', 1), ('abuja', 1), ('bicubic', 1), ('spenic', 1), ('ex-convicts', 1), ('electroforming', 1), ('buttress?', 1), ('sparely', 1), ('diabetics?', 1), ('448', 1), ('mooring', 1), ('250k', 1), ('exercise/walking', 1), ('pulmonologist?', 1), ('tympanitis', 1), ('rheostat?', 1), ('nauseousness', 1), ('meggitt', 1), ('f81', 1), ('intergenic', 1), ('detasseling', 1), ('julington', 1), ('ciss', 1), ('fantastical', 1), ('filezilla', 1), ('clarisse.', 1), ('nephridia?', 1), ('yowie', 1)]
