# importing libaries

In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from pathlib import Path
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import math

# **loading dataset** 

In [55]:
# param limiter => used to control number of training docs being loaded because of memory overload
limiter = 1000
def load_dataset():
    results = defaultdict(list)
    path = "docs/documents"
    i = 0
    for file in Path(path).iterdir():
        if i < limiter:       
            with open(file, "r") as file_open:
                results["file_name"].append(file.name)
                results["text"].append(file_open.read())
            i = i+1

    return pd.DataFrame(results)


In [56]:

data = load_dataset()

In [57]:
#removing .txt extension
data['file_name'] = data['file_name'].apply(lambda name: name.split(".")[0])

In [58]:
data.shape

(1000, 2)

In [59]:

data.head()

Unnamed: 0,file_name,text
0,file_2572,pulse amplification using impact ionization in...
1,file_10570,apparent reflection heights and reflection pro...
2,file_6850,a new ionosphere sounder frequency sweep appa...
3,file_4119,the complex formulation of the equations for t...
4,file_1693,lunar tides in at brisbane\n


In [60]:

path1 = "docs/file_label.txt"
labels = pd.read_csv(path1, names=["file_name", "label"], sep=",", header=None)
labels.head()

Unnamed: 0,file_name,label
0,file_0,0
1,file_1,0
2,file_2,0
3,file_3,0
4,file_4,0


In [61]:
#removing nulls
labels["label"].fillna(0, inplace=True)
labels["label"] = labels["label"].apply(lambda label: label if label != None else 0)
#concat labels to the documents dataframe
labels = labels.reindex(index=labels.index[::-1])

In [62]:

labels.head()

Unnamed: 0,file_name,label
11428,file_11428,0
11427,file_11427,0
11426,file_11426,0
11425,file_11425,0
11424,file_11424,0


In [63]:
# labels["label"][:limiter]
# data["labels"] = labels["label"][:limiter]
data["labels"] = ""
i = 1
for file_name in data["file_name"]:
    index = data[data["file_name"] == file_name].index.values[0]
    label = labels[labels["file_name"] == file_name]["label"]
    data["labels"][index] = label.values[0] if len(label) > 0 else 0
  # print(data["labels"][data.iloc[data["file_name"] == file_name].index])
# data["labels"] = labels["label"][:limiter]
data.head()

Unnamed: 0,file_name,text,labels
0,file_2572,pulse amplification using impact ionization in...,0
1,file_10570,apparent reflection heights and reflection pro...,0
2,file_6850,a new ionosphere sounder frequency sweep appa...,0
3,file_4119,the complex formulation of the equations for t...,0
4,file_1693,lunar tides in at brisbane\n,0


configs

*   only necessary in google colab



In [64]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/suleman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/suleman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

tokenization

In [65]:
def tokenize(word_list):
    tokenized_text = []
    for text in word_list:
        tokenized_text.append(word_tokenize(text))
    return tokenized_text
data["text"] = list(tokenize(data["text"]))
  

In [66]:
data.head()

Unnamed: 0,file_name,text,labels
0,file_2572,"[pulse, amplification, using, impact, ionizati...",0
1,file_10570,"[apparent, reflection, heights, and, reflectio...",0
2,file_6850,"[a, new, ionosphere, sounder, frequency, sweep...",0
3,file_4119,"[the, complex, formulation, of, the, equations...",0
4,file_1693,"[lunar, tides, in, at, brisbane]",0


change case to lower case

In [67]:

data['text'] = data['text'].apply(lambda list: [txt.lower() for txt in list])
data.head()

Unnamed: 0,file_name,text,labels
0,file_2572,"[pulse, amplification, using, impact, ionizati...",0
1,file_10570,"[apparent, reflection, heights, and, reflectio...",0
2,file_6850,"[a, new, ionosphere, sounder, frequency, sweep...",0
3,file_4119,"[the, complex, formulation, of, the, equations...",0
4,file_1693,"[lunar, tides, in, at, brisbane]",0


stop word removal

In [68]:

stop_words = set(stopwords.words('english'))
data['text'] = data['text'].apply(lambda list: [txt for txt in list if not txt in stop_words])
data.head()


Unnamed: 0,file_name,text,labels
0,file_2572,"[pulse, amplification, using, impact, ionizati...",0
1,file_10570,"[apparent, reflection, heights, reflection, pr...",0
2,file_6850,"[new, ionosphere, sounder, frequency, sweep, a...",0
3,file_4119,"[complex, formulation, equations, two, dimensi...",0
4,file_1693,"[lunar, tides, brisbane]",0


stemming

In [69]:
porter_stemmer = PorterStemmer()
data['text'] = data['text'].apply(lambda list: [porter_stemmer.stem(txt) for txt in list])
data.head()

Unnamed: 0,file_name,text,labels
0,file_2572,"[puls, amplif, use, impact, ioniz, germanium, ...",0
1,file_10570,"[appar, reflect, height, reflect, properti, io...",0
2,file_6850,"[new, ionospher, sounder, frequenc, sweep, app...",0
3,file_4119,"[complex, formul, equat, two, dimension, space...",0
4,file_1693,"[lunar, tide, brisban]",0


### **generating an inverted index of terms**
here we are going to create a dictionary with all words as keys and the values are the doc file names containing the word
*   *NOTE: The get_inverted_index function is memory and cpu intensive so it may take some time*



In [70]:
#first we get all words
wordlist = [txt for list in data['text'] for txt in list]
#get only unique values using set
wordlist = set(wordlist)
wordlist = list(wordlist)
wordlist

['accord',
 'dial',
 'dynatron',
 'airborn',
 'storag',
 'support',
 'z',
 'meter',
 'exampl',
 'discharg',
 'npn',
 'consist',
 'special',
 'swept',
 'al',
 'presenc',
 'suppos',
 'ledg',
 'among',
 'iter',
 'administr',
 'alphanumer',
 'pt',
 'spectroscopi',
 'stage',
 'eleven',
 'phoenicid',
 'sidiod',
 'creat',
 'impur',
 'lump',
 'thevenin',
 'though',
 'quantit',
 'messag',
 'state',
 'mgo',
 'carryii',
 'packag',
 'gradient',
 'evid',
 'warm',
 'lithium',
 'variant',
 'brown',
 'ideal',
 'ba',
 'thk',
 'optimum',
 'probe',
 'descript',
 'wavelength',
 'formal',
 'biharmon',
 'irrevers',
 'vhf',
 'sakatoon',
 'cl',
 'kerguelen',
 'norton',
 'ofr',
 'occurr',
 'fact',
 'medium',
 'barkhausen',
 'germinid',
 'econom',
 'waveband',
 'mar',
 'ferro',
 'langevin',
 'carbid',
 'examin',
 'ten',
 'superpos',
 'satisfi',
 'specif',
 'group',
 'didact',
 'research',
 'victoria',
 'discrim',
 'equinox',
 'curvatur',
 'proxim',
 'deliv',
 'run',
 'convect',
 'csiro',
 'trlation',
 'gorelik'

In [71]:
def get_inverted_index():
    word_dict = defaultdict(set)  
    for i, doc in enumerate(data['text']):
        for word in wordlist:
            if word in doc:
                word_dict[word].add(data.iloc[i, 0])
    return word_dict

word_dict = get_inverted_index()
# word_dict = pd.DataFrame(get_inverted_index())
# word_dict.head()
word_dict

defaultdict(set,
            {'impur': {'file_1209',
              'file_2094',
              'file_2572',
              'file_3320',
              'file_4760',
              'file_5184',
              'file_5196'},
             'germanium': {'file_2572',
              'file_3030',
              'file_3403',
              'file_4519',
              'file_6442',
              'file_8127'},
             'rang': {'file_10219',
              'file_10259',
              'file_10349',
              'file_10476',
              'file_11103',
              'file_11297',
              'file_11336',
              'file_1209',
              'file_1351',
              'file_1487',
              'file_1734',
              'file_1737',
              'file_1754',
              'file_2073',
              'file_2184',
              'file_2339',
              'file_2531',
              'file_2572',
              'file_2801',
              'file_2893',
              'file_3158',
              'file_3323',

# **modelling**

get inverse document frequency
*   idf = log(total docs/total docs containing term)



In [72]:
def get_idf(word):
    return math.log(data.shape[0] / (len(word_dict[word])))


get rsv ranking weights


1.   generate rsv of each word based on the whole training documents
2.   generate rsv of doc based on specific doc provided



In [73]:
def get_relevant_docs(docs):
  
    files = []
    for file_name in list(docs):
        if int(data[data['file_name'] == file_name]["labels"]) == 1:
            files.append( file_name )
    return files


def generate_rsv_weights():
    weights = {}
    for word in wordlist:
        terms_in_relevant_docs = len(get_relevant_docs(word_dict[word]))
        prob_of_word = terms_in_relevant_docs/(data.shape[0]+0.5)
        _log = math.log(prob_of_word/(1-prob_of_word)) if prob_of_word > 0 else 0
        weights[word] = get_idf(word) + _log
    return weights

weights = generate_rsv_weights()

In [74]:
weights

{'accord': 5.521460917862246,
 'dial': 6.907755278982137,
 'dynatron': 6.907755278982137,
 'airborn': 6.214608098422191,
 'storag': 3.9633162998156966,
 'support': 4.509860006183766,
 'z': 5.298317366548036,
 'meter': 5.809142990314028,
 'exampl': 4.3428059215206005,
 'discharg': -3.496007436424798,
 'npn': 6.214608098422191,
 'consist': 3.7297014486341915,
 'special': 4.199705077879927,
 'swept': 6.907755278982137,
 'al': 5.298317366548036,
 'presenc': 4.8283137373023015,
 'suppos': 6.907755278982137,
 'ledg': 6.907755278982137,
 'among': 6.214608098422191,
 'iter': 6.907755278982137,
 'administr': 6.907755278982137,
 'alphanumer': 6.907755278982137,
 'pt': 5.809142990314028,
 'spectroscopi': 6.214608098422191,
 'stage': 3.912023005428146,
 'eleven': 6.907755278982137,
 'phoenicid': 6.907755278982137,
 'sidiod': 6.907755278982137,
 'creat': 5.809142990314028,
 'impur': 4.961845129926823,
 'lump': 6.214608098422191,
 'thevenin': 6.907755278982137,
 'though': 5.298317366548036,
 'quanti

getting total doc rsv score

In [75]:
#calculates the rsv of a document based on the query provided
def generate_doc_rsv(file_name, query):
    result = 0
    doc_list = data[data["file_name"] == file_name]["text"]
    for doc in doc_list:
        for word in doc:
            if word in query:
                result += weights[word]
    return result


rank docs by query provided

In [76]:

#retreive all docs containing our word
#get the rsv score of the retrieved docs
def rank_docs(query):
    results = {
        'file_name': [],
        'score': []
    }
    docs = list()
    for word in query:
        docs = [*docs, *list(word_dict[word])]
    for file_name in docs:
        results['file_name'].append(file_name)
        results['score'].append(generate_doc_rsv(file_name, query))
    results = pd.DataFrame(results).sort_values('score', ascending=False)
    return results


# search results/ prediction

In [77]:
#preprocess the query first then rank
def search_query(search_query, retrieved_docs = 10):
    #tokenize query
    query = list(tokenize([search_query]))[0]
    #lower case
    query = [txt.lower() for txt in query]
    #stop words
    query = [txt for txt in query if not txt in stop_words]
    #stemming
    query = [porter_stemmer.stem(txt) for txt in query]
    return rank_docs(query).head(retrieved_docs)

get the query

get first top 10 results

In [78]:
result = search_query("calculator", 10)
result

Unnamed: 0,file_name,score
0,file_6179,-4.330233
37,file_3419,-4.330233
55,file_2249,-4.330233
54,file_4345,-4.330233
53,file_10380,-4.330233
52,file_6115,-4.330233
51,file_8874,-4.330233
50,file_9782,-4.330233
49,file_8943,-4.330233
47,file_7867,-4.330233
