# importing libaries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from pathlib import Path
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import math

# **loading dataset** 

In [4]:
# param limiter => used to control number of training docs being loaded because of memory overload
limiter = 1000
def load_dataset():
    results = defaultdict(list)
    path = "HW04/documents"
    i = 0
    for file in Path(path).iterdir():
        if i < limiter:       
            with open(file, "r") as file_open:
                results["file_name"].append(file.name)
                results["text"].append(file_open.read())
            i = i+1

    return pd.DataFrame(results)


In [5]:

data = load_dataset()

In [6]:
#removing .txt extension
data['file_name'] = data['file_name'].apply(lambda name: name.split(".")[0])

In [7]:
data.shape

(1000, 2)

In [8]:

data.head()

Unnamed: 0,file_name,text
0,file_1,compact memories have flexible capacities a d...
1,file_10,highspeed microwave switching of semiconductor...
2,file_100,satellite observations of electrons artificial...
3,file_1000,the eta carinae nebula and centaurus a near mc...
4,file_10000,notes on the sunspot cycle analysis of a long...


In [10]:

path1 = "C:/Users/Anonymous_King/Downloads/binary-independence-model-main/binary-independence-model-main/model/HW04/file_label.txt"
labels = pd.read_csv(path1, names=["file_name", "label"], sep=",", header=None)
labels.head()

Unnamed: 0,file_name,label
0,file_0,0
1,file_1,0
2,file_2,0
3,file_3,0
4,file_4,0


In [11]:
#removing nulls
labels["label"].fillna(0, inplace=True)
labels["label"] = labels["label"].apply(lambda label: label if label != None else 0)
#concat labels to the documents dataframe
labels = labels.reindex(index=labels.index[::-1])

In [12]:

labels.head()

Unnamed: 0,file_name,label
11428,file_11428,0
11427,file_11427,0
11426,file_11426,0
11425,file_11425,0
11424,file_11424,0


In [13]:
# labels["label"][:limiter]
# data["labels"] = labels["label"][:limiter]
data["labels"] = ""
i = 1
for file_name in data["file_name"]:
  index = data[data["file_name"] == file_name].index.values[0]
  label = labels[labels["file_name"] == file_name]["label"]
  data["labels"][index] = label.values[0] if len(label) > 0 else 0
  # print(data["labels"][data.iloc[data["file_name"] == file_name].index])
# data["labels"] = labels["label"][:limiter]
data.head()

Unnamed: 0,file_name,text,labels
0,file_1,compact memories have flexible capacities a d...,0
1,file_10,highspeed microwave switching of semiconductor...,0
2,file_100,satellite observations of electrons artificial...,0
3,file_1000,the eta carinae nebula and centaurus a near mc...,0
4,file_10000,notes on the sunspot cycle analysis of a long...,0


configs

*   only necessary in google colab



In [14]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anonymous_King\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anonymous_King\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

tokenization

In [15]:
def tokenize(word_list):
    tokenized_text = []
    for text in word_list:
        tokenized_text.append(word_tokenize(text))
    return tokenized_text
data["text"] = list(tokenize(data["text"]))
  

In [16]:
data.head()

Unnamed: 0,file_name,text,labels
0,file_1,"[compact, memories, have, flexible, capacities...",0
1,file_10,"[highspeed, microwave, switching, of, semicond...",0
2,file_100,"[satellite, observations, of, electrons, artif...",0
3,file_1000,"[the, eta, carinae, nebula, and, centaurus, a,...",0
4,file_10000,"[notes, on, the, sunspot, cycle, analysis, of,...",0


change case to lower case

In [17]:

data['text'] = data['text'].apply(lambda list: [txt.lower() for txt in list])
data.head()

Unnamed: 0,file_name,text,labels
0,file_1,"[compact, memories, have, flexible, capacities...",0
1,file_10,"[highspeed, microwave, switching, of, semicond...",0
2,file_100,"[satellite, observations, of, electrons, artif...",0
3,file_1000,"[the, eta, carinae, nebula, and, centaurus, a,...",0
4,file_10000,"[notes, on, the, sunspot, cycle, analysis, of,...",0


stop word removal

In [18]:

stop_words = set(stopwords.words('english'))
data['text'] = data['text'].apply(lambda list: [txt for txt in list if not txt in stop_words])
data.head()


Unnamed: 0,file_name,text,labels
0,file_1,"[compact, memories, flexible, capacities, digi...",0
1,file_10,"[highspeed, microwave, switching, semiconducto...",0
2,file_100,"[satellite, observations, electrons, artificia...",0
3,file_1000,"[eta, carinae, nebula, centaurus, near, mc, pa...",0
4,file_10000,"[notes, sunspot, cycle, analysis, long, series...",0


stemming

In [19]:
porter_stemmer = PorterStemmer()
data['text'] = data['text'].apply(lambda list: [porter_stemmer.stem(txt) for txt in list])
data.head()

Unnamed: 0,file_name,text,labels
0,file_1,"[compact, memori, flexibl, capac, digit, data,...",0
1,file_10,"[highspe, microwav, switch, semiconductor, part]",0
2,file_100,"[satellit, observ, electron, artifici, inject,...",0
3,file_1000,"[eta, carina, nebula, centauru, near, mc, part...",0
4,file_10000,"[note, sunspot, cycl, analysi, long, seri, sun...",0


### **generating an inverted index of terms**
here we are going to create a dictionary with all words as keys and the values are the doc file names containing the word
*   *NOTE: The get_inverted_index function is memory and cpu intensive so it may take some time*



In [20]:
#first we get all words
wordlist = [txt for list in data['text'] for txt in list]
#get only unique values using set
wordlist = set(wordlist)
wordlist = list(wordlist)
wordlist

['viewpoint',
 'germanium',
 'month',
 'hercul',
 'binari',
 'dsb',
 'nerv',
 'nikecajun',
 'principl',
 'isoin',
 'concurr',
 'underli',
 'gate',
 'crystallin',
 'fail',
 'parabol',
 'spread',
 'recov',
 'cyrat',
 'control',
 'place',
 'tightli',
 'longitudin',
 'meteor',
 'bandwidth',
 'apart',
 'et',
 'nonessenti',
 'aid',
 'quit',
 'receiv',
 'ovr',
 'wiener',
 'input',
 'correspond',
 'filter',
 'tali',
 'kaenel',
 'telescop',
 'four',
 'absorb',
 'rapidli',
 'disk',
 'rf',
 'scan',
 'lake',
 'februari',
 'infrar',
 'horizon',
 'thermal',
 'appear',
 'divers',
 'unconvent',
 'immers',
 'sharpli',
 'finit',
 'paraboloid',
 'result',
 'fourth',
 'action',
 'toward',
 'equilibrium',
 'valu',
 'carbon',
 'argu',
 'collis',
 'microelectron',
 'bilater',
 'flux',
 'squar',
 'stress',
 'creas',
 'relat',
 'leakag',
 'extraordinari',
 'resistor',
 'experiment',
 'due',
 'set',
 'breadth',
 'anisotropi',
 'ambipolar',
 'impact',
 'hydromagnet',
 'onward',
 'quinari',
 'kc',
 'clear',
 'lea

In [24]:
def get_inverted_index():
    word_dict = defaultdict(set)  
    for i, doc in enumerate(data['text']):
        for word in wordlist:
            if word in doc:
                word_dict[word].add(data.iloc[i, 0])
    return word_dict

word_dict = get_inverted_index()
# word_dict = pd.DataFrame(get_inverted_index())
# word_dict.head()
word_dict

defaultdict(set,
            {'memori': {'file_1',
              'file_10157',
              'file_10471',
              'file_10474',
              'file_10615',
              'file_10736',
              'file_10737',
              'file_10873',
              'file_10876'},
             'data': {'file_1',
              'file_10011',
              'file_10012',
              'file_10013',
              'file_10015',
              'file_10023',
              'file_10043',
              'file_1005',
              'file_1010',
              'file_1012',
              'file_10138',
              'file_10157',
              'file_10164',
              'file_10174',
              'file_1026',
              'file_10269',
              'file_10270',
              'file_10273',
              'file_1028',
              'file_10284',
              'file_10287',
              'file_10298',
              'file_10323',
              'file_10336',
              'file_1035',
              'file_1037',

# **modelling**

get inverse document frequency
*   idf = log(total docs/total docs containing term)



In [25]:
def get_idf(word):
    return math.log(data.shape[0] / (len(word_dict[word])))


get rsv ranking weights


1.   generate rsv of each word based on the whole training documents
2.   generate rsv of doc based on specific doc provided



In [26]:
def get_relevant_docs(docs):
  
    files = []
    for file_name in list(docs):
        if int(data[data['file_name'] == file_name]["labels"]) == 1:
            files.append( file_name )
    return files


def generate_rsv_weights():
    weights = {}
    for word in wordlist:
        terms_in_relevant_docs = len(get_relevant_docs(word_dict[word]))
        prob_of_word = terms_in_relevant_docs/(data.shape[0]+0.5)
        _log = math.log(prob_of_word/(1-prob_of_word)) if prob_of_word > 0 else 0
        weights[word] = get_idf(word) + _log
    return weights

weights = generate_rsv_weights()

In [27]:
weights

{'viewpoint': 6.907755278982137,
 'germanium': 6.907755278982137,
 'month': 5.521460917862246,
 'hercul': 6.907755278982137,
 'binari': 4.8283137373023015,
 'dsb': 6.907755278982137,
 'nerv': 6.907755278982137,
 'nikecajun': 6.907755278982137,
 'principl': 4.509860006183766,
 'isoin': 6.907755278982137,
 'concurr': 6.907755278982137,
 'underli': 6.907755278982137,
 'gate': 6.214608098422191,
 'crystallin': 6.214608098422191,
 'fail': 6.907755278982137,
 'parabol': 6.214608098422191,
 'spread': 4.3428059215206005,
 'recov': 6.907755278982137,
 'cyrat': 6.907755278982137,
 'control': 3.3524072174927233,
 'place': 5.298317366548036,
 'tightli': 6.907755278982137,
 'longitudin': 4.8283137373023015,
 'meteor': 4.074541934925921,
 'bandwidth': 3.9633162998156966,
 'apart': 6.907755278982137,
 'et': 6.214608098422191,
 'nonessenti': 6.907755278982137,
 'aid': 5.809142990314028,
 'quit': 6.907755278982137,
 'receiv': 3.9633162998156966,
 'ovr': 6.907755278982137,
 'wiener': 6.907755278982137,


getting total doc rsv score

In [29]:
#calculates the rsv of a document based on the query provided
def generate_doc_rsv(file_name, query):
    result = 0
    doc_list = data[data["file_name"] == file_name]["text"]
    for doc in doc_list:
        for word in doc:
            if word in query:
                result += weights[word]
    return result


rank docs by query provided

In [39]:

#retreive all docs containing our word
#get the rsv score of the retrieved docs
def rank_docs(query):
    results = {
        'file_name': [],
        'score': []
    }
    docs = list()
    for word in query:
        docs = [*docs, *list(word_dict[word])]
    for file_name in docs:
        results['file_name'].append(file_name)
        results['score'].append(generate_doc_rsv(file_name, query))
    results = pd.DataFrame(results).sort_values('score', ascending=False)
    return results


# search results/ prediction

In [40]:
#preprocess the query first then rank
def search_query(search_query, retrieved_docs = 10):
    #tokenize query
    query = list(tokenize([search_query]))[0]
    #lower case
    query = [txt.lower() for txt in query]
    #stop words
    query = [txt for txt in query if not txt in stop_words]
    #stemming
    query = [porter_stemmer.stem(txt) for txt in query]
    return rank_docs(query).head(retrieved_docs)

get the query

get first top 10 results

In [49]:
result = search_query("calculator", 10)
result

Unnamed: 0,file_name,score
12,file_10053,8.869535
0,file_10077,2.956512
39,file_10008,2.956512
29,file_10135,2.956512
30,file_10032,2.956512
31,file_10588,2.956512
32,file_10054,2.956512
33,file_10087,2.956512
34,file_10827,2.956512
35,file_10853,2.956512
