# Document Retrieval using TF-IDF Weighted Rank and TF-IDF Cosine Similarity

## Imports

In [16]:
# !unzip stories

In [17]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
# from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

# %load_ext autotime

In [18]:
title = "stories"
alpha = 0.3

## Taking all folders

In [19]:
folders = [x[0] for x in os.walk(str(os.getcwd())+'/'+title+'/')]
folders[0] = folders[0][:len(folders[0])-1]

In [20]:
folders

['C:\\Users\\hasee\\Downloads\\ISMD23/stories',
 'C:\\Users\\hasee\\Downloads\\ISMD23/stories/FARNON',
 'C:\\Users\\hasee\\Downloads\\ISMD23/stories/SRE']

## Collecting the file names and titles

In [21]:
dataset = []

c = False

for i in folders:
    file = open(i+"/index.html", 'r')
    text = file.read().strip()
    file.close()

    file_name = re.findall('><A HREF="(.*)">', text)
    file_title = re.findall('<BR><TD> (.*)\n', text)

    if c == False:
        file_name = file_name[2:]
        c = True
        
    print(len(file_name), len(file_title))

    for j in range(len(file_name)):
        dataset.append((str(i) +"/"+ str(file_name[j]), file_title[j]))

452 452
0 0
15 15


In [22]:
len(dataset)

467

In [23]:
N = len (dataset)

In [24]:
def print_doc(id):
    print(dataset[id])
    file = open(dataset[id][0], 'r', encoding='cp1250')
    text = file.read().strip()
    file.close()
    print(text)

# Preprocessing

In [27]:
def convert_lower_case(data):
    return np.char.lower(data)

In [28]:
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text +  " " + w
    return new_text

In [11]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [12]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [13]:
def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

In [14]:
# def convert_numbers(data):
#     tokens = word_tokenize(str(data))
#     new_text = ""
#     for w in tokens:
#         try:
#             w = num2words(int(w))
#         except:
#             a = 0
#         new_text = new_text + " " + w
#     new_text = np.char.replace(new_text, "-", " ")
#     return new_text

In [34]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
#     data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
#     data = convert_numbers(data)
    data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

## Extracting Data

In [37]:
processed_text = []
processed_title = []

for i in dataset[:N]:
    file = open(i[0], 'r', encoding="utf8", errors='ignore')
    text = file.read().strip()
    file.close()

    processed_text.append(word_tokenize(str(preprocess(text))))
    processed_title.append(word_tokenize(str(preprocess(i[1]))))

In [38]:
processed_text[2]

['adventur',
 'lone',
 'wolf',
 'scientif',
 'electron',
 'syndic',
 'seri',
 'follow',
 'exploit',
 'two',
 'madcap',
 'maven',
 'high',
 'technolog',
 'copyright',
 '1991',
 'michi',
 'peshota',
 'right',
 'reserv',
 'may',
 'distribut',
 'without',
 'accompani',
 'welcom',
 'lw',
 'episod',
 'lw',
 'file',
 'episod',
 '14',
 'smart',
 'bomb',
 'languag',
 'parser',
 'max',
 'attempt',
 'thwart',
 'last',
 'word',
 'bomb',
 'languag',
 'parser',
 'avail',
 'discov',
 'program',
 'code',
 'often',
 'stubborn',
 'human',
 'peshota',
 'whoever',
 'heard',
 'smart',
 'bomb',
 'languag',
 'parser',
 'heard',
 'grumbl',
 'austin',
 'watch',
 'wild',
 'hair',
 'officem',
 'bull',
 'like',
 'featur',
 'crea',
 'scowl',
 'hunch',
 'stack',
 'stack',
 'thesauru',
 'whip',
 'page',
 'cur',
 'bitterli',
 'nudnik',
 'programm',
 'would',
 'think',
 'make',
 'bomb',
 'verbal',
 'context',
 'sensit',
 'growl',
 'earlier',
 'even',
 'comput',
 'builder',
 'come',
 'condescend',
 'eye',
 'moist',
 'h

In [39]:
processed_title[2]

['smart', 'bomb', 'languag', 'parser']

## Calculating DF for all words

In [47]:
# TO COMPLETE. A utiliser à lq fois processed_title et processed_text
DF_set={}

for i in range(N):
    tokens = processed_text[i]
    for w in tokens:
        try:
            DF_set[w].add(i)
        except:
            DF_set[w] = {i}
# adventur: {2, 5, 10, 102，150，200，422}            

DF={}
for w in DF_set:
    DF[w] = len(DF_set[w])

In [48]:
DF["adventur"]

83

In [49]:
total_vocab_size = len(DF)

In [50]:
total_vocab_size

33347

In [51]:
total_vocab = [x for x in DF]

In [53]:
def doc_freq(word):
    c = 0
#     if word in DF:
#         c = DF[word]
    try:
        c = DF[word]
    except:
        pass
    return c

### Calculating TF-IDF for body, we will consider this as the actual tf-idf as we will add the title weight to this.

In [57]:
# tf_idf
# TO COMPLETE

doc = 0
tf_idf = {}

for i in range(N):
    tokens = processed_text[i]
    counter = Counter(tokens + processed_title[i] )
    words_count = len(tokens + processed_title[i])
    for token in np.unique(tokens):
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1))
        tf_idf[doc,token] = tf*idf #doc = i
    doc += 1

In [56]:
dictionnaire = Counter(["a","a","a","b","c","b"])
dictionnaire['a']

3

In [58]:
len(tf_idf) # Si c'est une matrice ou 2D array, la taille doit etre 467*33347

345414

### Calculating TF-IDF for Title

In [60]:
# tf_idf_title
tf_idf_title = {}
doc = 0
for i in range(N):
    tokens = processed_title[i]
    counter = Counter(tokens+ processed_text[i])
    words_count = len(tokens+ processed_text[i])
    for token in np.unique(tokens):
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1))
        tf_idf_title[doc,token] = tf*idf #doc = i
    doc += 1

In [61]:
tf_idf_title

{(0, '100'): 0.0059714916164150455,
 (0, '1990'): 0.004318449076706133,
 (0, '53'): 0.012122317672892532,
 (0, 'go'): 0.0002957099413838523,
 (0, 'jim'): 0.005360873675637692,
 (0, 'north'): 0.022298484316052403,
 (0, 'prentic'): 0.008224825448700788,
 (0, 'west'): 0.003383097675114372,
 (1, 'fox'): 0.11198195635330804,
 (1, 'sli'): 0.11239056533822733,
 (1, 'stori'): 0.0009822566737873984,
 (2, 'bomb'): 0.023990821442667936,
 (2, 'languag'): 0.028189403204991192,
 (2, 'parser'): 0.056944896820017764,
 (2, 'smart'): 0.014666842209690984,
 (3, 'garag'): 0.00883230483696307,
 (3, 'guy'): 0.003216660429842009,
 (3, 'pshota'): 0.004697072800548241,
 (3, 'two'): 0.002400159403090815,
 (4, '18'): 0.005355402262474138,
 (4, '1991'): 0.00585017476601832,
 (4, 'day'): 0.0009024936943444148,
 (4, 'earli'): 0.0028624714015357973,
 (4, 'high'): 0.007082839441307738,
 (4, 'magic'): 0.0035531435896279774,
 (4, 'novemb'): 0.003637801264739079,
 (4, 'peshota'): 0.013585856078218751,
 (4, 'start'): 0.0

## Merging the TF-IDF according to weights

In [46]:
for i in tf_idf:
    tf_idf[i] *= alpha

In [47]:
for i in tf_idf_title:
    tf_idf[i] = tf_idf_title[i]

In [None]:
# si "smart" n'apparait pas dans le titre, son score etait 0.2, son nouveau score devient 0.2*alpha = 0.06
# si "smart" apparait dans le titre, son score etait 0.2, Son nouveau score est tjs 0.2

# TF-IDF Matching Score Ranking

In [62]:
def matching_score(k, query):
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("Matching Score")
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    query_weights = {}
                        # (2, 'bomb'): 0.023990821442667936,
    for key in tf_idf:  # key = (2, 'bomb')
        
        if key[1] in tokens:   #key[1] = 'bomb'
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    print("")
    
    l = []
    
    for i in query_weights[:k]:
        l.append(i[0])
    
    print(l)
    

matching_score(10, "Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying")

Matching Score

Query: Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying

['without', 'drive', 'rebeccah', 'insist', 'kate', 'lost', 'momentum', 'stood', 'next', 'slat', 'oak', 'bench', 'canist', 'still', 'clutch', 'survey']

[200, 166, 352, 433, 211, 350, 175, 187, 188, 294]


In [64]:
#print_doc(200)

# TF-IDF Cosine Similarity Ranking

In [66]:
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

### Vectorising tf-idf

In [67]:
D = np.zeros((N, total_vocab_size))
for i in tf_idf:
    try:
        ind = total_vocab.index(i[1])
        D[i[0]][ind] = tf_idf[i]
    except:
        pass

In [68]:
def gen_vector(tokens):

    Q = np.zeros((len(total_vocab)))
    
    counter = Counter(tokens)
    words_count = len(tokens)

    query_weights = {}
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = math.log((N+1)/(df+1))

        try:
            ind = total_vocab.index(token)
            Q[ind] = tf*idf
        except:
            pass
    return Q

In [69]:
def cosine_similarity(k, query):
    print("Cosine Similarity")
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))
    
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    d_cosines = []
    
    query_vector = gen_vector(tokens)
    
    for d in D:
        d_cosines.append(cosine_sim(query_vector, d))
        
    out = np.array(d_cosines).argsort()[-k:][::-1]
    
    print("")
    
    print(out)

#     for i in out:
#         print(i, dataset[i][0])

Q = cosine_similarity(10, "Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying")

Cosine Similarity

Query: Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying

['without', 'drive', 'rebeccah', 'insist', 'kate', 'lost', 'momentum', 'stood', 'next', 'slat', 'oak', 'bench', 'canist', 'still', 'clutch', 'survey']

[200 433 166 175 169 402 211  63 290 151]


In [None]:

[200 433 166 175 169 402 211  63 290 151]

[200, 166, 352, 433, 211, 350, 175, 187, 188, 294]


In [55]:
print_doc(200)

('C:\\Users\\hasee\\Downloads\\2. TF-IDF Ranking - Cosine Similarity, Matching Score/stories/ghost', 'Time for Flowers, by Gay Bost')
TIME FOR FLOWERS
  by Gay Bost

They'd put flowers up. She hadn't noticed. Time wouldn't hold still.
She remembered, quite clearly, that time had been a simple thing; one
moment following the previous one, seconds strung out neatly like her
mother's pearls laid out on the dark mahogany vanity each Sunday
morning. But there had been a catch . . . 

Hung around Mother's neck the catch clicked and the tidy little line 
of seconds became a never ending circle with only the catch in the 
middle. For some reason the thought of pearls gathered from the sea, 
naturally nested within the confines of oyster shells, scattered 
haphazardly about the ocean floor disturbed her.

Now they'd put up the flowers in the same careless groupings. This,
too, disturbed her. Bright yellow trumpets, their collars spread to
catch the sun, dotted the front yard in clusters of two 

# 1ere Sceance

In [70]:
from nltk.tokenize import word_tokenize
text = "God is Great! I won a lottery."
print(word_tokenize(text))

['God', 'is', 'Great', '!', 'I', 'won', 'a', 'lottery', '.']


In [71]:
sample1="ben    ben" 
sample2 = "Ho! What a beautiful day. I don't want to stay at home." 
sample3 = "RT @angelababy: love you baby! :D http://ah.love #168cm"

In [72]:
word_tokenize(sample3) # A comparer avec  https://corenlp.run/

['RT',
 '@',
 'angelababy',
 ':',
 'love',
 'you',
 'baby',
 '!',
 ':',
 'D',
 'http',
 ':',
 '//ah.love',
 '#',
 '168cm']

In [73]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

In [74]:
plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
            'died', 'agreed', 'owned', 'humbled', 'sized',
            'meeting', 'stating', 'siezing', 'itemization',
            'sensational', 'traditional', 'reference', 'colonizer',
            'plotted']
singles = [stemmer.stem(plural) for plural in plurals]

In [75]:
singles

['caress',
 'fli',
 'die',
 'mule',
 'deni',
 'die',
 'agre',
 'own',
 'humbl',
 'size',
 'meet',
 'state',
 'siez',
 'item',
 'sensat',
 'tradit',
 'refer',
 'colon',
 'plot']

In [76]:
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()
 
print("rocks :", lemmatizer.lemmatize("rocks"))
print("corpora :", lemmatizer.lemmatize("corpora"))
 
# a denotes adjective in "pos"
print("better :", lemmatizer.lemmatize("better", pos ="a"))

rocks : rock
corpora : corpus
better : good
