In [1]:
import pandas as pd
import nltk
import itertools
import math
import operator
from statistics import mean
from nltk.corpus import stopwords
from nltk.stem import *
import os,sys
import re, string, unicodedata
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize

In [2]:
# Preprocessing query

def strip_html(text):
    """Return the text content of ``text`` with HTML markup removed.

    The string is parsed with BeautifulSoup's built-in "html.parser"
    backend and only the extracted text is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in ``words``.

    Each token is NFKD-normalised before encoding, so accented letters are
    decomposed into a base letter plus combining marks; the ASCII encode
    then drops whatever cannot be represented. Tokens that end up empty
    are kept (as empty strings), so the output has the same length as the
    input.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list with every token of ``words`` lower-cased."""
    return [token.lower() for token in words]

def remove_numbers(words):
    """Delete every run of digits from each token in ``words``.

    Digits are removed, not spelled out. Tokens that become empty once
    their digits are gone (i.e. purely numeric tokens) are dropped from
    the result.
    """
    digit_free = (re.sub(r'\d+', '', token) for token in words)
    return [token for token in digit_free if token != '']

def remove_stopwords(words):
    """Return the tokens of ``words`` that are not NLTK English stop words.

    The comparison is an exact string match against the NLTK "english"
    stop-word list, so tokens are expected to be lower-cased already.
    """
    english_stopwords = set(stopwords.words("english"))
    return [token for token in words if token not in english_stopwords]

def stem_words(words):
    """Return the Lancaster stem of every token in ``words``, in order."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lexical_analysis(words):
    """Normalise a token list: ASCII-fold, then lower-case, then strip digits.

    The three steps run in exactly that order; tokens left empty after
    digit removal are dropped by ``remove_numbers``.
    """
    return remove_numbers(to_lowercase(remove_non_ascii(words)))

def preprocess_query(query):
    """Turn a raw query string into a list of stemmed, filtered tokens.

    Pipeline: punctuation -> spaces, tokenise, lexical normalisation
    (ASCII-fold, lower-case, strip digits), stop-word removal, Lancaster
    stemming, and a second stop-word removal on the stems.
    """
    # Map every punctuation character to a space so it separates tokens
    # instead of being glued to them.
    punctuation_to_space = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    tokens = word_tokenize(query.translate(punctuation_to_space))

    content_tokens = remove_stopwords(lexical_analysis(tokens))

    # Stop words are filtered once more after stemming (presumably to drop
    # stems that coincide with a stop word).
    return remove_stopwords(stem_words(content_tokens))

# Part one - Ranking pages according to query

In [3]:
# Load the term-by-document matrix from the spreadsheet; its first column
# ("Unnamed: 0") holds the terms and becomes the row index.
df = pd.read_excel("inverted_index.xlsx",index_col="Unnamed: 0")
# Independent copy of the freshly loaded frame; a later cell uses it to print
# the per-document entries for each query term.
inverted_index = df.copy()

In [4]:
# Display the loaded term-by-document matrix.
df

Unnamed: 0,T1.txt,T10.txt,T2.txt,T3.txt,T4.txt,T5.txt,T6.txt,T7.txt,T8.txt,T9.txt
abandon,0,2,0,0,0,0,0,0,0,1
abbrevy,0,0,0,0,0,0,0,1,0,0
abdom,0,0,0,0,0,0,0,1,0,0
abdomin,0,0,0,0,0,0,0,1,0,0
aberdeen,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
zint,0,0,2,0,0,0,0,0,0,12
zip,0,1,0,0,0,0,0,0,0,0
zon,0,0,0,0,0,0,0,2,0,0
zoolog,5,0,0,0,0,0,1,2,0,0


In [5]:
def normalize_tf(df):
    """Scale every column of ``df`` by that column's maximum value.

    Parameters
    ----------
    df : pandas.DataFrame
        Term-by-document matrix; each column is normalised independently.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every column whose maximum is non-zero has
        been divided by that maximum. Columns whose maximum is 0 are
        returned unchanged (avoids a division by zero).

    Notes
    -----
    The input frame is left untouched: normalisation is applied to a copy,
    so re-running the calling cell always starts from the same data.
    """
    normalized = df.copy()
    for column in normalized:
        # Series.max() ignores NaN, unlike the builtin max() whose result
        # depends on where a NaN happens to sit in the column.
        peak = normalized[column].max()
        if peak != 0:
            normalized[column] = normalized[column] / peak
    return normalized

def calculate_idf(df):
    """Compute the inverse document frequency of every term in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Term-by-document matrix (one row per term, one column per
        document). Any non-zero cell counts as "term occurs in document".

    Returns
    -------
    dict
        Maps each index label to ``log2(N / n_t)``, where ``N`` is the
        number of columns and ``n_t`` the number of columns with a
        non-zero entry in that row. A term that occurs in no document
        gets an IDF of 0.0 instead of triggering a division by zero
        (which would yield ``inf`` and later ``NaN`` TF-IDF weights).
    """
    n_docs = df.shape[1]
    # Casting to bool turns "count > 0" into True, so the row sum is the
    # number of documents that contain the term.
    doc_freq = df.astype(bool).sum(axis=1)

    idf_score = {}
    for word, freq in doc_freq.items():
        freq = int(freq)
        idf_score[word] = math.log2(n_docs / freq) if freq else 0.0
    return idf_score

def calculate_tfidf(data, idf_score):
    """Weight term frequencies by their inverse document frequency.

    Parameters
    ----------
    data : pandas.DataFrame or dict of dict
        Term frequencies keyed by document (DataFrame column / outer dict
        key) and then by term (row label / inner dict key).
    idf_score : dict
        IDF value for every term that appears in ``data``.

    Returns
    -------
    dict
        Maps each document to a container of the same kind as the input
        held for it (a Series for a DataFrame column, a dict for a dict)
        with every term frequency multiplied by the term's IDF.

    Raises
    ------
    KeyError
        If a term in ``data`` has no entry in ``idf_score``.

    Notes
    -----
    ``data`` is not modified: each document's scores are written into a
    copy, so the caller's term-frequency table keeps its values regardless
    of the pandas version or copy-on-write setting.
    """
    scores = {}
    for doc, tf_scores in data.items():
        weighted = tf_scores.copy()
        for token, tf in tf_scores.items():
            weighted[token] = tf * idf_score[token]
        scores[doc] = weighted
    return scores

In [6]:
# Scale each document column by its largest term count.
normalized_tf = normalize_tf(df)
# Inverse document frequency of every term, derived from the scaled matrix
# (only zero vs. non-zero matters for the document counts).
idf_score = calculate_idf(normalized_tf)
# Weight the scaled term frequencies by IDF; the result is keyed by document.
tf_idf_docs = calculate_tfidf(normalized_tf,idf_score)

In [7]:
# Read a free-text query from the user.
query = input("Enter the query : ")
# Remove any HTML markup, then reduce the query to preprocessed tokens
# (tokenised, normalised, stop words removed, stemmed).
query = strip_html(query)
query_words = preprocess_query(query)

Enter the query : lung fish


In [8]:
def tf_query(query_words, vocabulary=None):
    """Count how often each vocabulary term occurs in the query.

    Parameters
    ----------
    query_words : list of str
        Preprocessed query tokens.
    vocabulary : iterable of str, optional
        Terms to report counts for. Defaults to the index of the
        module-level ``df`` so existing one-argument calls keep working.

    Returns
    -------
    dict
        ``{"query": {term: count}}`` with one entry per vocabulary term
        (0 for terms absent from the query). Query tokens that are not in
        the vocabulary are ignored.
    """
    if vocabulary is None:
        vocabulary = df.index

    counts = dict.fromkeys(vocabulary, 0)
    # Single pass over the query; each occurrence of a known term adds one.
    for qword in query_words:
        if qword in counts:
            counts[qword] += 1
    return {"query": counts}

In [9]:
# Raw term counts of the query over the indexed vocabulary, as a one-column
# ("query") DataFrame.
tf_for_query = tf_query(query_words)
tf_for_query = pd.DataFrame(tf_for_query)
# Apply the same max-scaling and IDF weighting that the documents received,
# reusing the IDF values computed from the document collection.
normalized_tf_for_query = normalize_tf(tf_for_query)
tf_idf_query = calculate_tfidf(normalized_tf_for_query,idf_score)
# calculate_tfidf returns a dict keyed by column name; rebuild DataFrames so
# the similarity code can index by column and by term.
tf_idf_query = pd.DataFrame(tf_idf_query)
tf_idf_docs = pd.DataFrame(tf_idf_docs)

In [10]:
# Display the TF-IDF weights of every term in every document.
tf_idf_docs

Unnamed: 0,T1.txt,T10.txt,T2.txt,T3.txt,T4.txt,T5.txt,T6.txt,T7.txt,T8.txt,T9.txt
abandon,0.000000,0.052178,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.022764
abbrevy,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.042050,0.0,0.000000
abdom,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.042050,0.0,0.000000
abdomin,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.042050,0.0,0.000000
aberdeen,0.036505,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...
zint,0.000000,0.000000,0.067302,0.0,0.0,0.0,0.000000,0.000000,0.0,0.273168
zip,0.000000,0.037325,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
zon,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.084099,0.0,0.000000
zoolog,0.095438,0.000000,0.000000,0.0,0.0,0.0,0.020435,0.043974,0.0,0.000000


In [11]:
# Display the TF-IDF weights of the query over the same vocabulary.
tf_idf_query

Unnamed: 0,query
abandon,0.0
abbrevy,0.0
abdom,0.0
abdomin,0.0
aberdeen,0.0
...,...
zint,0.0
zip,0.0
zon,0.0
zoolog,0.0


In [12]:
def get_similarity(tf_idf_docs,tf_idf_query):
    """Cosine similarity between the query and every document.

    Parameters
    ----------
    tf_idf_docs : pandas.DataFrame
        TF-IDF weights, one row per term and one column per document.
    tf_idf_query : pandas.DataFrame
        TF-IDF weights of the query in a column named "query", indexed by
        the same terms as ``tf_idf_docs``.

    Returns
    -------
    dict
        ``{"query": {document: similarity}}``. If the query vector has
        zero length (no query term is in the vocabulary) a message is
        printed and every similarity is 0. A document whose vector has
        zero length gets similarity 0.0 rather than NaN.
    """
    query_vec = tf_idf_query["query"]
    query_length = math.sqrt(float((query_vec ** 2).sum()))

    if query_length == 0:
        print("Your terms in query did not match any document")
        return {"query": {column: 0 for column in tf_idf_docs}}

    # Vectorised over terms: one dot product and one Euclidean norm per
    # document column, aligned with the query on the term index.
    dot_products = tf_idf_docs.mul(query_vec, axis=0).sum(axis=0)
    doc_lengths = (tf_idf_docs ** 2).sum(axis=0) ** 0.5

    similarities = {}
    for column in tf_idf_docs:
        doc_len = float(doc_lengths[column])
        if doc_len == 0:
            # An all-zero document has no direction; treat it as unrelated.
            similarities[column] = 0.0
        else:
            similarities[column] = float(dot_products[column]) / (doc_len * query_length)

    return {"query": similarities}

In [13]:
# Cosine similarity of the query against every document ...
rank = get_similarity(tf_idf_docs,tf_idf_query)
# ... turned into a one-column frame and ordered from most to least similar.
rank = pd.DataFrame(rank).sort_values("query",ascending=False)

In [14]:
# Display the documents ranked by similarity to the query.
rank

Unnamed: 0,query
T7.txt,0.12797
T1.txt,0.030924
T4.txt,0.01523
T6.txt,0.004762
T10.txt,0.0
T2.txt,0.0
T3.txt,0.0
T5.txt,0.0
T8.txt,0.0
T9.txt,0.0


In [15]:
# For every preprocessed query term, print its row of `inverted_index` (the
# copy of the loaded matrix), or a notice when the term is not indexed.
for word in query_words:
    print("\n")  # print() appends its own newline, so this emits two line breaks
    if word in inverted_index.index:
        print(word)
        print(inverted_index.loc[word])
    else:
        print(word,"No entry in inverted index")
    print("\n")



lung
T1.txt     0
T10.txt    0
T2.txt     0
T3.txt     0
T4.txt     0
T5.txt     0
T6.txt     0
T7.txt     7
T8.txt     0
T9.txt     0
Name: lung, dtype: int64




fish
T1.txt     11
T10.txt     0
T2.txt      0
T3.txt      0
T4.txt      2
T5.txt      0
T6.txt      2
T7.txt     30
T8.txt      0
T9.txt      0
Name: fish, dtype: int64




# Part two (Documents similarity)

In [16]:
def compare_documents(tf_idf_docs):
    """Pairwise cosine similarity between all document columns.

    Parameters
    ----------
    tf_idf_docs : pandas.DataFrame
        TF-IDF weights, one row per term and one column per document.

    Returns
    -------
    dict
        ``compare[doc_a][doc_b]`` is the cosine similarity between the two
        document vectors. Any pair involving a document whose vector has
        zero length is reported as 0.0 rather than NaN.
    """
    # Gram matrix: entry (i, j) is the dot product of document columns i and
    # j, so the diagonal holds each document's squared Euclidean length.
    # One matrix product replaces the per-pair scalar loop over all terms.
    gram = tf_idf_docs.T.dot(tf_idf_docs)
    columns = list(tf_idf_docs)
    lengths = {column: math.sqrt(float(gram.at[column, column])) for column in columns}

    compare = {}
    for column1 in columns:
        similarities = {}
        for column in columns:
            denominator = lengths[column] * lengths[column1]
            if denominator == 0:
                similarities[column] = 0.0
            else:
                similarities[column] = float(gram.at[column, column1]) / denominator
        compare[column1] = similarities

    return compare

In [17]:
# Pairwise cosine similarity between all documents, as nested dicts ...
compare = compare_documents(tf_idf_docs)
# ... reshaped into a square document-by-document DataFrame.
compare = pd.DataFrame(compare)

In [18]:
# Display the document-by-document similarity matrix.
compare

Unnamed: 0,T1.txt,T10.txt,T2.txt,T3.txt,T4.txt,T5.txt,T6.txt,T7.txt,T8.txt,T9.txt
T1.txt,1.0,0.129088,0.21132,0.199237,0.215648,0.186144,0.220691,0.231335,0.13053,0.21202
T10.txt,0.129088,1.0,0.043512,0.045506,0.047392,0.025587,0.045124,0.032596,0.017578,0.17489
T2.txt,0.21132,0.043512,1.0,0.108591,0.185985,0.142199,0.067285,0.042213,0.023447,0.229504
T3.txt,0.199237,0.045506,0.108591,1.0,0.163657,0.122125,0.068376,0.044078,0.03192,0.045706
T4.txt,0.215648,0.047392,0.185985,0.163657,1.0,0.181649,0.073455,0.063409,0.02703,0.065671
T5.txt,0.186144,0.025587,0.142199,0.122125,0.181649,1.0,0.060386,0.051205,0.02417,0.073431
T6.txt,0.220691,0.045124,0.067285,0.068376,0.073455,0.060386,1.0,0.303141,0.128412,0.079425
T7.txt,0.231335,0.032596,0.042213,0.044078,0.063409,0.051205,0.303141,1.0,0.303478,0.048589
T8.txt,0.13053,0.017578,0.023447,0.03192,0.02703,0.02417,0.128412,0.303478,1.0,0.034061
T9.txt,0.21202,0.17489,0.229504,0.045706,0.065671,0.073431,0.079425,0.048589,0.034061,1.0


In [19]:
# Save the similarity matrix to a spreadsheet in the working directory.
compare.to_excel("comparison_among_documents.xlsx")