# **Read Documents From Disk**

In [1]:
import os
import glob


def read_data(data_dir='data'):
    """Read every ``*.txt`` file located directly inside ``data_dir``.

    Paths are visited in sorted order so that document index ``i`` refers to
    the same file on every call; ``glob.glob`` itself does not guarantee any
    particular ordering.

    Args:
        data_dir: Directory to search (non-recursive). Defaults to ``'data'``.

    Returns:
        list[str]: The UTF-8 decoded contents of each matching file, one
        entry per file. Empty when no file matches.
    """
    pattern = os.path.join(data_dir, '*.txt')

    documents = []
    # Sort the matches: callers address documents by their list position.
    for file_path in sorted(glob.glob(pattern)):
        with open(file_path, encoding="utf8") as document:
            documents.append(document.read())

    return documents

# **Preprocessing Data**

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re

def document_to_words(document):
    """Turn raw text into a list of stemmed, stop-word-free tokens.

    The text is lower-cased, every character outside ``[a-zA-Z0-9]`` is
    replaced by a space, the result is split on whitespace, English stop
    words are dropped, and the remaining tokens are Porter-stemmed.

    Args:
        document: The text to tokenize.

    Returns:
        list[str]: Stemmed tokens in their original order.
    """
    nltk.download("stopwords", quiet=True)
    stemmer = PorterStemmer()
    # Load the stop-word list once per call and use a set for O(1) lookups,
    # instead of re-reading the corpus list for every token.
    stop_words = set(stopwords.words("english"))

    text = re.sub(r"[^a-zA-Z0-9]", " ", document.lower())
    # Reuse the single stemmer instance rather than building one per token.
    return [stemmer.stem(w) for w in text.split() if w not in stop_words]

In [3]:
def data_to_words(data):
    """Tokenize each document in ``data`` with ``document_to_words``.

    Args:
        data: Iterable of raw document strings.

    Returns:
        list: One token list per input document, in input order.
    """
    return [document_to_words(text) for text in data]

# Term Frequency In All Documents

In [4]:
def term_frequency(data):
    """Count how often each word occurs across all documents.

    Args:
        data: Iterable of documents, each an iterable of words.

    Returns:
        dict: Maps each word to its total number of occurrences.
    """
    totals = {}
    for tokens in data:
        for token in tokens:
            totals[token] = totals.get(token, 0) + 1
    return totals

# Auxiliary Structure (Positional Index)

In [5]:
def build_aux(data):
    """Build a positional index over the tokenized documents.

    Args:
        data: List of documents, each a list of words.

    Returns:
        dict: Maps each word to a list with one entry per document; entry
        ``d`` is the list of positions at which the word occurs in
        document ``d`` (empty when the word does not occur there).
    """
    doc_count = len(data)
    index = {}
    for doc_id, tokens in enumerate(data):
        for position, token in enumerate(tokens):
            if token not in index:
                # First sighting: allocate an empty position list per document.
                index[token] = [[] for _ in range(doc_count)]
            index[token][doc_id].append(position)
    return index

In [6]:
def phrase_query(query, aux):
    """Find the documents that contain ``query`` as an exact phrase.

    The query is tokenized with ``document_to_words``. A document matches
    only when ALL query terms occur at consecutive positions, in order.
    (Checking each adjacent pair of terms independently would wrongly accept
    documents that contain only part of a three-or-more-word phrase.)

    Args:
        query: Raw query text.
        aux: Positional index mapping word -> per-document position lists,
            in the shape produced by ``build_aux``.

    Returns:
        dict: Maps the index of each matching document to the position of
        the final query term in the last matching occurrence. Empty when a
        query term is missing from ``aux``, when the query has no terms
        after tokenization, or when no document contains the phrase.
    """
    documents = {}
    query_in_words = document_to_words(query)

    postings = []
    for word in query_in_words:
        if word not in aux:
            # A term absent from the index means no document can match.
            return documents
        postings.append(aux[word])
    if not postings:
        return documents

    first_term, later_terms = postings[0], postings[1:]
    last_offset = len(postings) - 1
    for doc_id in range(len(first_term)):
        # Sets make the "is the next term right here?" probes O(1).
        following = [set(term[doc_id]) for term in later_terms]
        for start in first_term[doc_id]:
            if all(start + offset in positions
                   for offset, positions in enumerate(following, start=1)):
                # Later occurrences overwrite earlier ones, so the last wins.
                documents[doc_id] = start + last_offset
    return documents

# Vector Space Model

In [7]:
import numpy as np
import pandas as pd
import math

In [8]:
def tf(aux):
    """Compute raw term counts per document from the positional index.

    Args:
        aux: Maps each word to a list of per-document position lists.

    Returns:
        dict: Maps each word to a list with one single-element list per
        document, holding how many times the word occurs in that document.
    """
    counts = {}
    for term, postings in aux.items():
        # Terms with no per-document entries are left out of the result.
        if len(postings) > 0:
            counts[term] = [[len(positions)] for positions in postings]
    return counts

In [9]:
def idf(aux, n):
    """Compute the inverse document frequency of every indexed word.

    Args:
        aux: Maps each word to a list of per-document position lists.
        n: Number of documents used as the numerator.

    Returns:
        dict: Maps each word to ``log10(n / df)``, where ``df`` is the number
        of documents whose position list for the word is non-empty.
    """
    weights = {}
    for term, postings in aux.items():
        containing = sum(1 for positions in postings if len(positions) > 0)
        weights[term] = np.log10(n / containing)
    return weights

In [10]:
def tf_idf(aux,idf):
    """Compute log-scaled TF-IDF weights for every word and document.

    Args:
        aux: Maps each word to a list of per-document position lists.
        idf: Maps each word to its inverse document frequency.

    Returns:
        dict: Maps each word to a list with one single-element list per
        document, holding ``log10(1 + count) * idf[word]``.
    """
    weighted = {}
    for term, postings in aux.items():
        # Terms with no per-document entries are left out of the result.
        if len(postings) > 0:
            weighted[term] = [
                [np.log10(1 + len(positions)) * idf[term]]
                for positions in postings
            ]
    return weighted

In [11]:
def normalized_tf_idf(data,tf_idf_df):
    """Scale each document's TF-IDF vector to unit Euclidean length.

    Args:
        data: The documents; only ``len(data)`` is used, and columns
            ``0 .. len(data) - 1`` of ``tf_idf_df`` are read.
        tf_idf_df: DataFrame indexed by word with one column per document,
            where every cell is a single-element list holding the weight.

    Returns:
        dict: Maps each word to a list with one single-element list per
        document, holding the weight divided by that document's vector
        norm. A document whose norm is zero gets ``0.0`` for every word
        instead of dividing by zero.
    """
    doc_count = len(data)
    terms = list(tf_idf_df.index)
    normalized = {}
    for doc_id in range(doc_count):
        # Pull the column's values out once. Iterating the values avoids
        # integer-key lookups on a word-indexed Series, which pandas treats
        # as a deprecated positional fallback.
        weights = [cell[0] for cell in tf_idf_df[doc_id]]
        norm = math.sqrt(sum(weight ** 2 for weight in weights))
        for term, weight in zip(terms, weights):
            if term not in normalized:
                normalized[term] = [[] for _ in range(doc_count)]
            normalized[term][doc_id].append(weight / norm if norm else 0.0)
    return normalized

In [12]:
def full_tf_idf(query):
    """Build the normalized TF-IDF table for the corpus plus ``query``.

    The documents returned by ``read_data()`` are loaded and the query is
    appended as one extra, final document before the whole collection is
    tokenized, indexed and weighted.

    Args:
        query: Raw query text.

    Returns:
        pandas.DataFrame: Indexed by word, built from the output of
        ``normalized_tf_idf``; the query is the last document.
    """
    corpus = read_data()
    corpus.append(query)

    positional_index = dict(sorted(build_aux(data_to_words(corpus)).items()))
    idf_weights = idf(positional_index, len(corpus))
    weighted = tf_idf(positional_index, idf_weights)
    weighted_df = pd.DataFrame.from_dict(weighted, orient='index')

    normalized = normalized_tf_idf(corpus, weighted_df)
    return pd.DataFrame.from_dict(normalized, orient='index')

In [13]:
def Similarity(norm_tf_idf):
    """Score every document against the last column of the table.

    Args:
        norm_tf_idf: DataFrame with integer column labels ``0 .. k - 1``
            whose cells are numeric sequences; column ``k - 1`` is the one
            every other column is compared with.

    Returns:
        list: For each column ``0 .. k - 2``, the sum over rows of the dot
        product between that column's cell and the last column's cell.
    """
    last = len(norm_tf_idf.columns) - 1
    return [
        sum([np.dot(doc_cell, query_cell)
             for doc_cell, query_cell in zip(norm_tf_idf[doc_id], norm_tf_idf[last])])
        for doc_id in range(last)
    ]

In [14]:
def main():
    """Read a query from standard input and evaluate it against the corpus.

    Two things are computed: the similarity scores returned by
    ``Similarity`` for the table built by ``full_tf_idf``, and the phrase
    matches returned by ``phrase_query`` over an index of the documents
    loaded by ``read_data()``.

    Returns:
        tuple: ``(scores, doc)`` where ``scores`` is a DataFrame with a
        single row labelled ``"query"`` holding one score per document, and
        ``doc`` is the dict returned by ``phrase_query``.
    """
    query = input()
    norm_tf_idf = full_tf_idf(query)
    query_similarity = {"query": Similarity(norm_tf_idf)}

    # The phrase search uses an index built from the documents alone,
    # without the query appended.
    data = read_data()
    data_in_words = data_to_words(data)
    sorted_aux = dict(sorted(build_aux(data_in_words).items()))
    doc = phrase_query(query, sorted_aux)

    return pd.DataFrame.from_dict(query_similarity, orient='index'), doc

In [15]:
# Run the interactive query, report the phrase matches (or "not found" when
# there are none), then display the similarity table as the cell output.
data, doc = main()
print(doc if doc else "not found")
data

Complex inheritance
{0: 167}


Unnamed: 0,0,1,2,3,4
query,0.194388,0.0,0.0,0.0,0.0


In [16]:
# Step-by-step walkthrough of the pipeline, printing every intermediate
# structure followed by a separator rule.
RULE = "\n----------------------------------------------------------------------------\n"


def _show(title, value):
    """Print ``title`` and ``value`` followed by the separator rule."""
    print(title, value, RULE)


data = read_data()
_show("Data:\n", data)

data_in_words = data_to_words(data)
_show("Data After Tokenization:\n", data_in_words)

aux = build_aux(data_in_words)
sorted_aux = dict(sorted(aux.items()))
_show("Auxiliary structure:\n", sorted_aux)

_show("Term Frequency For Each File:\n", tf(sorted_aux))

idf_dict = idf(sorted_aux, len(data))
_show("IDF Matrix :\n", idf_dict)

tf_idf_dict = tf_idf(sorted_aux, idf_dict)
tf_idf_df = pd.DataFrame.from_dict(tf_idf_dict, orient='index')
_show("TF-IDF Matrix :\n", tf_idf_df)

norm_tf_idf = normalized_tf_idf(data, tf_idf_df)
_show("Normalized TF-IDF Matrix :\n", norm_tf_idf)


Data:
 ['In object-oriented programming, inheritance is a way to form new classes (instances of which are called objects) using classes that have already been defined. The inheritance concept was invented in 1967 for Simula.\n\nThe new classes, known as derived classes, take over (or inherit) attributes and behavior of the pre-existing classes, which are referred to as base classes (or ancestor classes). It is intended to help reuse existing code with little or no modification.\n\nInheritance provides the support for representation by categorization in computer languages. Categorization is a powerful mechanism number of information processing, crucial to human learning by means of generalization (what is known about specific entities is applied to a wider group given a belongs relation can be established) and cognitive economy (less information needs to be stored about each specific entity, only its particularities).\n\nInheritance is also sometimes called generalization, because the i

Data After Tokenization:
 [['object', 'orient', 'program', 'inherit', 'way', 'form', 'new', 'class', 'instanc', 'call', 'object', 'use', 'class', 'alreadi', 'defin', 'inherit', 'concept', 'invent', '1967', 'simula', 'new', 'class', 'known', 'deriv', 'class', 'take', 'inherit', 'attribut', 'behavior', 'pre', 'exist', 'class', 'refer', 'base', 'class', 'ancestor', 'class', 'intend', 'help', 'reus', 'exist', 'code', 'littl', 'modif', 'inherit', 'provid', 'support', 'represent', 'categor', 'comput', 'languag', 'categor', 'power', 'mechan', 'number', 'inform', 'process', 'crucial', 'human', 'learn', 'mean', 'gener', 'known', 'specif', 'entiti', 'appli', 'wider', 'group', 'given', 'belong', 'relat', 'establish', 'cognit', 'economi', 'less', 'inform', 'need', 'store', 'specif', 'entiti', 'particular', 'inherit', 'also', 'sometim', 'call', 'gener', 'relationship', 'repres', 'hierarchi', 'class', 'object', 'instanc', 'fruit', 'gener', 'appl', 'orang', 'mango', 'mani', 'other', 'one', 'consid', 

Normalized TF-IDF Matrix :
 {'0': [[0.0], [0.06246375670387463], [0.0], [0.0], [0.0]], '1': [[0.0], [0.019825609446944312], [0.03304354907684547], [0.0], [0.03291086136359727]], '10': [[0.0], [0.06246375670387463], [0.0], [0.0], [0.0]], '1940': [[0.0], [0.0], [0.0], [0.0], [0.06542169786940584]], '1953': [[0.0], [0.0], [0.0], [0.0], [0.06542169786940584]], '1967': [[0.0848788120150187], [0.0], [0.0], [0.0], [0.0]], '2': [[0.0], [0.0], [0.03304354907684547], [0.0277323344922408], [0.03291086136359727]], '2005': [[0.0], [0.06246375670387463], [0.0], [0.0], [0.0]], '285': [[0.0], [0.06246375670387463], [0.0], [0.0], [0.0]], '3': [[0.0], [0.0], [0.059271689513391415], [0.0], [0.0372461062076927]], '336': [[0.0], [0.06246375670387463], [0.0], [0.0], [0.0]], '4': [[0.0], [0.0], [0.10410899174081839], [0.0], [0.0]], '6': [[0.0], [0.06246375670387463], [0.0], [0.0], [0.0]], '8': [[0.0], [0.06246375670387463], [0.0], [0.0], [0.0]], '999': [[0.0], [0.06246375670387463], [0.0], [0.0], [0.0]], 'ab