# Task 2: Information Retrieval

## Document Indexing

![image.png](attachment:a0b4ac6e-eff7-4755-bfbd-5e16c62e5b0c.png)

Importing all necessary libraries

In [1]:
import os
import math
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from bs4 import BeautifulSoup as bs

In [2]:
def read_file(fileName):

    content = []
    with open(fileName, 'r', encoding="utf8") as file:
        # Read each line in the file, readlines() returns a list of lines
        content = file.readlines()

        # Combine the lines in the list into a string
        content = "".join(content)
        bs_content = bs(content, "xml")

    #returns "bs4.element.Tag" object
    result = bs_content.find("raw")
    #print(result.prettify())

    return result

In [3]:
#Changing from a bs4.element.Tag object to a String.
def convert_Text(data):

    text = data.get_text()
    
    #print(text)
    return text

In [4]:
def tokenize(data):
    
    tokens = nltk.word_tokenize(data)
    
    #print(tokens)
    return tokens

In [5]:
def change_case(data):

    for i in range(len(data)):
        data[i] = data[i].casefold()

    #print(data)
    return data

In [6]:
def remove_stop_words(data):
    
    stop_words = set(stopwords.words('english'))

    filtered_list = []

    for word in data: 
        if word not in stop_words:
            filtered_list.append(word)

    #print(filtered_list)
    return filtered_list

In [7]:
def stemming(data):
    
    ps = PorterStemmer() 

    for i in range(len(data)):
        data[i] = ps.stem(data[i])

    #print(data)
    return data

In [8]:
def remove_symbols(data):
    
    temp = []

    symbols = "“”‘’!\"#$€%&()*'+-,./:;<=>?@[\]^_`{|}~\n"
    for t in data:
        if t not in symbols:
            temp.append(t)
            
    #print(temp)
    return temp

In [9]:
def preprocessing(filename):
    data = read_file(filename)
    data = convert_Text(data)
    data = tokenize(data)
    data = change_case(data)
    data = remove_stop_words(data)
    data = stemming(data)
    data = remove_symbols(data)
    
    #print("Finished PreProcessing:")
    #print(data)
    
    return data

Create a Dictionary for a file. Storing the Term Frequency in a document.

In [10]:
def create_Dictionary(data):
    docDict = {}

    for i in range(len(data)):
        tokens = data[i]
        try:
            docDict[data[i]].add(i)
        except:
            docDict[data[i]] = {i}

    for i in docDict:
        docDict[i] = len(docDict[i])

    return docDict

Get all the unique words in a document

In [11]:
def get_doc_words(docDict):
    doc_words = [x for x in docDict]
    
    return doc_words

Calculate the Term Frequency. Meaning the the number of times a word appears in a document divded by the total number of words in the document. 

In [12]:
def calculate_TF(docDict):
    tfDict = {}
    doc_words = get_doc_words(docDict)
    doc_words_count = len(doc_words)
    for word, count in docDict.items():
        tfDict[word] = count / float(doc_words_count)
    return tfDict

In [13]:
def get_TF_Matrix(tfDict):

    data = list(tfDict.items())
    tfMatrix = np.array(data)
    
    return tfMatrix

In [33]:
def print_Matrix(matrix):
    print('\n'.join([''.join(['{:18}'.format(item) for item in row]) 
      for row in matrix]))

Calculate the Inverse Data Frequency. Meaning the log of the number of documents divided by the number of documents that contain a specific word.

In [15]:
def calculate_IDF_TEST(documents):
    N = len(documents)

    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

In [16]:
def calculate_IDF(dataset):
    
    #total documents
    N = len(dataset)
    
    dfDict = calculate_DF(dataset)
    
    idfDict = {}
    
    for word, val in dfDict.items():
        idfDict[word] = math.log(N / float(val))

    return idfDict

Return a list of all the unique words from all of the files

In [17]:
def get_unique_words(dataset):
    
    unique_terms = []
    
    for f in dataset:
        dict = create_Dictionary(f)
        words = get_doc_words(dict)
        unique_terms.extend(words)
    
    unique_terms = set(unique_terms)
    return unique_terms

Calculating the Document Frequency. Number of Documents containing a particular term.

In [18]:
def calculate_DF(dataset):
    
    unique_terms = get_unique_words(dataset)
    
    DF = {}
    
    for term in unique_terms:
        found = 0    
        for i in range(0, len(dataset)):
            for t in dataset[i]:
                if term == t:
                    found += 1
                    break
        
        DF[term] = found

    return DF

Term frequency–inverse document frequency

In [19]:
def calculate_TFIDF(dataset):
    
    return

os.walk gives us the files in the directory, os.getcwd gives us the current directory and title and we are going to search in the current directory + stories folder as our data files are in the stories folder

In [20]:
filenames = []
for (root,dirs,files) in os.walk(str(os.getcwd())+'/docs-raw-texts/'):
    filenames.extend(files)
    path = root
    break

In [21]:
file_dataset = []
for f in filenames:
    temp = []
    #print(path + f)
    
    file = open(path+f, 'r', encoding="utf8")
    text = file.read().strip()
    file.close()

    bs_content = bs(text, "xml")

    #returns "bs4.element.Tag" object
    file_title = (bs_content.find("fileDesc")).get("title")
    #print(file_title)
    
    temp.append(str(path + f))
    temp.append(str(file_title))
    
    file_dataset.append(temp)
    
#file_dataset

In [22]:
#dataset conatainging all pre-processed files
dataset = []

for f in file_dataset:
    data = preprocessing(f[0])
    dataset.append(data)

Creating a matrix

In [None]:
unique_words = list(get_unique_words(dataset))

#TF_Matrix = [[0]*len(unique_words)]

#for i in range(0, len(unique_words)):
    #TF_Matrix[i].append(unique_words[i])

TF_Matrix = unique_words

for d in dataset:
    file_DF = create_Dictionary(d)
    file_TF = get_TF_Matrix(calculate_TF(file_DF))
            
    for i in range(0, len(file_TF)):
        for j in range(0, len(TF_Matrix)):
            if TF_Matrix[j] == file_TF[i][0]:
                TF_Matrix[j] = list(TF_Matrix[j])
                TF_Matrix[j].append(file_TF[i][1])

print(TF_Matrix)

In [70]:
#preprocessing of a file in the file_dataset
#file1 = preprocessing(file_dataset[3][0])

#creating a Dictionary of file1 with number of times a word is found in that document
#print(file1)
file1_DF = create_Dictionary(dataset[3])

file1_TF = get_TF_Matrix(calculate_TF(file1_DF))
print_Matrix(file1_TF)

#preprocessing of a file in the file_dataset
#file2 = preprocessing(dataset[0])

#creating a Dictionary of file1 with number of times a word is found in that document
#print(file2)
#file2_DF = create_Dictionary(file2)

#file2_TF = get_TF_Matrix(calculate_TF(file2_DF))
#print_Matrix(file2_TF)

#print(calculate_IDF(dataset))
#print(get_unique_words(dataset))
#print(dataset)
#calculate_IDF(dataset)
#print_Matrix(dataset)

walt              0.01639344262295082
disney            0.04371584699453552
steamboat         0.03825136612021858
willi             0.03278688524590164
rise              0.00546448087431694
mickey            0.07103825136612021
mous              0.04371584699453552
star              0.02185792349726776
walk              0.00546448087431694
fame              0.01092896174863388
imag              0.00546448087431694
flickr            0.00546448087431694
user              0.00546448087431694
freshwater2006    0.00546448087431694
novemb            0.01092896174863388
18                0.01092896174863388
1928              0.01092896174863388
anim              0.01092896174863388
movi              0.02185792349726776
releas            0.00546448087431694
present           0.00546448087431694
famou             0.00546448087431694
charact           0.01639344262295082
first             0.03278688524590164
time              0.01092896174863388
new               0.01639344262295082
york        