# Task 2: Information Retrieval

## Document Indexing

Importing all necessary libraries

In [1]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from bs4 import BeautifulSoup as bs

In [2]:
def read_file(fileName):

    content = []
    with open(fileName, encoding="utf8") as file:
        # Read each line in the file, readlines() returns a list of lines
        content = file.readlines()

        # Combine the lines in the list into a string
        content = "".join(content)
        bs_content = bs(content, "xml")

    #returns "bs4.element.Tag" object
    result = bs_content.find("raw")
    #print(result.prettify())

    return result

In [3]:
#Changing from a bs4.element.Tag object to a String.
def convert_Text(data):

    text = data.get_text()
    
    #print(text)
    return text

In [4]:
def tokenize(data):
    
    tokens = nltk.word_tokenize(data)
    
    #print(tokens)
    return tokens

In [5]:
def change_case(data):

    for i in range(len(data)):
        data[i] = data[i].casefold()

    #print(data)
    return data

In [6]:
def remove_stop_words(data):
    
    stop_words = set(stopwords.words('english'))

    filtered_list = []

    for word in data: 
        if word not in stop_words:
            filtered_list.append(word)

    #print(filtered_list)
    return filtered_list

In [7]:
def stemming(data):
    
    ps = PorterStemmer() 

    for i in range(len(data)):
        data[i] = ps.stem(data[i])

    #print(data)
    return data

In [8]:
def remove_symbols(data):
    
    temp = []

    symbols = "“!\"#$€%&()*'+-,./:;<=>?@[\]^_`{|}~\n"
    for t in data:
        if t not in symbols:
            temp.append(t)
            
    #print(temp)
    return temp

In [9]:
def preprocessing(filename):
    data = read_file(filename)
    data = convert_Text(data)
    data = tokenize(data)
    data = change_case(data)
    data = remove_stop_words(data)
    data = stemming(data)
    data = remove_symbols(data)
    
    print("Finished PreProcessing:")
    print(data)

In [10]:
preprocessing("docs-raw-texts/wes2015.d028.naf")
preprocessing("docs-raw-texts/wes2015.d082.naf")

Finished PreProcessing:
['william', 'higinbotham', 'tenni', 'two', 'tenni', 'two', 'play', 'oscilloscop', 'octob', '25', '1910', 'us-american', 'physicist', 'william', 'willi', '”', 'a.', 'higinbotham', 'born', 'member', 'manhattan', 'project', 'later', 'becam', 'leader', 'nonprolifer', 'movement', 'nuclear', 'weapon', 'moreov', 'also', 'known', 'develop', '‘', 'tenni', 'two', '‘', 'first', 'interact', 'analog', 'comput', 'game', 'one', 'first', 'electron', 'game', 'use', 'graphic', 'display', 'william', 'alfr', 'higinbotham', 'born', 'bridgeport', 'connecticut', 'grew', 'caledonia', 'new', 'york', 'father', 'minist', 'presbyterian', 'church', 'earn', 'undergradu', 'degre', 'william', 'colleg', '1932', 'continu', 'studi', 'cornel', 'univers', 'work', 'radar', 'system', 'mit', '1941', '1943', 'world', 'war', 'ii', 'work', 'lo', 'alamo', 'nation', 'laboratori', 'head', 'lab', '’', 'electron', 'group', 'later', 'year', 'war', 'team', 'develop', 'electron', 'first', 'nuclear', 'bomb', 'tea