# Task 2: Information Retrieval

## Document Indexing

Importing all necessary libraries

In [1]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from bs4 import BeautifulSoup as bs

Reading contents of the XML file.

In [2]:
def read_file(file):

    content = []
    with open(file, "r") as file:
        # Read each line in the file, readlines() returns a list of lines
        content = file.readlines()

        # Combine the lines in the list into a string
        content = "".join(content)
        bs_content = bs(content, "xml")

    #returns "bs4.element.Tag" object
    result = bs_content.find("raw")
    #print(result.prettify())

    return result

Changing from a bs4.element.Tag object to a String.

In [3]:
def convert_Text(data):

    text = data.get_text()
    
    #print(text)
    return text

In [4]:
def tokenize(data):
    
    tokens = nltk.word_tokenize(data)
    
    #print(tokens)
    return tokens

Processing the contents by case-folding, stop-word removal and stemming.

In [5]:
def change_case(data):

    for i in range(len(data)):
        data[i] = data[i].casefold()

    #print(data)
    return data

In [6]:
def remove_stop_words(data):
    
    stop_words = set(stopwords.words('english'))

    filtered_list = []

    for word in data: 
        if word not in stop_words:
            filtered_list.append(word)

    #print(filtered_list)
    return filtered_list

In [7]:
def stemming(data):
    
    ps = PorterStemmer() 

    for i in range(len(data)):
        data[i] = ps.stem(data[i])

    #print(data)
    return data

Removing any symbols

In [8]:
def remove_symbols(data):
    
    temp = []

    symbols = "“!\"#$€%&()*'+-,./:;<=>?@[\]^_`{|}~\n"
    for t in data:
        if t not in symbols:
            temp.append(t)
            
    #print(temp)
    return temp

In [9]:
def preprocessing(filename):
    data = read_file(filename)
    data = convert_Text(data)
    data = tokenize(data)
    data = change_case(data)
    data = remove_stop_words(data)
    data = stemming(data)
    data = remove_symbols(data)
    
    print("Finished PreProcessing:")
    print(data)
    

In [10]:
preprocessing("docs-raw-texts/wes2015.d021.naf")

Finished PreProcessing:
['oskar', 'barnack', 'â€', 'father', '35mm', 'photographi', 'oskar', 'barnack', '1879-1936', 'novemb', '1', '1879', 'german', 'optic', 'engin', 'precis', 'mechan', 'industri', 'design', 'oskar', 'barnack', 'born', 'often', 'refer', 'father', '35', 'mm', 'photographi', 'invent', 'first', 'miniatur', 'commerci', 'success', 'camera', 'leica', 'age', 'digit', 'photographi', 'someth', 'like', '35', 'mm', 'film', 'might', 'seem', 'like', 'forgotten', 'relict', 'realli', 'revolut', 'brought', 'photographi', 'mass', 'â€', 'way', 'invent', 'theâ', 'film', 'roll', 'applic', 'famou', 'kodak', 'box', 'camera', 'almost', '50', 'year', 'earlier', 'actual', 'littl', 'known', 'privat', 'life', 'oskar', 'barnack', 'known', 'inform', 'life', 'concern', 'creation', 'born', 'lynow', 'brandenburg', 'hamlet', 'south', 'berlin', 'â', 'barnack', 'master', 'mechan', 'inventor', 'work', 'carl', 'zeiss', 'compani', 'receiv', 'offer', 'join', 'ernst', 'leitz', 'optisch', 'werk', 'wetzlar',