<a href="https://colab.research.google.com/github/SreyaSalil/IR-Assignments/blob/main/IR_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **IR Assignment 1**

*Pre-processing of a Text Document: Accent removal, stop word removal and stemming. Step by step pre-processing including necessary statistics at each step.*

## Import packages

In [8]:
import string
# For accessing files
import os,sys
# To strip HTML tags
from bs4 import BeautifulSoup
#To remove Numbers in text using RE
import re
#accent removal
import unicodedata
#stop word removal and word tokenization
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
#Stemming
from nltk.stem import LancasterStemmer, WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Document Preprocessing Functions

### Remove closed and unclosed HTML tags in document

In [9]:
def strip_html(text):
    """Return the text content of `text` as extracted by BeautifulSoup.

    The input is parsed with the "html.parser" backend and the result of
    `get_text()` on the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

### Accent Removal

In [10]:
def remove_accent(words):
    """Reduce every token in `words` to plain ASCII.

    Each token is NFKD-normalised, which splits accented letters
    (é, â, î, ñ, ô, ...) into a base letter plus combining marks, and is then
    encoded to ASCII with errors ignored, so the marks and any other
    non-ASCII characters are dropped. A token made up only of such
    characters becomes an empty string and is still kept in the result.

    Returns a new list with one entry per input token.
    """
    def _to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    return [_to_ascii(token) for token in words]

### Lexical Analysis Functions

In [11]:
def to_lowercase(words):
    """Return a new list holding the lowercase form of every token in `words`."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Delete every `string.punctuation` character from each token in `words`.

    Tokens that are empty once the punctuation is removed are left out of
    the returned list.
    """
    strip_table = str.maketrans("", "", string.punctuation)
    stripped_tokens = (token.translate(strip_table) for token in words)
    return [token for token in stripped_tokens if token != '']

def remove_numbers(words):
    """Delete every run of digits from each token in `words`.

    Digits are removed outright, not converted to a textual representation.
    Tokens that are empty once their digits are removed are left out of the
    returned list.
    """
    digitless_tokens = (re.sub(r'\d+', '', token) for token in words)
    return [token for token in digitless_tokens if token != '']

def lexical_analysis(words):
    """Run the lexical clean-up steps on a list of tokens.

    The steps are applied in this order: remove_accent, to_lowercase,
    remove_numbers. Returns the resulting list of tokens.
    """
    ascii_tokens = remove_accent(words)
    lowercase_tokens = to_lowercase(ascii_tokens)
    return remove_numbers(lowercase_tokens)

### Stop word elimination

In [12]:
def remove_stopwords(words):
    """Return the tokens of `words` that are not in NLTK's English stop-word list.

    The comparison is an exact string match against the list returned by
    stopwords.words("english"); the order of the kept tokens is preserved.
    """
    english_stopwords = set(stopwords.words("english"))
    return [token for token in words if token not in english_stopwords]

### Stemming

In [13]:
def stem_words(words):
    """Return a new list with the Lancaster stem of every token in `words`."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

## Step-by-step document preprocessing 

In [14]:
# Input and output directories, resolved once against the current working directory.
docs_dir = os.path.join(os.getcwd(), "Docs")
processed_dir = os.path.join(os.getcwd(), "processed_docs")
# Create the output directory if it is missing so the first write cannot fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(processed_dir, exist_ok=True)

# Maps every punctuation character to a space; built once because it does not
# depend on the document being processed.
punctuation_to_space = str.maketrans(string.punctuation, ' '*len(string.punctuation))


def _write_and_report(words, out_path, label):
    """Overwrite `out_path` with the space-joined `words` and print its size.

    The printed line is `label`, the size of the written file in bytes, and
    the word "bytes".
    """
    with open(out_path, "w", encoding="utf8") as wf:
        wf.write(" ".join(words))
    print(label, os.stat(out_path).st_size, "bytes")


# Stemmed, stop-word-free tokens of all documents (duplicates included).
vocabulary = []
# sorted() makes the processing order deterministic; os.listdir returns
# entries in arbitrary order.
for filename in sorted(os.listdir(docs_dir)):
    source_path = os.path.join(docs_dir, filename)
    # Skip anything that is not a regular file (e.g. a .ipynb_checkpoints
    # directory), which open() could not read.
    if not os.path.isfile(source_path):
        continue

    print("size of", filename, ":", os.stat(source_path).st_size, "bytes")

    # Read the whole document and close it before the processing steps run.
    with open(source_path, encoding="utf8", errors='ignore') as rf:
        sample = rf.read()

    # Each stage below overwrites the same output file, so after the loop body
    # finishes it holds the result of the final stage.
    processed_path = os.path.join(processed_dir, filename)

    sample = strip_html(sample)
    # removal of punctuations (each punctuation character becomes a space)
    sample = sample.translate(punctuation_to_space)
    words = word_tokenize(sample)

    words = lexical_analysis(words)
    _write_and_report(words, processed_path, "size after lexical analysis:")

    words = remove_stopwords(words)
    _write_and_report(words, processed_path, "size after removing stopwords:")

    words = stem_words(words)
    _write_and_report(words, processed_path, "size after stemming:")

    # Stemming can turn a token into a stop word, so filter a second time.
    words = remove_stopwords(words)
    _write_and_report(words, processed_path, "size after removing stop words once again after stemming:")

    # extend() appends in place instead of copying the whole list per document.
    vocabulary.extend(words)
    print("\n\n")

size of Text1.txt : 29387 bytes
size after lexical analysis: 18891 bytes
size after removing stopwords: 14495 bytes
size after stemming: 11507 bytes
size after removing stop words once again after stemming: 11437 bytes



size of Text3.txt : 27201 bytes
size after lexical analysis: 25915 bytes
size after removing stopwords: 17099 bytes
size after stemming: 12698 bytes
size after removing stop words once again after stemming: 12605 bytes



size of Text2.txt : 11588 bytes
size after lexical analysis: 11006 bytes
size after removing stopwords: 7041 bytes
size after stemming: 5494 bytes
size after removing stop words once again after stemming: 5423 bytes



size of LargeText.txt : 674425 bytes
size after lexical analysis: 625272 bytes
size after removing stopwords: 420128 bytes
size after stemming: 321472 bytes
size after removing stop words once again after stemming: 318439 bytes





## Add vocabulary to text file for future use

In [15]:
# Collapse the collected tokens to a sorted list of unique entries and store
# them, space-separated, in vocabulary.txt in the current working directory.
vocabulary = sorted(set(vocabulary))
vocabulary_path = os.path.join(os.getcwd(),"vocabulary.txt")
with open(vocabulary_path,"w") as vocabulary_file:
    vocabulary_file.write(" ".join(vocabulary))