# **Corpus Preprocessing File:**
### *The following file contains the preprocessing and exploratory analysis performed on the corpus at hand. Our goal is to get familiar with the corpus, perform preprocessing steps on each document, including case folding, stop-word removal and lemmatization, and prepare the documents for inverted index formation.*

In [43]:
# Libraries and modules used
import os
import nltk
import string
from nltk.tokenize import word_tokenize
import chardet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [48]:
# Fetch every NLTK resource the preprocessing pipeline relies on:
#   - punkt_tab: tokenizer models loaded by word_tokenize
#   - averaged_perceptron_tagger_eng: model behind nltk.pos_tag
#   - wordnet: dictionary used by WordNetLemmatizer
# Without punkt_tab, word_tokenize raises LookupError on a fresh environment.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\11th Generation\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\11th
[nltk_data]     Generation\AppData\Roaming\nltk_data...


True

In [38]:
# Directory holding the raw documents; its entries are listed with os.listdir.
CORPUS = 'Abstracts'
# Text file whose whitespace-separated entries are loaded as the stopword set.
STOPWORD_LIST = 'Stopword-List.txt'
# Directory that receives one preprocessed file per corpus document.
PREPRCESS_RSLT = 'Preprocessed_Corpus'

In [54]:
# Create the output directory up front; exist_ok=True keeps re-runs from failing.
os.makedirs(PREPRCESS_RSLT, exist_ok=True)

In [39]:
# Load the stopwords into a set so that membership checks during stop-word
# removal are efficient. Entries are split on any whitespace.
with open(STOPWORD_LIST, encoding='utf-8') as stopword_file:
    stopwords = set(stopword_file.read().split())

print(f'Stopwords list provided---> {stopwords}')


Stopwords list provided---> {'we', 'for', 'be', 'her', 'up', 'do', 'as', 'the', 'all', 'once', 'had', 'and', 'to', 'is', 'at', 'am', 'has', 'in', 'are', 'on', 'no', 'can', 'of', 'have', 'his', 'a'}


In [40]:
""" Checking if there are any documents present in the corpus 
    which might have a different encoding than utf-8,
    then normalizing all files to utf-8 encoding to avoid problems in the future
"""

def Detect_Encoding(doc_path):
    """Return chardet's encoding guess for the file at ``doc_path``.

    The whole file is read as raw bytes and handed to ``chardet.detect``;
    the value stored under the ``'encoding'`` key of its result is returned.
    """
    with open(doc_path, 'rb') as raw_file:
        raw_bytes = raw_file.read()

    return chardet.detect(raw_bytes)['encoding']

def encode_to_utf_8(file_path, original_encoding):
    """Rewrite the file at ``file_path`` in place as UTF-8.

    Args:
        file_path: Path of the file to convert.
        original_encoding: Name of the encoding the file is currently
            stored in; it is used to decode the existing bytes.

    Bytes that cannot be decoded with ``original_encoding`` are replaced
    with U+FFFD (``errors='replace'``) instead of raising, so the
    conversion is best-effort.
    """
    # newline='' disables newline translation, so only the character encoding
    # changes. With the default, CRLF and lone CR are turned into '\n' on read
    # and re-translated per platform on write, altering the file's line endings.
    with open(file_path, 'r', encoding=original_encoding,
              errors='replace', newline='') as f:
        data = f.read()

    with open(file_path, 'w', encoding='utf-8', newline='') as f:
        f.write(data)

# Re-encode, in place, every corpus document that is not already UTF-8/ASCII.
for document in os.listdir(CORPUS):

    doc_path = os.path.join(CORPUS, document)

    # os.listdir also returns sub-directories; only regular files can be
    # opened and re-encoded.
    if not os.path.isfile(doc_path):
        continue

    encoding = Detect_Encoding(doc_path)

    # chardet reports None when it cannot guess an encoding (e.g. an empty
    # file). Passing None on would make open() fall back to the locale default
    # and rewrite the file with replacement characters, so leave it untouched.
    if encoding is None:
        print(f'Skipping {document}: encoding could not be detected')
        continue

    # Compare case-insensitively so the check does not depend on how the
    # detector capitalises encoding names.
    if encoding.lower() not in ('utf-8', 'ascii'):

        print(f'Originally --> {document} : {encoding}')

        encode_to_utf_8(doc_path, encoding)



Originally --> 112.txt : Windows-1252
Originally --> 116.txt : ISO-8859-1
Originally --> 121.txt : ISO-8859-1
Originally --> 165.txt : ISO-8859-1
Originally --> 229.txt : ISO-8859-1
Originally --> 256.txt : ISO-8859-1
Originally --> 275.txt : ISO-8859-1
Originally --> 287.txt : ISO-8859-1
Originally --> 307.txt : ISO-8859-1
Originally --> 319.txt : Windows-1252
Originally --> 336.txt : Windows-1252
Originally --> 365.txt : Windows-1252
Originally --> 371.txt : Windows-1252
Originally --> 379.txt : ISO-8859-1
Originally --> 392.txt : ISO-8859-1
Originally --> 420.txt : Windows-1252
Originally --> 423.txt : ISO-8859-1
Originally --> 424.txt : ISO-8859-1
Originally --> 434.txt : ISO-8859-1
Originally --> 81.txt : ISO-8859-1
Originally --> 85.txt : ISO-8859-1


In [51]:
"""
Preprocessing Text including (in the order applied):
    1) Case Folding
    2) Punctuation Removal
    3) Tokenization
    4) Stop Word Removal (non-alphabetic tokens are dropped as well)
    5) POS Tagging and Lemmatization
"""

# Single WordNetLemmatizer instance, created once and used by text_preprocesser.
lemmatizer= WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Map a POS tag string to the matching WordNet POS constant.

    Tags starting with 'J' map to adjective, 'V' to verb and 'R' to adverb.
    Every other tag, including those starting with 'N', maps to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Nouns and any unrecognised tag share the noun default.
    return wordnet.NOUN

def text_preprocesser(text):
    """Turn raw document text into a space-separated string of lemmas.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, keep only
    purely alphabetic tokens that are not in ``stopwords``, POS-tag the
    survivors and lemmatize each one with the WordNet POS derived from
    its tag.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(strip_punctuation)

    # Drop anything that is a stopword or contains a non-letter character.
    kept_tokens = [
        token
        for token in word_tokenize(cleaned)
        if not (not token.isalpha() or token in stopwords)
    ]

    tagged_tokens = nltk.pos_tag(kept_tokens)

    return " ".join(
        lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    )

In [53]:
# Smoke test for text_preprocesser: the sample mixes upper case, punctuation,
# stopwords and inflected verb forms so each preprocessing step is exercised.

test= "Hello, this is testing OUT the text_preprocesser steps function for preprocessing text in our corpus ran runner run stopped recommended.."

rslt= text_preprocesser(test)

print(rslt)

hello this test out textpreprocesser step function preprocessing text our corpus run runner run stop recommend


In [55]:
# Preprocess every corpus document and store the result under the same file
# name in the output directory. Failures are collected so the final message
# reflects what actually happened instead of always claiming success.
failed_documents = []

for document in os.listdir(CORPUS):
    path = os.path.join(CORPUS, document)
    try:

        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()

        processed_text = text_preprocesser(text)

        new_path = os.path.join(PREPRCESS_RSLT, document)
        with open(new_path, 'w', encoding="utf-8") as f:
            f.write(processed_text)

    # Deliberately broad: this is the batch boundary, and one unreadable
    # document should not stop the rest of the corpus from being processed.
    except Exception as e:
        print(f"Skipping {document} due to {e}")
        failed_documents.append(document)

if failed_documents:
    print(f"\n {len(failed_documents)} file(s) could not be processed: {failed_documents}")
else:
    print("\n All files processed successfully!")


 All files processed successfully!


# We have successfully preprocessed the corpus and chose to save the preprocessed version of each document in the **Preprocessed_Corpus** directory.