## Import Statements

In [294]:
import nltk
from nltk import word_tokenize
import nltk.data
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

import string

import re

import threading

## Global Variables

In [295]:
# Legend for the field markers used in the collection file (kept for reference).
DESCRIPTION ='''
Dataset Description:
    .I -> ID
    .T -> Title
    .A -> Author
    .B -> Date
    .W -> Actual Text
'''

# Path of the raw collection file that parse_file() reads (a relative path).
FILEPATH = 'Dataset/cran.all.1400'

### Function to parse content of Cranfield Dataset
Uses Regex for parsing

In [296]:
def parse_file(filename):
    '''
    Parse a Cranfield-format collection file into one dictionary per document.

    A record consists of the markers .I, .T, .A, .B and .W, in that order.
    A marker is only recognised at the start of a line, so text such as
    "U.S.A.F." or "fig.I" inside a field does not cut that field short.

    Input: Filename
    Output: List of dictionaries
        [ {
            'ID':
            'Title':
            'Author':
            'Date':
            'Content':
        }, ...]
        Each value is the raw text that follows its marker; surrounding
        whitespace and newlines are kept.
    '''

    with open(filename) as f:
        contents = f.read()

    # MULTILINE makes ^ match at every line start; DOTALL lets a field span
    # several lines. The trailing lookahead ends a record at the next ".I"
    # line or at the end of the text without consuming the next record's marker.
    pattern = re.compile(
        r"""^\.I(?P<ID>.*?)^\.T(?P<Title>.*?)^\.A(?P<Author>.*?)^\.B(?P<Date>.*?)^\.W(?P<Content>.*?)(?=^\.I|\Z)""",
        re.DOTALL | re.MULTILINE
    )

    return [match.groupdict() for match in pattern.finditer(contents)]

In [297]:
# Parse the collection once; `parsed` is the list of per-document dictionaries.
parsed = parse_file(FILEPATH)

In [298]:
class PreProcessor:
    '''
    Text preprocessing helpers: tokenization, stop-word removal and word
    normalization (lowercasing, punctuation stripping, Porter stemming).
    '''

    def __init__(self, language):
        '''
        Input: language name passed to nltk's stopwords.words(), e.g. 'english'
        '''
        self.stop_words = set(stopwords.words(language))
        self.punctuations = set(string.punctuation)
        # Built once and reused instead of constructing a stemmer per word.
        # NOTE(review): this assumes PorterStemmer.stem() keeps no per-call
        # state on the stemmer, since one PreProcessor may be shared by
        # several threads - confirm for the installed NLTK version.
        self.stemmer = PorterStemmer()

    def get_tokens(self, data):
        '''
        Input: text (string)
        Output: list of tokens produced by nltk's word_tokenize
        '''
        return word_tokenize(data)

    def stopword_removal(self, data):
        '''
        Input: list of tokens
        Output: list of the tokens that are not in the stop-word set
                (the comparison is case-sensitive)
        '''
        return [word for word in data if word not in self.stop_words]

    def stemming(self, word):
        '''
        Input: a single word
        Output: the stem returned by the Porter stemmer
        '''
        return self.stemmer.stem(word)

    def normalize(self, word):
        '''
        Input: a single token
        Output: the token lowercased, with every punctuation character
                removed, then stemmed; may be the empty string
        '''
        word = word.lower()

        word = ''.join(letter for letter in word if letter not in self.punctuations)

        # Must go through self: `stemming` is a method, not a module-level function.
        return self.stemming(word)

    def validate(self, data):
        '''
        Input: list of tokens
        Output: list of the tokens that are not the empty string
        '''
        return [word for word in data if word != '']
    

    

In [299]:
# Preprocessor instance built with the 'english' stop-word list.
preprocess = PreProcessor('english')

In [300]:
class myThread(threading.Thread):
    '''
    Worker thread that preprocesses one contiguous slice of documents.

    Input:
        threadID: identifier stored on the thread object
        name: thread name
        begin: index of the first document handed to preprocess_documents
        end: index of the last document handed to preprocess_documents
    '''

    def __init__(self, threadID, name, begin, end):
        super().__init__()
        self.threadID, self.name = threadID, name
        self.begin, self.end = begin, end

    def run(self):
        '''Thread body: preprocess the slice given at construction time.'''
        first, last = self.begin, self.end
        preprocess_documents(first, last)


In [301]:
def preprocess_documents(begin, end):
    '''
    Tokenize the documents parsed[begin] .. parsed[end] (inclusive) in place.

    Every string field is replaced by its list of tokens. The 'Content'
    field additionally has stop words removed, each token normalized, and
    tokens that became empty dropped.

    Reads the module-level `parsed` list and `preprocess` object.

    Input:
        begin: index of the first document to process
        end: index of the last document to process (inclusive)
    Output: None (the dictionaries in `parsed` are modified in place)
    '''
    # range() is simply empty when begin > end, whereas counting upwards until
    # begin == end + 1 would run past the end of `parsed` for such arguments.
    for index in range(begin, end + 1):
        document = parsed[index]
        for key, data in document.items():
            # A field that is no longer a string was tokenized by an earlier
            # call; skipping it keeps a re-run from handing a list to the tokenizer.
            if not isinstance(data, str):
                continue
            data = preprocess.get_tokens(data)
            if key == 'Content':
                data = preprocess.stopword_removal(data)
                data = [preprocess.normalize(word) for word in data]
                data = preprocess.validate(data)
            document[key] = data

In [302]:
# Upper bound on the number of worker threads. For a 1400-document collection
# this gives 100 threads with 14 documents each.
NUM_THREADS = 100

# Slice size derived from the real number of parsed documents (ceiling
# division, at least 1), so no index falls outside `parsed` and no document
# is left unprocessed when the collection size is not exactly 1400.
docs_per_thread = max(1, (len(parsed) + NUM_THREADS - 1) // NUM_THREADS)

threads = [
    myThread(
        i,
        "Thread -" + str(i),
        first,
        min(first + docs_per_thread, len(parsed)) - 1,  # inclusive last index
    )
    for i, first in enumerate(range(0, len(parsed), docs_per_thread))
]

for thread in threads:
    thread.start()

# Wait for every worker before later cells read `parsed`.
for thread in threads:
    thread.join()

In [303]:
# Spot-check: show the last document in `parsed` after preprocessing.
print(parsed[-1])

{'ID': ['1400'], 'Title': ['the', 'buckling', 'shear', 'stress', 'of', 'simply-supported', 'infinitely', 'long', 'plates', 'with', 'transverse', 'stiffeners', '.'], 'Author': ['kleeman', ',', 'p.w', '.'], 'Date': ['arc', 'r', '+', 'm.2971', ',', '1953', '.'], 'Content': ['buckl', 'shear', 'stress', 'simplysupport', 'infinit', 'long', 'plate', 'transvers', 'stiffen', 'report', 'extens', 'previou', 'theoret', 'investig', 'elast', 'buckl', 'shear', 'flat', 'plate', 'reinforc', 'transvers', 'stiffen', 'plate', 'treat', 'infinit', 'long', 'simplysupport', 'along', 'long', 'side', 'stiffen', 'space', 'regular', 'interv', 'divid', 'plate', 'number', 'panel', 'uniform', 'size', 'effect', 'ob', 'bend', 'torsion', 'stiff', 'stiffen', 'upon', 'buckl', 'shear', 'stress', 'calcul', 'complet', 'rang', 'stiff', 'panel', 'ratio', 'width', 'stiffen', 'space', 'graphic', 'form']}
