## Import Statements

In [57]:
import nltk
from nltk import word_tokenize
import nltk.data
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

from collections import defaultdict
import math
import string
import re

import numpy as np

import threading

## Global Variables

In [58]:
# Human-readable key to the field markers used in the dataset file.
DESCRIPTION ='''
Dataset Description:
    .I -> ID
    .T -> Title
    .A -> Author
    .B -> Date
    .W -> Actual Text
'''

# Path of the dataset file that is handed to parse_file().
FILEPATH = 'Dataset/cran.all.1400'

# Document count: the numerator of the idf ratio in get_weight() and the
# number of rows of the term-document matrix.
N = 1400

### Function to parse content of Cranfield Dataset
Uses a regular expression to split each record into its ID, title, author, date and content fields.

In [59]:
def parse_file(filename):
    '''
    Splits a dataset file into one dictionary per document.

    A record is the sequence of markers .I, .T, .A, .B and .W, in that order.
    A marker is only recognised at the start of a line, so the same characters
    occurring inside a field (for example "fig.I" or "U.S.A.B") stay part of
    that field instead of being taken as a delimiter.

    Input: Filename
    Output: List of dictionaries
        [ {
            'ID':
            'Title':
            'Author':
            'Date':
            'Content':
        }, ...]
        Every value is the raw text found after its marker, including the
        surrounding whitespace and newlines.
    '''

    with open(filename) as f:
        contents = f.read()

    # re.DOTALL lets a field span several lines; re.MULTILINE makes '^' match
    # at every line start so the markers can be anchored there.
    # The last record has no following ".I", hence the end-of-text
    # alternative in the closing lookahead.
    pattern = re.compile(
        r"^\.I(?P<ID>.*?)"
        r"^\.T(?P<Title>.*?)"
        r"^\.A(?P<Author>.*?)"
        r"^\.B(?P<Date>.*?)"
        r"^\.W(?P<Content>.*?)"
        r"(?=^\.I|\Z)",
        re.DOTALL | re.MULTILINE
    )

    return [match.groupdict() for match in pattern.finditer(contents)]

## Preprocessing
- Tokenization
- Stop-Word Removal
- Stemming and Normalization

In [60]:
class PreProcessor:
    '''
    Text preprocessing helper offering tokenization, stop-word removal,
    normalization (lower-casing, punctuation stripping, stemming) and
    removal of empty tokens.
    '''

    def __init__(self, language):
        '''
        Input : Name of the language whose stop-word list is loaded
        '''
        self.stop_words = set(stopwords.words(language))
        self.punctuations = set(string.punctuation)
        self.stemmer = PorterStemmer()

    def get_tokens(self, data):
        '''
        Input : Raw text data
        Output : List of tokens
        '''
        return word_tokenize(data)

    def stopword_removal(self, data):
        '''
        Input : List of tokens
        Output : List of tokens without stopwords

        The comparison ignores case, so capitalised stop words such as
        "The" are removed as well. The NLTK stop-word lists are lower-case,
        and this step runs before normalize() lower-cases the tokens.
        '''
        return [word for word in data if word.lower() not in self.stop_words]

    def normalize(self, word):
        '''
        Input : Word
        Output : Lower-cased word with punctuation characters removed, stemmed

        A word made up only of punctuation becomes the empty string;
        validate() removes such leftovers.
        '''
        word = word.lower()
        word = ''.join(letter for letter in word if letter not in self.punctuations)
        return self.stemmer.stem(word)

    def validate(self, data):
        '''
        Input : List of words
        Output : List of words without null (empty) words
        '''
        return [word for word in data if word != '']

## Function to preprocess documents

In [61]:
def preprocess_documents(parsed_data, begin, end, preprocessor=None):
    '''
    Preprocesses the documents parsed_data[begin] .. parsed_data[end - 1]
    in place.

    Input : Parsed Data, start and end indices to preprocess data and
            PreProcessor Object (an English PreProcessor is created when
            none is given)
    Output : None; the dictionaries inside parsed_data are modified:
        'ID'      -> int built from the first token
        'Content' -> tokens after stop-word removal, normalization and
                     removal of empty tokens
        others    -> list of tokens, otherwise unchanged

    Fields that are no longer strings are skipped, so calling the function
    again on already preprocessed documents leaves them untouched.
    '''
    if preprocessor is None:
        # Created on demand instead of in the signature, so that defining
        # this function does not already load the stop-word list.
        preprocessor = PreProcessor('english')

    for i in range(begin, end):
        document = parsed_data[i]
        for key, raw in list(document.items()):
            if not isinstance(raw, str):
                # Already converted by an earlier run.
                continue

            tokens = preprocessor.get_tokens(raw)
            if key == 'ID':
                document[key] = int(tokens[0])
            elif key == 'Content':
                tokens = preprocessor.stopword_removal(tokens)
                tokens = [preprocessor.normalize(word) for word in tokens]
                document[key] = preprocessor.validate(tokens)
            else:
                document[key] = tokens

# Vector Space Model

## Building Index

In [62]:
def build_index(data):
    '''
    Builds Posting list index structure for retrieval.

    Input : Iterable of documents, each a dictionary with an 'ID' and with
            'Content' and 'Title' given as lists of terms
    Output : defaultdict mapping every term to
        {
            'Posting List': [{'ID': ..., 'Term Frequency': ...}, ...],
            'Document Frequency': number of documents containing the term
        }
        Postings appear in the order the documents were given; terms appear
        in order of first occurrence. Because the result is a defaultdict,
        looking up an unknown term inserts an empty entry for it.
    '''
    index = defaultdict(dict)

    # Calculating Term Frequency and Building Inverted Index
    for document in data:
        content = document['Content'] + document['Title']

        # One pass over the document instead of a list.count() per term.
        term_counts = {}
        for term in content:
            term_counts[term] = term_counts.get(term, 0) + 1

        for term, frequency in term_counts.items():
            index[term].setdefault('Posting List', []).append({
                'ID': document['ID'],
                'Term Frequency': frequency
            })

    # Calculating Document Frequency
    for entry in index.values():
        entry['Document Frequency'] = len(entry['Posting List'])

    return index

In [63]:
def get_weight(tf, df, n_docs=None):
    '''
    Calculates tf-idf weights

        weight = (1 + log10(tf)) * log10(n_docs / df)

    Input : Term frequency, document frequency and, optionally, the number
            of documents in the collection (the global N when omitted)
    Output : The weight, or 0 when tf or df is 0
    '''
    if tf == 0 or df == 0:
        return 0
    if n_docs is None:
        n_docs = N
    # math.log10 is used rather than math.log(x, 10), which is inexact for
    # powers of ten (math.log(1000, 10) is 2.9999999999999996).
    return (1 + math.log10(tf)) * math.log10(n_docs / df)

In [64]:
def build_term_doc_matrix(vocabulary, index):
    '''
    Builds term-document matrix to vectorize documents.

    Input : Vocabulary (sequence of terms, all present in index) and the
            index produced by build_index
    Output : Array with N rows and one column per vocabulary term. The
             entry at [ID - 1, col] is the tf-idf weight of vocabulary[col]
             in the document with that ID; every other entry is 0.
    '''
    # Columns follow the vocabulary, the same ordering the query vector
    # uses, so the matrix is sized from the vocabulary as well.
    term_doc_mat = np.zeros((N, len(vocabulary)))

    for col, term in enumerate(vocabulary):
        entry = index[term]
        document_frequency = entry['Document Frequency']
        for posting in entry['Posting List']:
            # Document IDs are used as 1-based row numbers.
            term_doc_mat[posting['ID'] - 1, col] = get_weight(
                tf=posting['Term Frequency'],
                df=document_frequency
            )

    return term_doc_mat

In [65]:
def get_query_vector(vocabulary, index, query, preprocessor=None):
    '''
    Preprocesses query and builds a query vector.

    Input : Vocabulary (sequence of terms), index produced by build_index,
            raw query text and PreProcessor Object (an English PreProcessor
            is created when none is given)
    Output : Array of shape (len(vocabulary), 1). Row r holds the tf-idf
             weight of vocabulary[r] in the query, 0 if the term does not
             occur in it.

    The query is tokenized and normalized; stop-word removal is not applied.
    '''
    if preprocessor is None:
        # Created on demand instead of in the signature, so that defining
        # this function does not already load the stop-word list.
        preprocessor = PreProcessor('english')

    # Preprocess Queries
    processed_query = preprocessor.get_tokens(query)
    processed_query = [preprocessor.normalize(term) for term in processed_query]

    # Count the query terms once instead of rescanning the query for every
    # vocabulary term.
    query_tf = {}
    for term in processed_query:
        query_tf[term] = query_tf.get(term, 0) + 1

    query_vector = np.zeros((len(vocabulary), 1))

    for row, term in enumerate(vocabulary):
        tf = query_tf.get(term, 0)
        if tf:
            query_vector[row] = get_weight(tf=tf, df=index[term]['Document Frequency'])

    return query_vector

## Putting it Together

In [66]:
# Parse the dataset file into one dictionary per document.
parsed_data = parse_file(FILEPATH)
# Preprocess every parsed document; the dictionaries in parsed_data are
# modified in place.
preprocess_documents(parsed_data, 0, len(parsed_data))

In [67]:
# Building Index
index = build_index(parsed_data)
# The vocabulary is the list of index terms. A term's position in it is its
# column in the term-document matrix and its row in a query vector.
vocabulary = list(index)

In [68]:
from sklearn import preprocessing
# Free-text queries that are run against the index.
queries = [
    'experimental results on hypersonic viscous interaction',
    'properties of impact pressure probes in free molecule flow',
    'manufacturing and maintainance of ideally sharp leading edges and noses is practically impossible',
    'why does the compressibility transformation fail to correlate the high speed data for helium and air',
    'can increasing the edge loading of a plate beyond the critical value for buckling change the buckling mode .',
]

In [69]:
# Weight matrix with one row per document, then L2-normalized.
result = build_term_doc_matrix(vocabulary, index)
result_normalized = preprocessing.normalize(result, norm='l2')

for query in queries:
    print("Query :", query)
    query_vector = get_query_vector(vocabulary, index, query)

    # A document's score is the product of its normalized row with the
    # query vector. Pairing scores with 1-based positions gives
    # (document number, score) tuples, sorted here with the best score first.
    result = sorted(
        enumerate(result_normalized @ query_vector, start=1),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Show the titles of the ten best-scoring documents.
    for doc_number, _score in result[:10]:
        title = ' '.join(parsed_data[doc_number - 1]['Title'])
        print("\t{0:5} - {1:50}".format(doc_number, title))

Query : experimental results on hypersonic viscous interaction
	  305 - hypersonic strong viscous interaction on a flat plate with surface mass transfer .
	  570 - on the boundary layer equations in hypersonic flow and their approximate solutions .
	  540 - use of local similarity concepts in hypersonic viscous interaction problems .
	  573 - viscous hypersonic similitude .                   
	 1253 - hypersonic viscous flow near the stagnation point in the presence of magnetic field .
	  308 - on the hypersonic viscous flow past a flat plate with suction or injection .
	  310 - hypersonic viscous flow over a flat plate .       
	  323 - vorticity interaction at an axisymmetric stagnation point in a viscous incompressible fluid .
	 1299 - hypersonic viscous shock layer .                  
	  192 - on the hypersonic viscous flow past slender bodies of revolution .
Query : properties of impact pressure probes in free molecule flow
	  183 - properties of impact pressure probes in free mol