In [84]:
class Vector_space_model:
    """TF-IDF vector space model over tokenised, stemmed documents.

    The methods are stateless building blocks that are meant to be chained:
    ``start`` -> ``generate_tokens`` -> ``remove_stop_words`` -> ``stemming``
    -> ``frequency_count`` -> ``vector_of_words`` ->
    ``weighted_term_doc_matrix`` -> ``query_vector`` ->
    ``query_doc_similarity``.
    """

    def start(self, cells):
        """Collect the lower-cased text of the first cell of every row.

        Args:
            cells: Worksheet-like object exposing ``iter_rows()``; every row
                is an iterable of cells exposing ``internal_value``.

        Returns:
            list[str]: One lower-cased string per row that has at least one
            cell, in row order.  A blank first cell yields ``""``.
        """
        english_document = []
        for row in cells.iter_rows():
            for cell in row:
                value = cell.internal_value
                # A blank cell holds None and a numeric cell holds a number;
                # coerce both to text so document numbering stays aligned
                # with the sheet rows instead of raising AttributeError.
                text = "" if value is None else str(value)
                english_document.append(text.lower())
                break  # only the first cell of the row is used
        return english_document

    def generate_tokens(self, english_document):
        """Tokenise every document with ``nltk.word_tokenize``.

        Args:
            english_document: Iterable of document strings.

        Returns:
            list[list[str]]: Tokens of each lower-cased document.
        """
        return [nltk.word_tokenize(text.lower()) for text in english_document]

    def remove_stop_words(self, tokens):
        """Drop English stop words and every token that is not purely alphabetic.

        Args:
            tokens: List of token lists, one per document.

        Returns:
            list[list[str]]: Filtered token lists, same outer length and order.
        """
        # A set gives constant-time membership tests for every token.
        eng_stop_words = set(stopwords.words('english'))
        return [
            [tok for tok in doc if tok not in eng_stop_words and tok.isalpha()]
            for doc in tokens
        ]

    def stemming(self, pruned_tokens):
        """Stem every token with NLTK's ``PorterStemmer``.

        Args:
            pruned_tokens: List of token lists, one per document.

        Returns:
            list[list[str]]: Stemmed token lists, same shape as the input.
        """
        ps = PorterStemmer()
        return [[ps.stem(tok) for tok in doc] for doc in pruned_tokens]

    def frequency_count(self, stemmed_words):
        """Count how often each term occurs across all documents.

        Args:
            stemmed_words: List of term lists, one per document.

        Returns:
            dict[str, int]: Term -> total count, with keys inserted in
            sorted order (see ``sortedDictkeys``).
        """
        counts = {}
        for doc in stemmed_words:
            for term in doc:
                counts[term] = counts.get(term, 0) + 1
        return self.sortedDictkeys(counts)

    def vector_of_words(self, sorted_dict):
        """Assign each term a row index following the mapping's iteration order.

        Args:
            sorted_dict: Mapping whose keys are the vocabulary terms.

        Returns:
            dict[str, int]: Term -> 0-based index.
        """
        return {term: position for position, term in enumerate(sorted_dict)}

    def sortedDictkeys(self, adict):
        """Return a new dict with the items of ``adict`` inserted in sorted order."""
        return dict(sorted(adict.items()))

    def inverted_index_fun(self, documents):
        """Build an inverted index: term -> sorted list of document indices.

        Args:
            documents: List of term lists, one per document.

        Returns:
            collections.defaultdict: Term -> ascending list of the 0-based
            indices of the documents containing it.  The default factory is
            still ``set``, so looking up an unknown term inserts an empty set.
        """
        inverted_index = defaultdict(set)
        for doc_id, doc in enumerate(documents):
            for term in doc:
                inverted_index[term].add(doc_id)

        # Replace every posting set by a sorted list (keys are unchanged, so
        # snapshotting the items keeps the iteration well-defined).
        for term, postings in list(inverted_index.items()):
            inverted_index[term] = sorted(postings)

        return inverted_index

    @staticmethod
    def _normalize_columns(matrix):
        """Scale every column to unit L2 norm; all-zero columns stay zero."""
        norms = np.sqrt(np.sum(np.square(matrix), axis=0))
        # Dividing an all-zero column by 0 would yield NaN; divide by 1 instead.
        norms[norms == 0] = 1.0
        return matrix / norms

    def weighted_term_doc_matrix(self, stemmed_docs, word_vector):
        """Build the column-normalised TF-IDF matrix of the documents.

        Args:
            stemmed_docs: List of term lists, one per document.
            word_vector: Mapping term -> row index (see ``vector_of_words``).

        Returns:
            tuple: ``(tf_idf, idf)`` where ``tf_idf`` has shape
            ``(len(word_vector), len(stemmed_docs))`` with every non-zero
            column scaled to unit length, and ``idf`` has shape
            ``(len(word_vector),)`` holding ``log(N / document_frequency)``.
            Terms that occur in no document get an idf of 0.

        Raises:
            KeyError: If a document contains a term missing from
                ``word_vector``.
        """
        n_docs = len(stemmed_docs)

        # Raw term counts: one row per term, one column per document.
        tf_matrix = np.zeros((len(word_vector), n_docs), dtype=int)
        for doc_id, doc in enumerate(stemmed_docs):
            for term in doc:
                tf_matrix[word_vector[term], doc_id] += 1

        # idf = log(N / df); only computed where df > 0 to avoid log(N / 0).
        doc_freq = np.count_nonzero(tf_matrix, axis=1)
        idf = np.zeros(len(word_vector), dtype=float)
        seen = doc_freq > 0
        idf[seen] = np.log(n_docs / doc_freq[seen])

        weighted = (tf_matrix.T * idf).T
        return self._normalize_columns(weighted), idf

    def query_vector(self, stemmed_query, word_vector, idf):
        """Build the column-normalised TF-IDF matrix of the queries.

        Args:
            stemmed_query: List of term lists, one per query.
            word_vector: Mapping term -> row index.
            idf: Array of shape ``(len(word_vector),)`` as returned by
                ``weighted_term_doc_matrix``.

        Returns:
            numpy.ndarray: Shape ``(len(word_vector), len(stemmed_query))``.
            Terms missing from ``word_vector`` are ignored; a query without
            any weighted term yields an all-zero column.
        """
        query_matrix = np.zeros((len(word_vector), len(stemmed_query)), dtype=int)
        for query_id, query in enumerate(stemmed_query):
            for term in query:
                index = word_vector.get(term)
                if index is not None:
                    query_matrix[index, query_id] += 1

        weighted = (query_matrix.T * idf).T
        return self._normalize_columns(weighted)

    def query_doc_similarity(self, weighted_matrix, query_matrix):
        """Return the matrix product ``weighted_matrix.T @ query_matrix``.

        With unit-length columns on both sides, entry ``[d, q]`` is the cosine
        similarity between document ``d`` and query ``q``.
        """
        return np.dot(weighted_matrix.T, query_matrix)

In [81]:
import openpyxl as px
import codecs
import string
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import defaultdict
import sys
import pandas as pd

[nltk_data] Downloading package punkt to /home/dell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/dell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [82]:
def main(similarity_threshold=0.65):
    """Index the English documents and answer the queries in ``query_eng.txt``.

    Documents are read from the first column of sheet ``Sheet`` of
    ``Dand_Prakriya.xlsx``.  The following files are written to the working
    directory:

    * ``inverted_index_eng.txt`` - ``str()`` of the inverted index,
    * ``OUT_eng_freq.txt`` - ``str()`` of the term -> frequency dict,
    * ``Excel_eng.xlsx`` - normalised TF-IDF matrix (one row per term, one
      ``doc N`` column per document),
    * ``OUT_ENGLISH.txt`` - for every query, the 1-based numbers of the
      documents whose cosine similarity reaches ``similarity_threshold``.

    Args:
        similarity_threshold: Minimum cosine similarity for a document to be
            reported as a match for a query.
    """
    workbook = px.load_workbook('Dand_Prakriya.xlsx')
    # Workbook.get_sheet_by_name() is deprecated; index the workbook instead.
    sheet = workbook['Sheet']

    ret = Vector_space_model()

    #---------------- Pre-processing the documents by removing stopwords and stemming ----------------#

    english_document = ret.start(sheet)
    tokens = ret.generate_tokens(english_document)
    pruned_tokens = ret.remove_stop_words(tokens)
    stemmed_words = ret.stemming(pruned_tokens)

    #---------------- Creating inverted index file ---------------------------------------------------#

    inverted_index = ret.inverted_index_fun(stemmed_words)
    with open("inverted_index_eng.txt", "w") as inv_file:
        inv_file.write(str(inverted_index))

    #---------------- Creating bag of words and their frequency file ---------------------------------#

    frequency = ret.frequency_count(stemmed_words)
    with open("OUT_eng_freq.txt", "w") as freq_file:
        freq_file.write(str(frequency))
    word_vector = ret.vector_of_words(frequency)

    #---------------- Creating the weighted, normalised TF-IDF matrix --------------------------------#

    normalized_matrix, idf = ret.weighted_term_doc_matrix(stemmed_words, word_vector)

    #---------------- Storing the matrix in an Excel sheet -------------------------------------------#

    doc_columns = ['doc ' + str(i + 1) for i in range(len(tokens))]
    df = pd.DataFrame(normalized_matrix, columns=doc_columns)
    # Pass an explicit list ordered by row index: a dict value would be
    # treated as a Series and aligned on the frame's integer index.
    df.insert(0, "words", sorted(word_vector, key=word_vector.get))
    df.to_excel('Excel_eng.xlsx', index=False)

    #---------------- Processing the queries and computing cosine similarities -----------------------#

    with open("query_eng.txt", encoding='utf-8') as query_file:
        queries = [line.strip() for line in query_file]
    query_tokens = ret.generate_tokens(queries)
    query_pruned_tokens = ret.remove_stop_words(query_tokens)
    query_stemmed_words = ret.stemming(query_pruned_tokens)
    normalized_query = ret.query_vector(query_stemmed_words, word_vector, idf)

    # cosine_val[d][q] is the similarity between document d and query q.
    cosine_val = ret.query_doc_similarity(normalized_matrix, normalized_query)

    # Write straight to the file instead of swapping sys.stdout, so an
    # exception cannot leave stdout redirected.  The queries were decoded as
    # UTF-8, so they are echoed back with the same encoding.
    with open('OUT_ENGLISH.txt', 'w', encoding='utf-8') as out_file:
        for query_id, query in enumerate(queries):
            matches = [
                doc_id + 1
                for doc_id in range(len(cosine_val))
                if cosine_val[doc_id][query_id] >= similarity_threshold
            ]
            print(query, ':', file=out_file)
            print(matches, file=out_file)

In [83]:
# Run the whole pipeline only when this cell/file is the entry point.
if __name__ == "__main__":
    main()

  This is separate from the ipykernel package so we can avoid doing imports until
