In [1]:
import pandas as pd
import nltk
import itertools
import math
import operator
from collections import Counter
from statistics import mean
from nltk.corpus import stopwords
from nltk.stem import *
import os,sys
import re, string, unicodedata
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize

In [2]:
def strip_html(text):
    """Parse ``text`` with BeautifulSoup's "html.parser" and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in ``words``.

    Each token is NFKD-normalized first, so a character that decomposes into
    an ASCII base letter plus combining marks keeps its base letter. Whatever
    still cannot be encoded as ASCII is dropped. The returned list has one
    entry per input token (an entry may be the empty string).
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list holding the lowercase form of each token in ``words``."""
    return [token.lower() for token in words]

def remove_numbers(words):
    """Delete every run of digits from each token and drop tokens left empty.

    Digits are removed, not spelled out: "abc123" becomes "abc", and a token
    that consists only of digits (or is already empty) is omitted from the
    result.
    """
    without_digits = (re.sub(r'\d+','',token) for token in words)
    return [token for token in without_digits if token != '']

def remove_stopwords(words):
    """Return the tokens of ``words`` that are not in NLTK's English stop-word list."""
    english_stopwords = set(stopwords.words("english"))
    return [token for token in words if token not in english_stopwords]

def stem_words(words):
    """Return the Lancaster stem of each token in ``words``, in the same order."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

In [3]:
def lexical_analysis(words):
    """Normalize tokens: strip non-ASCII characters, lowercase, then remove digits.

    The steps run in that order, each receiving the previous step's output.
    """
    for step in (remove_non_ascii, to_lowercase, remove_numbers):
        words = step(words)
    return words

In [4]:
def read_data(path):
    """Read every file in ``path`` and return its HTML-stripped text.

    Args:
        path: Directory that contains the documents.

    Returns:
        A list of ``(filename, text)`` tuples ordered by filename, where
        ``text`` is the file's content after ``strip_html``.
    """
    contents = []
    # os.listdir returns entries in arbitrary order; sort so that the result
    # (and everything derived from it) is reproducible across runs.
    for filename in sorted(os.listdir(path)):
        full_path = os.path.join(path, filename)
        if not os.path.isfile(full_path):
            # Sub-directories (e.g. .ipynb_checkpoints) cannot be opened as documents.
            continue
        # The context manager closes the handle even if reading or parsing fails.
        with open(full_path, 'r') as handle:
            data = strip_html(handle.read())
        contents.append((filename, data))
    return contents

In [5]:
def get_vocabulary(data):
    """Return the whitespace-separated tokens stored in ``vocabulary.txt``.

    The file is looked up in the current working directory. ``data`` is
    accepted for the caller's convenience but is not consulted.
    """
    vocabulary_path = os.path.join(os.getcwd(),"vocabulary.txt")
    with open(vocabulary_path,"r") as vocabulary_file:
        return vocabulary_file.read().split()

In [6]:
def preprocess_data(contents):
    """Turn ``(name, text)`` entries into a dict of name -> processed tokens.

    For each entry the text has its punctuation replaced by spaces, is
    tokenized with ``word_tokenize``, normalized by ``lexical_analysis``,
    filtered for stop words, stemmed, and filtered for stop words once more.
    """
    # Translation table that maps every punctuation character to a space.
    punctuation_to_space = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    processed = {}
    for entry in contents:
        name, text = entry[0], entry[1]
        tokens = word_tokenize(text.translate(punctuation_to_space))
        tokens = remove_stopwords(lexical_analysis(tokens))
        # Stop words are filtered a second time on the stemmed tokens.
        processed[name] = remove_stopwords(stem_words(tokens))
    return processed

In [7]:
def generate_inverted_index(data):
    """Build a term -> {document -> frequency} index over the vocabulary.

    Args:
        data: Mapping of document name to its list of preprocessed tokens.

    Returns:
        A dict keyed by every word returned by ``get_vocabulary``. Each value
        is a dict giving that word's occurrence count in every document of
        ``data`` (0 when the word does not occur).
    """
    all_words = get_vocabulary(data)
    # Tally each document once instead of rescanning its token list with
    # list.count for every vocabulary word.
    doc_counts = {doc: Counter(tokens) for doc, tokens in data.items()}
    index = {}
    for word in all_words:
        # Counter yields 0 for a missing key, so absent words still get an
        # explicit 0 entry per document.
        index[word] = {doc: counts[word] for doc, counts in doc_counts.items()}
    return index

In [8]:
# Pipeline: load the raw documents from the "docs" directory, reduce each one
# to a list of normalized tokens, then count vocabulary terms per document.
data = read_data("docs")
preprocessed_data = preprocess_data(data)
inverted_index = generate_inverted_index(preprocessed_data)

In [9]:
# Display the whole index (term -> {document: count}); the output is large.
inverted_index

{'abandon': {'T1.txt': 0,
  'T10.txt': 2,
  'T2.txt': 0,
  'T3.txt': 0,
  'T4.txt': 0,
  'T5.txt': 0,
  'T6.txt': 0,
  'T7.txt': 0,
  'T8.txt': 0,
  'T9.txt': 1},
 'abbrevy': {'T1.txt': 0,
  'T10.txt': 0,
  'T2.txt': 0,
  'T3.txt': 0,
  'T4.txt': 0,
  'T5.txt': 0,
  'T6.txt': 0,
  'T7.txt': 1,
  'T8.txt': 0,
  'T9.txt': 0},
 'abdom': {'T1.txt': 0,
  'T10.txt': 0,
  'T2.txt': 0,
  'T3.txt': 0,
  'T4.txt': 0,
  'T5.txt': 0,
  'T6.txt': 0,
  'T7.txt': 1,
  'T8.txt': 0,
  'T9.txt': 0},
 'abdomin': {'T1.txt': 0,
  'T10.txt': 0,
  'T2.txt': 0,
  'T3.txt': 0,
  'T4.txt': 0,
  'T5.txt': 0,
  'T6.txt': 0,
  'T7.txt': 1,
  'T8.txt': 0,
  'T9.txt': 0},
 'aberdeen': {'T1.txt': 1,
  'T10.txt': 0,
  'T2.txt': 0,
  'T3.txt': 0,
  'T4.txt': 0,
  'T5.txt': 0,
  'T6.txt': 0,
  'T7.txt': 0,
  'T8.txt': 0,
  'T9.txt': 0},
 'abey': {'T1.txt': 0,
  'T10.txt': 0,
  'T2.txt': 0,
  'T3.txt': 0,
  'T4.txt': 0,
  'T5.txt': 0,
  'T6.txt': 0,
  'T7.txt': 1,
  'T8.txt': 0,
  'T9.txt': 0},
 'abid': {'T1.txt': 0,
  '

In [10]:
# Transpose so that terms become rows and documents become columns.
inverted_index_df = pd.DataFrame(inverted_index).T

In [11]:
# Show the term-by-document count table.
inverted_index_df

Unnamed: 0,T1.txt,T10.txt,T2.txt,T3.txt,T4.txt,T5.txt,T6.txt,T7.txt,T8.txt,T9.txt
abandon,0,2,0,0,0,0,0,0,0,1
abbrevy,0,0,0,0,0,0,0,1,0,0
abdom,0,0,0,0,0,0,0,1,0,0
abdomin,0,0,0,0,0,0,0,1,0,0
aberdeen,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
zint,0,0,2,0,0,0,0,0,0,12
zip,0,1,0,0,0,0,0,0,0,0
zon,0,0,0,0,0,0,0,2,0,0
zoolog,5,0,0,0,0,0,1,2,0,0


In [12]:
# Save the term-by-document table as "inverted_index.xlsx" (relative path, so it lands in the working directory).
inverted_index_df.to_excel("inverted_index.xlsx")

In [13]:
# Interactive lookup: keep asking for query words until the user answers
# anything other than 1 at the continuation prompt.
k = 1
while k == 1:
    words = input("Enter words for which u want the inverted index : ").split()
    # Apply the same normalization steps that preprocess_data applies to the
    # documents after tokenizing, so query terms match the indexed stems.
    words = lexical_analysis(words)
    words = remove_stopwords(words)
    words = stem_words(words)
    words = remove_stopwords(words)

    for word in words:
        print("\n")
        if word in inverted_index:
            print(word,inverted_index[word])
        else:
            print(word,"No entry in inverted index")
    print("\n")
    try:
        k = int(input("input 1 for more search else any other number : "))
    except ValueError:
        # A non-numeric reply (e.g. "n" or just Enter) means "stop" rather
        # than crashing the cell with a traceback.
        k = 0


Enter words for which u want the inverted index : possessed zoo


possess {'T1.txt': 1, 'T10.txt': 5, 'T2.txt': 0, 'T3.txt': 0, 'T4.txt': 0, 'T5.txt': 1, 'T6.txt': 0, 'T7.txt': 2, 'T8.txt': 0, 'T9.txt': 0}


zoo No entry in inverted index


input 1 for more search else any other number : 0
