<a href="https://colab.research.google.com/github/SreyaSalil/IR-Assignments/blob/main/IR_Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IR Assignment 2

*Implementation of Inverted index: Construction and searching*

## Import Packages

In [66]:
import string
import itertools
import math
import operator
# To access files
import os
# For DataTable
import pandas as pd
# To calculate mean
from statistics import mean
# To remove HTML tags
from bs4 import BeautifulSoup
#To remove Numbers in text using RE
import re
#accent removal
import unicodedata
#stop word removal
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
#Stemming
from nltk.stem import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Preprocessing Functions

### Remove closed and unclosed HTML tags

In [67]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

### Lexical Analysis

In [68]:
# Remove accented characters (é, è, â, î, ô, ñ)
def remove_non_ascii(words):
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

#Convert all characters to lowercase from list of tokenized words
def to_lowercase(words):
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

# Remove punctuation from list of tokenized words
def remove_punctuation(words):
    new_words = []
    for word in words:
        new_word = word.translate(str.maketrans("","",string.punctuation))
        if new_word != '':
            new_words.append(new_word)
    return new_words

# Replace all interger occurrences in list of tokenized words with textual representation
def remove_numbers(words):
    new_words = []
    for word in words:
        new_word = re.sub(r'\d+','',word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def lexical_analysis(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_numbers(words)
    return words

### Stop word elimination

In [69]:
#Remove stop words from list of tokenized words
def remove_stopwords(words):
    new_words = []
    stop_words = set(stopwords.words("english"))
    for word in words:
        if word not in stop_words:
            new_words.append(word)
    return new_words

### Stemming

In [70]:
#Stem words in list of tokenized words
def stem_words(words):
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

### Read data and get vocabulary

In [71]:
def read_data(path):
    contents = []
    for filename in os.listdir(path):
        data = strip_html(open(path+'/'+filename,encoding="utf8", errors='ignore').read())
        contents.append((filename,data))
    return contents

In [72]:
def get_vocabulary(data):
    tokens = []
    with open(os.path.join(os.getcwd(),"vocabulary.txt"),encoding="utf8", errors='ignore') as rf:
        tokens = rf.read().split()
    return tokens

### Step-by-step document preprocessing

In [73]:
def preprocess_data(contents):
    dataDict = {}
    for content in contents:
        sample = content[1]
        sample = sample.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
        tokens = word_tokenize(sample)
        lexical = lexical_analysis(tokens)
        filtered_tokens = remove_stopwords(lexical)
        stemmed_tokens = stem_words(filtered_tokens)
        filtered_tokens1 = remove_stopwords(stemmed_tokens)
        dataDict[content[0]] = filtered_tokens1
    return dataDict

## Generating an inverted index from documents

### Function to generate inverted index

In [74]:
def generate_inverted_index(data):
    all_words = get_vocabulary(data)
    index = {}
    for word in all_words:
        index[word] = {}
        for doc, tokens in data.items():
            index[word][doc] = tokens.count(word)
    return index

### Preprocessing docs and generating inverted index

In [75]:
data = read_data("docs")
preprocessed_data = preprocess_data(data)
inverted_index = generate_inverted_index(preprocessed_data)
inverted_index

{'abandon': {'T1.txt': 0,
  'T10.txt': 2,
  'T2.txt': 0,
  'T3.txt': 0,
  'T4.txt': 0,
  'T5.txt': 0,
  'T6.txt': 0,
  'T7.txt': 0,
  'T8.txt': 0,
  'T9.txt': 1},
 'abbrevy': {'T1.txt': 0,
  'T10.txt': 0,
  'T2.txt': 0,
  'T3.txt': 0,
  'T4.txt': 0,
  'T5.txt': 0,
  'T6.txt': 0,
  'T7.txt': 1,
  'T8.txt': 0,
  'T9.txt': 0},
 'abdom': {'T1.txt': 0,
  'T10.txt': 0,
  'T2.txt': 0,
  'T3.txt': 0,
  'T4.txt': 0,
  'T5.txt': 0,
  'T6.txt': 0,
  'T7.txt': 1,
  'T8.txt': 0,
  'T9.txt': 0},
 'abdomin': {'T1.txt': 0,
  'T10.txt': 0,
  'T2.txt': 0,
  'T3.txt': 0,
  'T4.txt': 0,
  'T5.txt': 0,
  'T6.txt': 0,
  'T7.txt': 1,
  'T8.txt': 0,
  'T9.txt': 0},
 'aberdeen': {'T1.txt': 1,
  'T10.txt': 0,
  'T2.txt': 0,
  'T3.txt': 0,
  'T4.txt': 0,
  'T5.txt': 0,
  'T6.txt': 0,
  'T7.txt': 0,
  'T8.txt': 0,
  'T9.txt': 0},
 'abey': {'T1.txt': 0,
  'T10.txt': 0,
  'T2.txt': 0,
  'T3.txt': 0,
  'T4.txt': 0,
  'T5.txt': 0,
  'T6.txt': 0,
  'T7.txt': 1,
  'T8.txt': 0,
  'T9.txt': 0},
 'abid': {'T1.txt': 0,
  '

### Converting inverted index to a datatable and saving it to a Excel file

In [76]:
inverted_index_df = pd.DataFrame(inverted_index).T
inverted_index_df.to_excel("inverted_index.xlsx")
inverted_index_df

Unnamed: 0,T1.txt,T4.txt,T2.txt,T3.txt,T7.txt,T6.txt,T8.txt,T9.txt,T10.txt,T5.txt
abandon,0,0,0,0,0,0,0,1,2,0
abbrevy,0,0,0,0,1,0,0,0,0,0
abdom,0,0,0,0,1,0,0,0,0,0
abdomin,0,0,0,0,1,0,0,0,0,0
aberdeen,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
zon,0,0,0,0,2,0,0,0,0,0
zoo,0,0,0,0,0,0,0,0,0,0
zoolog,5,0,0,0,2,0,0,0,0,0
zoophyt,1,0,0,0,0,3,0,0,0,0


## Searching with inverted index

In [77]:
k = 1
while(k==1):
    words = input("Enter words for which u want the inverted index : ").split()
    words = lexical_analysis(words)
    words = remove_stopwords(words)
    words = stem_words(words)
    words = remove_stopwords(words)

    for word in words:
        print("\n")
        if word in inverted_index.keys():
            print(word,inverted_index[word])
        else:
            print(word,"No entry in inverted index")
    print("\n")
    k = int(input("input 1 for more search else any other number : "))


Enter words for which u want the inverted index : lung fish


lung {'T1.txt': 0, 'T4.txt': 0, 'T2.txt': 0, 'T3.txt': 0, 'T7.txt': 7, 'T6.txt': 0, 'T8.txt': 0, 'T9.txt': 0, 'T10.txt': 0, 'T5.txt': 0}


fish {'T1.txt': 11, 'T4.txt': 2, 'T2.txt': 0, 'T3.txt': 0, 'T7.txt': 30, 'T6.txt': 2, 'T8.txt': 0, 'T9.txt': 0, 'T10.txt': 0, 'T5.txt': 0}


input 1 for more search else any other number : 7
