<a href="https://colab.research.google.com/github/SreyaSalil/IR-Assignments/blob/main/IR_Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IR Assignment 2

*Implementation of Inverted index: Construction and searching*

## Import Packages

In [157]:
import string
import itertools
import math
import operator
#For acessing files
import glob 
import errno
#To remove Numbers in text using RE
import re
#accent removal
import unicodedata
#stop word removal
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
#Stemming
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Preprocessing Functions

Read all lines in file

In [158]:
def readFile(file):
  line = file.read().replace("\n", " ")
  file.close()
  return line

Lexical Analysis

In [159]:
def convertToLowercase(text):
  return text.lower()

def removeNumbers(text):
  return re.sub(r'\d+', '', text)

def removePunctuation(text):
  return text.translate(str.maketrans('','',string.punctuation))

def removeWhitespace(text):
  return ' '.join(text.split())

def lexicalAnalysis(text):
  text=convertToLowercase(text)
  text=removeNumbers(text)
  text=removePunctuation(text)
  text=removeWhitespace(text)
  return text

Accent Removal

In [160]:
def removeAccent(text):
  new_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  return new_text

Stop word elimination

In [161]:
def eliminateStopword(tokens):
  stop_words = set(stopwords.words('english'))
  result = [i for i in tokens if not i in stop_words and len(i) > 2]
  return result

Stemming

In [162]:
def stemming(tokens):
  stemmer= PorterStemmer()
  output=[stemmer.stem(word) for word in tokens]
  return output

Step-by-step document preprocessing

In [163]:
def preprocess_data(contents):
  dataDict = {}
  for content in contents:
    textLA=lexicalAnalysis(content[1])
    textRA=removeAccent(textLA)
    tokens=word_tokenize(textRA)
    tokenES=eliminateStopword(tokens)
    stems=stemming(tokenES)
    finalTokens=eliminateStopword(stems)
    dataDict[content[0]] = finalTokens
  return dataDict

Query Preprocessing

In [164]:
def preprocess_queries():
    queriesDict = {}
    queries = open('/content/sample_data/queries.txt','r').read().split('\n')
    i = 1
    for query in queries:
        textLA=lexicalAnalysis(query)
        textRA=removeAccent(textLA)
        tokens=word_tokenize(textRA)
        tokenES=eliminateStopword(tokens)
        stems=stemming(tokenES)
        queriesDict[i] = stems
        i+=1
    return queriesDict


## Generating an inverted index from documents

Retrieve all the files uploaded in Colab

In [165]:
path = '/content/sample_data/*.txt' 
files = glob.glob(path)

Create a list of tuples (documentNumber, documentText)

In [166]:
uniqueTerms=set()
fileText=[]
i=0
for name in files:
  try: 
        with open(name,encoding="utf8", errors='ignore') as f:
          i=i+1
          text=readFile(f)
          fileText.append((i,text))
  except IOError as exc: 
        if exc.errno != errno.EISDIR: 
            raise

Function to generate inverted index

In [167]:
def generate_inverted_index(data):
    all_words = get_vocabulary(data)
    index = {}
    for word in all_words:
        for doc, tokens in data.items():
            if word in tokens :
                if word in index.keys():
                    index[word].append(doc)
                else:
                    index[word] = [doc]
    return index

## Functions to calculate tf-idf values

In [168]:
def calculate_tf(tokens):
    tf_score = {}
    for token in tokens:
        tf_score[token] = tokens.count(token)
    return tf_score

In [169]:
def get_vocabulary(data):
    tokens = []
    for token_list in data.values():
        tokens = tokens + token_list
    fdist = nltk.FreqDist(tokens)
    return list(fdist.keys())

In [170]:
def calculate_idf(data):
    idf_score = {}
    N = len(data)
    all_words = get_vocabulary(data)
    for word in all_words:
        word_count = 0
        for token_list in data.values():
            if word in token_list:
                word_count += 1
        idf_score[word] = math.log10(N/word_count)
    return idf_score

In [171]:
def calculate_tfidf(data, idf_score):
    scores = {}
    for key,value in data.items():
        scores[key] = calculate_tf(value)
    for doc,tf_scores in scores.items():
        for token, score in tf_scores.items():
            tf = score
            idf = idf_score[token]
            tf_scores[token] = tf * idf
    return scores

In [172]:
def calculate_tfidf_queries(queries, idf_score):
    scores = {}
    for key, value in queries.items():
        scores[key] = calculate_tf(value)
    for key, tf_scores in scores.items():
        for token, score in tf_scores.items():
            idf = 0
            tf = score
            if token in idf_score.keys():
                idf = idf_score[token]
            tf_scores[token] = tf * idf
    return scores

## Searching Documents with Queries

Accept Queries from user

In [173]:
#Delete queries.txt before every run
numQuery = input('Enter the number of queries:')
for i in range(1,int(numQuery)+1):
  text=input('Enter query '+str(i)+':')
  with open('/content/sample_data/queries.txt' , 'a') as writefile:
      writefile.write(text+'\n')

Enter the number of queries:2
Enter query 1:A reptile's response to solar exposure
Enter query 2:Solar exposure


Preprocess queries and documents, Generate inverted index and Calculate tf-idf values

In [174]:
preprocessed_data = preprocess_data(fileText)
inverted_index = generate_inverted_index(preprocessed_data)
idf_scores = calculate_idf(preprocessed_data)
scores = calculate_tfidf(preprocessed_data,idf_scores)

queries = preprocess_queries()
query_scores = calculate_tfidf_queries(queries,idf_scores)


Rank Documents

In [175]:
query_docs = {}
for key, value in queries.items():
    doc_sim = {}
    for term in value:
        if term in inverted_index.keys():
            docs = inverted_index[term]
            for doc in docs:
                doc_score = scores[doc][term]
                doc_length = math.sqrt(sum(x ** 2 for x in scores[doc].values()))
                query_score = query_scores[key][term]
                query_length = math.sqrt(sum(x ** 2 for x in query_scores[key].values()))
                cosine_sim = (doc_score * query_score) / (doc_length * query_length)
                if doc in doc_sim.keys():
                    doc_sim[doc] += cosine_sim
                else:
                    doc_sim[doc] = cosine_sim
    ranked = sorted(doc_sim.items(), key=operator.itemgetter(1), reverse=True)
    query_docs[key] = ranked

Output Results

In [177]:
for i in range(1, len(query_docs) + 1):
  docs = query_docs[i][:10]
  doc_list = [x[0] for x in docs]
  if len(doc_list):
    print("Rank of documents for query "+str(i)+" :")
    print(doc_list)

Rank of documents for query 1 :
[10, 1, 7, 6, 4, 8, 3, 2, 9, 5]
Rank of documents for query 2 :
[7, 6, 4, 8, 3, 2, 9, 5, 10]
