# **Create an inverted index using Python and execute boolean queries on the same**


---



**Importing Necessary Libraries**

In [3]:
# Importing Libraries
import pandas as pd
import numpy as np
import re
import nltk
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [4]:
# Mount Google Drive (Colab only) so the text files read later from
# /content/drive/MyDrive/NLP are accessible.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Fetch the NLTK stop-word corpus used below via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Fetch the Punkt tokenizer models that word_tokenize relies on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
  # English stop words, held in a set for fast membership tests while filtering.
  stop_words = {stop_word for stop_word in stopwords.words('english')}

**Opening the text files**

In [8]:
# Opening the text files
def _read_clean_text(path):
  """Read a UTF-8 text file and replace every non-alphanumeric character with a space.

  The file is opened in a ``with`` block so the handle is always closed,
  even if reading fails.

  Args:
    path: Location of the text file to read.

  Returns:
    The file contents with each character outside A-Z, a-z and 0-9
    (punctuation, newlines, ...) replaced by a single space.
  """
  with open(path, encoding='utf-8') as handle:
    raw_text = handle.read()
  # Substituting over the whole text gives the same string as splitting on
  # spaces, cleaning each piece and re-joining with spaces.
  return re.sub('[^A-Za-z0-9]', ' ', raw_text)


file_data1 = _read_clean_text('/content/drive/MyDrive/NLP/part1.txt')
file_data2 = _read_clean_text('/content/drive/MyDrive/NLP/part2.txt')
file_data3 = _read_clean_text('/content/drive/MyDrive/NLP/part3.txt')

**Filtered text**

In [9]:
# Tokenise the three cleaned documents as one text, then drop stop words
# (compared case-insensitively) and re-join the survivors with single spaces.
text_tokens = word_tokenize(" ".join([file_data1, file_data2, file_data3]))
filtered_text = " ".join(
    [token for token in text_tokens if token.lower() not in stop_words]
)

In [10]:
filtered_text



**Retrieving Unique Words**

In [11]:
# Retrieving unique words from the document
# The filtered text is split on single spaces; the resulting vocabulary is
# kept both as a one-element list of sets and as a flat set.
unique_words_list = [set(filtered_text.split(" "))]
unique_words = set()
for vocabulary in unique_words_list:
  unique_words = unique_words.union(vocabulary)

In [12]:
unique_words

{'promising',
 'automatically',
 'intimations',
 'European',
 'partial',
 'combat',
 'killed',
 'case',
 'oblivion',
 'society',
 'physical',
 'classify',
 'appears',
 'wide',
 'HOUSE',
 'addresses',
 'message',
 'frozen',
 '84',
 'exultant',
 'peel',
 'fingers',
 'Archive',
 'Suttner',
 'illustrations',
 'clad',
 'massive',
 'credit',
 'nodded',
 'demonstration',
 'ultimate',
 'nevertheless',
 'recording',
 'deeds',
 'apprehension',
 'blended',
 'youse',
 'economic',
 'Dead',
 'repeating',
 'alteration',
 'piteously',
 'formerly',
 'magic',
 'slightest',
 'Lily',
 'Joe',
 'dull',
 'scab',
 'designated',
 'Terms',
 'due',
 'determination',
 'procedure',
 'reminded',
 'solely',
 'example',
 'BALCH',
 'forty',
 'credulity',
 'interest',
 'followed',
 'offered',
 'prosaic',
 'accumulated',
 'check',
 'based',
 'exhibiting',
 'similarity',
 'inevitable',
 'enough',
 'rate',
 'blessings',
 'us',
 'appeal',
 'strata',
 'unchanging',
 'red',
 'happiness',
 'board',
 'RIGHT',
 'evolved',
 'str

In [13]:
# Defining a dictionary which has the unique words as keys and the document number as values
# The document number is from 1 to 3.
# Each document is split into a set of tokens once, so every membership test
# is a set lookup instead of re-splitting and scanning the full text per word.
doc_vocabularies = {
  '1': set(file_data1.split(" ")),
  '2': set(file_data2.split(" ")),
  '3': set(file_data3.split(" ")),
}

document = defaultdict(list)
for word in unique_words:
  # Skip empty or whitespace-only tokens. The original test
  # `word != " " or word != ''` was always True and so never filtered them.
  if not word.strip():
    continue
  # Document ids are visited in the order '1', '2', '3', so each posting
  # list stays sorted.
  for doc_id, vocabulary in doc_vocabularies.items():
    if word in vocabulary:
      document[word].append(doc_id)


In [14]:
document

defaultdict(list,
            {'promising': ['2'],
             'automatically': ['3'],
             'intimations': ['3'],
             'European': ['2'],
             'partial': ['2'],
             'combat': ['3'],
             'killed': ['1', '2', '3'],
             'case': ['1', '2', '3'],
             'oblivion': ['3'],
             'society': ['1', '2', '3'],
             'physical': ['3'],
             'classify': ['3'],
             'appears': ['1', '3'],
             'wide': ['1', '2', '3'],
             'HOUSE': ['1'],
             'addresses': ['3'],
             'message': ['3'],
             'frozen': ['2'],
             '84': ['1'],
             'exultant': ['2'],
             'peel': ['2'],
             'fingers': ['1'],
             'Archive': ['1', '3'],
             'Suttner': ['2'],
             'illustrations': ['3'],
             'clad': ['1'],
             'massive': ['3'],
             'credit': ['3'],
             'nodded': ['2'],
             'demonstration': ['

In [16]:
# All indexed terms (the keys of the inverted index) as a list.
doc_coll = [term for term in document]

In [18]:
# Tabular view of the inverted index: one row per term with its posting list.
posting_rows = [(term, postings) for term, postings in document.items()]
df = pd.DataFrame(posting_rows, columns=['words', 'Posting List'])
df

Unnamed: 0,words,Posting List
0,promising,[2]
1,automatically,[3]
2,intimations,[3]
3,European,[2]
4,partial,[2]
...,...,...
5504,seem,"[1, 2, 3]"
5505,hesitating,[1]
5506,solitary,[3]
5507,curious,"[2, 3]"


In [19]:
# Maps each document number used in the posting lists to the name of its text.
map_dict = dict(zip(('1', '2', '3'), ('Greuze1', 'Greuze2', 'Greuze3')))

In [22]:
# Function for applying boolean operators to the inverted index
def inverted_index(query, index=None):
  """Translate a boolean query into a Python set-expression string.

  The query is split on whitespace and translated token by token:

  * ``and`` becomes ``&`` and ``or`` becomes ``|``.
  * ``not`` becomes ``{'1','2','3'}-``, i.e. the set of all document numbers
    minus the operand that follows.
  * Any other purely alphabetic token becomes the ``repr`` of the set of
    document numbers in its posting list (``set()`` for unknown terms).
  * Every remaining token (digits, punctuation, ...) is copied unchanged.

  Translating whole tokens, rather than calling ``str.replace`` on the query,
  stops an operator from being substituted inside a longer word such as
  'android' or 'fork'. Unknown terms are looked up with ``get`` so they are
  not inserted into the index as a side effect.

  Notes:
    * Operators are matched in lowercase only; terms are case-sensitive.
    * A chained ``not not x`` is not handled: set subtraction is
      left-associative, so it evaluates as ``(U - U) - x``.
    * Tokens copied through unchanged end up in the returned string, so the
      result must not be passed to ``eval`` for untrusted queries.

  Args:
    query: Boolean query, e.g. ``'fear and not happy'``.
    index: Mapping of term to list of document numbers. Defaults to the
      notebook-level ``document`` index.

  Returns:
    The translated expression as a string, tokens separated by one space.
  """
  if index is None:
    index = document
  operators = {'and': '&', 'or': '|', 'not': "{'1','2','3'}-"}
  translated = []
  for word_token in query.split():
    if word_token in operators:
      translated.append(operators[word_token])
    elif word_token.isalpha():
      translated.append(repr(set(index.get(word_token, []))))
    else:
      translated.append(word_token)
  return " ".join(translated)





**Executing Boolean Queries**

*Query 1*

In [23]:
# Query 1: documents containing both 'fear' and 'happy'.
query1 = 'fear and happy'
exp1 = inverted_index(query1)
exp1

"{'1', '3'} & {'1', '2'}"

From the expression above, we can conclude that the word 'fear' is contained in documents 1 and 3, and the word 'happy' is contained in documents 1 and 2.

In [25]:
# Evaluate the generated set expression to get the matching document numbers.
# NOTE(review): eval executes arbitrary Python, and the expression is built
# from the query string -- only use this with trusted, hand-written queries.
final_result1 = eval(exp1)
final_result1

{'1'}

In [26]:
# Print the text name for each matching document number (set order is arbitrary).
for i in final_result1:
  print(map_dict[i])

Greuze1


From the result of Query 1, we can infer that document 1, which corresponds to the text 'Greuze1', is the only text that contains both the words 'fear' and 'happy'.

*Query 2*

In [27]:
# Query 2: documents containing 'fear' but not 'happy'.
query2 = 'fear and not happy'
exp2 = inverted_index(query2)
exp2

"{'1', '3'} & {'1','2','3'}- {'1', '2'}"

In [28]:
# Evaluate the generated set expression to get the matching document numbers.
# NOTE(review): eval executes arbitrary Python, and the expression is built
# from the query string -- only use this with trusted, hand-written queries.
final_result2 = eval(exp2)
final_result2

{'3'}

In [29]:
# Print the text name for each matching document number (set order is arbitrary).
for i in final_result2:
  print(map_dict[i])

Greuze3


*Query 3*

In [30]:
# Query 3: documents containing 'fear' or 'happy' (or both).
query3 = 'fear or happy'
exp3 = inverted_index(query3)
exp3

"{'1', '3'} | {'1', '2'}"

In [31]:
# Evaluate the generated set expression to get the matching document numbers.
# NOTE(review): eval executes arbitrary Python, and the expression is built
# from the query string -- only use this with trusted, hand-written queries.
final_result3 = eval(exp3)
final_result3

{'1', '2', '3'}

In [32]:
# Print the text name for each matching document number (set order is arbitrary).
for i in final_result3:
  print(map_dict[i])

Greuze1
Greuze3
Greuze2


From the result of Query 3, we can infer that documents 1, 2 and 3, which correspond to the texts 'Greuze1', 'Greuze2' and 'Greuze3' respectively, each contain at least one of the words 'fear' or 'happy'.

*Query 4*

In [33]:
# Query 4: documents missing 'fear' or missing 'happy'.
query4 = 'not fear or not happy'
exp4 = inverted_index(query4)
exp4

"{'1','2','3'}- {'1', '3'} | {'1','2','3'}- {'1', '2'}"

In [34]:
# Evaluate the generated set expression to get the matching document numbers.
# NOTE(review): eval executes arbitrary Python, and the expression is built
# from the query string -- only use this with trusted, hand-written queries.
final_result4 = eval(exp4)
final_result4

{'2', '3'}

In [35]:
# Print the text name for each matching document number (set order is arbitrary).
# Note: despite its name, word_token holds a document number here.
for word_token in final_result4:
  print(map_dict[word_token])

Greuze3
Greuze2


From the result of Query 4, we can infer that documents 2 and 3, which correspond to the texts 'Greuze2' and 'Greuze3' respectively, either do not contain the word 'fear' or do not contain the word 'happy'.

*Query 5*

In [36]:
# Query 5: documents containing 'fear' or not containing 'happy'.
query5="fear or not happy"
exp5=inverted_index(query5)
exp5

"{'1', '3'} | {'1','2','3'}- {'1', '2'}"

In [37]:
# Evaluate the generated set expression to get the matching document numbers.
# NOTE(review): eval executes arbitrary Python, and the expression is built
# from the query string -- only use this with trusted, hand-written queries.
final_result5=eval(exp5)
final_result5

{'1', '3'}

In [38]:
# Print the text name for each matching document number (set order is arbitrary).
# Note: despite its name, word_token holds a document number here.
for word_token in final_result5:
  print(map_dict[word_token])

Greuze1
Greuze3


From the result of Query 5, it is clear that documents 1 and 3, corresponding to the texts 'Greuze1' and 'Greuze3' respectively, either contain the word 'fear' or do not contain the word 'happy'.


---



---

