## **Importing libraries**

In [None]:
from google.colab import drive
import pandas as pd 
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
import re


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## **Loading the dataset**

In [None]:
# Mount Google Drive so the dataset stored there is reachable from the runtime.
drive.mount('/content/drive')

# Read the reviews CSV into the working DataFrame.
reviews_csv_path = "/content/drive/MyDrive/AIWR/Dataset/Amazon_reviews.csv"
df = pd.read_csv(reviews_csv_path)

Mounted at /content/drive


## **Preprocessing**

In [None]:
# Name the four columns of the raw CSV.
# Assigning `df.columns` renames in place; `set_axis(..., inplace=True)` is
# avoided because the `inplace` argument was removed in pandas 2.0.
df.columns = ["t_id", "Polarity", "Title", "Review"]
df.head()

Unnamed: 0,t_id,Polarity,Title,Review
0,390000,2,Love it,Love this product. Great shine even for a any ...
1,390001,2,eye-opening...,I'd just like say that this book is absolutely...
2,390002,2,A Beautiful Travelography,Alan Booth has the most wonderful way of makin...
3,390003,2,It was very helpful.,I read this book a few years ago and used the ...
4,390004,2,Goodle,A very nice introduction to large format. Give...


In [None]:
# Give every review a sequential document ID: 0, 1, ..., len(df) - 1.
doc_count = len(df)
df['Doc_ID'] = range(doc_count)
df.head()

Unnamed: 0,t_id,Polarity,Title,Review,Doc_ID
0,390000,2,Love it,Love this product. Great shine even for a any ...,0
1,390001,2,eye-opening...,I'd just like say that this book is absolutely...,1
2,390002,2,A Beautiful Travelography,Alan Booth has the most wonderful way of makin...,2
3,390003,2,It was very helpful.,I read this book a few years ago and used the ...,3
4,390004,2,Goodle,A very nice introduction to large format. Give...,4


In [None]:
# Drop the columns that are not needed, then move the document ID to the
# front of the frame (it is re-inserted under the name 'Doc_Id').
df.drop(columns=['t_id', 'Polarity'], inplace=True)
df.insert(0, 'Doc_Id', df.pop('Doc_ID'))
df.head()

Unnamed: 0,Doc_Id,Title,Review
0,0,Love it,Love this product. Great shine even for a any ...
1,1,eye-opening...,I'd just like say that this book is absolutely...
2,2,A Beautiful Travelography,Alan Booth has the most wonderful way of makin...
3,3,It was very helpful.,I read this book a few years ago and used the ...
4,4,Goodle,A very nice introduction to large format. Give...


In [None]:
# Lower-case every review text before tokenizing.
lowercased_reviews = df["Review"].str.lower()
df["Review"] = lowercased_reviews
df.head()

Unnamed: 0,Doc_Id,Title,Review
0,0,Love it,love this product. great shine even for a any ...
1,1,eye-opening...,i'd just like say that this book is absolutely...
2,2,A Beautiful Travelography,alan booth has the most wonderful way of makin...
3,3,It was very helpful.,i read this book a few years ago and used the ...
4,4,Goodle,a very nice introduction to large format. give...


In [None]:
# Sentence tokenization: build one list of sentences per review.
df1 = df['Review']
l = [sent_tokenize(line) for line in df1]

In [None]:
# Store the per-review sentence lists collected in `l` as a new column.
df1 = df['Review']
sentence_token_column = pd.Series(l)
df['sentence_token'] = sentence_token_column
df.head()

Unnamed: 0,Doc_Id,Title,Review,sentence_token
0,0,Love it,love this product. great shine even for a any ...,"[love this product., great shine even for a an..."
1,1,eye-opening...,i'd just like say that this book is absolutely...,[i'd just like say that this book is absolutel...
2,2,A Beautiful Travelography,alan booth has the most wonderful way of makin...,[alan booth has the most wonderful way of maki...
3,3,It was very helpful.,i read this book a few years ago and used the ...,[i read this book a few years ago and used the...
4,4,Goodle,a very nice introduction to large format. give...,"[a very nice introduction to large format., gi..."


In [None]:
# Word tokenization: build one list of word tokens per review.
df1 = df['Review']
l1 = [word_tokenize(line) for line in df1]
     

In [None]:
# Create a column holding the word tokens of each review.
# `l1` is the list of per-review token lists built by the word-tokenizing
# loop; it is assigned as-is, one list per row.
df1=df['Review']
df['word_token']=l1
# Show the first rows to check the new column.
df.head()

Unnamed: 0,Doc_Id,Title,Review,sentence_token,word_token
0,0,Love it,love this product. great shine even for a any ...,"[love this product., great shine even for a an...","[love, this, product, ., great, shine, even, f..."
1,1,eye-opening...,i'd just like say that this book is absolutely...,[i'd just like say that this book is absolutel...,"[i, 'd, just, like, say, that, this, book, is,..."
2,2,A Beautiful Travelography,alan booth has the most wonderful way of makin...,[alan booth has the most wonderful way of maki...,"[alan, booth, has, the, most, wonderful, way, ..."
3,3,It was very helpful.,i read this book a few years ago and used the ...,[i read this book a few years ago and used the...,"[i, read, this, book, a, few, years, ago, and,..."
4,4,Goodle,a very nice introduction to large format. give...,"[a very nice introduction to large format., gi...","[a, very, nice, introduction, to, large, forma..."


In [None]:
# STOP WORDS REMOVAL
# Fetch NLTK's stop-word corpus and load the English stop-word list.
import nltk
# The corpus must be downloaded before `stopwords.words` can read it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# `stoplist` holds the English stop words used by the filtering step below.
stoplist= stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Filter the stop words out of every tokenized review.
# A set makes the membership test in the comprehension fast.
stoplist = set(stoplist)
l2 = [[w for w in i if w not in stoplist] for i in l1]
df['stop_words_removed'] = l2
df.head()

Unnamed: 0,Doc_Id,Title,Review,sentence_token,word_token,stop_words_removed
0,0,Love it,love this product. great shine even for a any ...,"[love this product., great shine even for a an...","[love, this, product, ., great, shine, even, f...","[love, product, ., great, shine, even, nail, p..."
1,1,eye-opening...,i'd just like say that this book is absolutely...,[i'd just like say that this book is absolutel...,"[i, 'd, just, like, say, that, this, book, is,...","['d, like, say, book, absolutely, fantastic, !..."
2,2,A Beautiful Travelography,alan booth has the most wonderful way of makin...,[alan booth has the most wonderful way of maki...,"[alan, booth, has, the, most, wonderful, way, ...","[alan, booth, wonderful, way, making, reader, ..."
3,3,It was very helpful.,i read this book a few years ago and used the ...,[i read this book a few years ago and used the...,"[i, read, this, book, a, few, years, ago, and,...","[read, book, years, ago, used, information, re..."
4,4,Goodle,a very nice introduction to large format. give...,"[a very nice introduction to large format., gi...","[a, very, nice, introduction, to, large, forma...","[nice, introduction, large, format, ., gives, ..."


In [None]:
# LEMMATIZATION
# Reduce every remaining token to its lemma and keep only alphanumeric tokens
# (this drops punctuation and fragments such as "'d").
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
final_train_lemma_word = []
wordnet_lemmatizer = WordNetLemmatizer()
for line in df['stop_words_removed']:
  lemma_word = []
  for w in line:
      # Chain the lemmatizer over noun -> verb -> adjective so that forms
      # such as plurals and inflected verbs are all normalised.
      word1 = wordnet_lemmatizer.lemmatize(w, pos = "n")
      word2 = wordnet_lemmatizer.lemmatize(word1, pos = "v")
      word3 = wordnet_lemmatizer.lemmatize(word2, pos = ("a"))
      # Keep the result of the full chain. Appending `word1` here would
      # silently discard the verb and adjective passes.
      lemma_word.append(word3)
  lemma_word = [word for word in lemma_word if word.isalnum()]
  final_train_lemma_word.append(lemma_word)

print(final_train_lemma_word[0:5])

[nltk_data] Downloading package wordnet to /root/nltk_data...


[['love', 'product', 'great', 'shine', 'even', 'nail', 'polish', 'color', 'dry', 'pretty', 'quick', 'waste', 'time'], ['like', 'say', 'book', 'absolutely', 'fantastic', 'well', 'written', 'boothe', 'able', 'pull', 'japan', 'colorful', 'style', 'want', 'know', 'japanese', 'culture', 'foreigner', 'regarded', 'along', 'interesting', 'funny', 'story', 'first', 'hand', 'account', 'expect', 'amazed'], ['alan', 'booth', 'wonderful', 'way', 'making', 'reader', 'feel', 'triumph', 'sorrow', 'writing', 'style', 'give', 'book', 'visceral', 'feeling', 'easy', 'get', 'lost', 'book', 'added', 'author', 'bluntness', 'never', 'seems', 'glossing', 'truth', 'openly', 'talk', 'good', 'bad', 'done', 'read', 'alot', 'book', 'spent', 'morning', 'mister', 'donut', 'asakusa', 'trip', 'tokyo', 'photo', 'shoot', 'book', 'really', 'helped', 'get', 'creative', 'juice', 'flowing', 'made', 'want', 'return', 'japan', 'shoot', 'along', 'backside', 'also', 'made', 'realize', 'danger', 'eating', 'vindictive', 'would', '

In [None]:
# Attach the per-review lemma lists produced by the lemmatization loop
# as a new column, then show the first rows.
df['lemmatized_words']=final_train_lemma_word
df.head()

Unnamed: 0,Doc_Id,Title,Review,sentence_token,word_token,stop_words_removed,lemmatized_words
0,0,Love it,love this product. great shine even for a any ...,"[love this product., great shine even for a an...","[love, this, product, ., great, shine, even, f...","[love, product, ., great, shine, even, nail, p...","[love, product, great, shine, even, nail, poli..."
1,1,eye-opening...,i'd just like say that this book is absolutely...,[i'd just like say that this book is absolutel...,"[i, 'd, just, like, say, that, this, book, is,...","['d, like, say, book, absolutely, fantastic, !...","[like, say, book, absolutely, fantastic, well,..."
2,2,A Beautiful Travelography,alan booth has the most wonderful way of makin...,[alan booth has the most wonderful way of maki...,"[alan, booth, has, the, most, wonderful, way, ...","[alan, booth, wonderful, way, making, reader, ...","[alan, booth, wonderful, way, making, reader, ..."
3,3,It was very helpful.,i read this book a few years ago and used the ...,[i read this book a few years ago and used the...,"[i, read, this, book, a, few, years, ago, and,...","[read, book, years, ago, used, information, re...","[read, book, year, ago, used, information, res..."
4,4,Goodle,a very nice introduction to large format. give...,"[a very nice introduction to large format., gi...","[a, very, nice, introduction, to, large, forma...","[nice, introduction, large, format, ., gives, ...","[nice, introduction, large, format, give, mome..."


## **Inverted Index**

In [None]:
import math
from collections import Counter, defaultdict
from itertools import islice

# Number of index terms shown in the preview at the end of this cell.
# Printing the whole index produces an output too large to display.
PREVIEW_TERMS = 20

# Compute the document frequency of each term: the number of documents
# that contain it at least once (hence the set()).
dfs = defaultdict(int)
for doc in final_train_lemma_word:
    for term in set(doc):
        dfs[term] += 1

# Compute the inverse document frequencies: idf(t) = ln(N / df(t))
N = len(df)
idfs = {term: math.log(N / dfa) for term, dfa in dfs.items()}

# Compute the TF-IDF weight of each term in each document, using the raw
# term count as TF.
tfidfs = []
for doc in final_train_lemma_word:
    term_counts = Counter(doc)
    tfidfs.append({term: tf * idfs[term] for term, tf in term_counts.items()})

# Construct the inverted index: term -> list of (document id, TF-IDF weight)
inverted_index = defaultdict(list)
for i, doc in enumerate(tfidfs):
    for term, weight in doc.items():
        inverted_index[term].append((i, weight))

# Print a preview of the inverted index (first PREVIEW_TERMS terms only)
for term, postings in islice(inverted_index.items(), PREVIEW_TERMS):
    print(f"{term}: {postings}")

Output hidden; open in https://colab.research.google.com to view.

## **Boolean Retrieval Model**

In [None]:
#FUNCTIONS FOR 'AND', 'OR', 'NOT' QUERIES

def and_query(l1, l2, index=None):
  """Return the documents that contain BOTH terms, ranked by TF-IDF score.

  Args:
    l1: first query term.
    l2: second query term.
    index: mapping term -> list of (doc_id, weight) postings. Defaults to
      the notebook-level `inverted_index`.

  Returns:
    A list of (doc_id, score) tuples, where score is the sum of the two
    terms' weights in that document. Sorted by descending score, with ties
    broken by ascending doc_id. Empty if either term is not indexed.
  """
  if index is None:
    index = inverted_index

  # One {doc_id: weight} lookup per term. `.get` is used so that an unknown
  # term does not insert an empty entry when the index is a defaultdict.
  weights_per_term = [dict(index.get(term, ())) for term in (l1, l2)]

  # Documents that appear in both postings lists
  doc_ids = set(weights_per_term[0]) & set(weights_per_term[1])

  # Score each document with its own weight for each term
  matching_docs = [
      (doc_id, sum(weights[doc_id] for weights in weights_per_term))
      for doc_id in doc_ids
  ]

  # Sort the matching documents by their TF-IDF weights
  matching_docs.sort(key=lambda item: (-item[1], item[0]))
  return matching_docs
  
def or_query(l1,l2, index=None):
  """Return the documents that contain EITHER term, ranked by TF-IDF score.

  Args:
    l1: first query term.
    l2: second query term.
    index: mapping term -> list of (doc_id, weight) postings. Defaults to
      the notebook-level `inverted_index`.

  Returns:
    A list of (doc_id, score) tuples, where score is the sum of the weights
    of whichever query terms occur in that document. Sorted by descending
    score, with ties broken by ascending doc_id.
  """
  if index is None:
    index = inverted_index

  # One {doc_id: weight} lookup per term. `.get` is used so that an unknown
  # term does not insert an empty entry when the index is a defaultdict.
  weights_per_term = [dict(index.get(term, ())) for term in (l1, l2)]

  # Documents that appear in at least one postings list
  doc_ids = set(weights_per_term[0]) | set(weights_per_term[1])

  # A term that is absent from a document contributes nothing to its score
  matching_docs = [
      (doc_id, sum(weights.get(doc_id, 0.0) for weights in weights_per_term))
      for doc_id in doc_ids
  ]

  # Sort the matching documents by their TF-IDF weights
  matching_docs.sort(key=lambda item: (-item[1], item[0]))
  return matching_docs

def not_query(word, index=None, n_docs=None):
  """Return the ids of all documents that do NOT contain `word`.

  Args:
    word: the term to exclude.
    index: mapping term -> list of (doc_id, weight) postings. Defaults to
      the notebook-level `inverted_index`.
    n_docs: total number of documents; ids are taken to be 0..n_docs-1.
      Defaults to `len(df)`.

  Returns:
    A list of document ids in ascending order. A word that is not in the
    index excludes nothing, so every id is returned.
  """
  if index is None:
    index = inverted_index
  if n_docs is None:
    n_docs = len(df)

  # `.get` keeps an unknown word from being inserted into a defaultdict
  # index, which would make later `word in inverted_index` checks pass.
  exclude_doc_ids = {doc_id for doc_id, _ in index.get(word, ())}

  # Keep every document that is not in the excluded set
  return [doc_id for doc_id in range(n_docs) if doc_id not in exclude_doc_ids]


In [None]:
# PERFORMING THE BOOLEAN QUERY

# Supported operators, mapped to the function that evaluates them.
boolean_operations = {'and': and_query, 'or': or_query}

# The reviews were lower-cased before indexing, so normalise the query too.
query_parts = input("Enter the query : ").lower().split()
if len(query_parts) != 3:
  # Guard against a crash when unpacking anything other than three tokens.
  print("Query must have the form : <word> <and|or> <word>")
else:
  word1, operation, word2 = query_parts
  if operation not in boolean_operations:
    # Unknown operators are rejected rather than silently run as 'or'.
    print(f"Unsupported operation '{operation}' : use 'and' or 'or'")
  elif word1 in inverted_index and word2 in inverted_index:
    matching_docs = boolean_operations[operation](word1, word2)
    print(f"Number of documents retrieved : {len(matching_docs)}")
    for doc_id, tfidf in matching_docs:
      print(f"Document {doc_id}: {df['Review'][doc_id]} (TF-IDF: {tfidf})")
  else:
    print("Entered word doesnt exist")

Enter the query : good and phone
Number of documents retrieved : 31
Document 1538: these phones have excellent receiving range, working through almost 180 degrees. the sound quality, while not very good is acceptable for listening to television. battery life is very good. however they are hideously uncomfortable!!! there is no swivel whatsoever, neither vertical nor horizontal. if your head is not shaped like the sony dummy you are out of luck. to make things worse the ear pads (not cups) are foam plastic which is an excellent insulator making them quite hot.it is so bad that i am trying to work a kludge with an old set (40 years) of pioneer phones from my radio days. (TF-IDF: 7.73355159755713)
Document 147: i was so happy to find this! i read this book when i was a child in the 80s, re-read it so many times, i wore it out.it's the simple story of a young girl trying to cope with recently divorced parents and being a joint-custody kid.although 25 years later, it seems fairly innocent (

In [None]:
# Run a "NOT" query: list every document that lacks the entered word.

word = input("Enter a word : ")
result = not_query(word)
print(f"The number of documents which do not contain the term '{word}' : {len(result)}")
print("The document IDs are :  ", result, sep="\n")

Enter a word : love
The number of documents which do not contain the term 'love' : 8987
The document IDs are :  
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 75, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 116, 118, 119, 120, 122, 123, 124, 126, 127, 128, 130, 131, 132, 133, 134, 135, 137, 138, 139, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 200, 201, 202, 203, 204, 205, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 22

## **Wild card Query**

In [None]:
#FUNCTION FOR WILD-CARD QUERY

import re

def wild_card_query(word, index=None):
  """Return the documents containing any index term that matches a pattern.

  `word` is compiled as a regular expression and applied with `re.match`,
  so it is anchored at the start of the term only: 'poli.*' and 'poli'
  both match 'polish'.

  Args:
    word: regular-expression pattern for the terms to look up.
    index: mapping term -> list of (doc_id, weight) postings. Defaults to
      the notebook-level `inverted_index`.

  Returns:
    A list of (doc_id, weight) tuples with one entry per document, in
    first-seen order. When several matching terms occur in the same
    document their weights are summed, so the document is not reported
    (or counted) more than once.

  Raises:
    re.error: if `word` is not a valid regular expression.
  """
  if index is None:
    index = inverted_index

  # Compile a regular expression pattern from the wildcard query
  regex_pattern = re.compile(word)

  # Merge the postings of every matching term, keyed by document id.
  # dicts keep insertion order, so documents stay in first-seen order.
  merged_weights = {}
  for term, postings in index.items():
    if regex_pattern.match(term):
      for doc_id, weight in postings:
        merged_weights[doc_id] = merged_weights.get(doc_id, 0.0) + weight
  return list(merged_weights.items())


# Ask for a pattern, run the wildcard query and show every matching review.

word = input("Enter a wild card query (like 'poli.*') : ")
matching_docs = wild_card_query(word)
print(f"Number of documents retrieved : {len(matching_docs)}")
# Only the document id of each (doc_id, weight) posting is displayed.
for doc_id, _ in matching_docs:
  print(f"Document {doc_id}: {df['Review'][doc_id]}")

Enter a wild card query (like 'poli.*') : sugges.
Number of documents retrieved : 182
Document 13: the author approaches the subject matter in a sensitive and proactive manner. the book provides a lot of supportive information for women that may feel they are all alone or the only one that has or is going through this process. the author really addresses the stages and situations that are part of the process providing mulitple scenarios, suggestions, and rationale without judgment. an excellent book for all parties going through this life transition.
Document 466: package shipped and arrived in great time. solid steel box and tight crisp operation. end cap fits very snugly and seems to be water resistant (not water proof, but should keep water out in all but a sustained downpour). fits great on my 2005 chevy colorado. i was surprised how durable and rugged the end cap is. feels like it could take a 10 mph hit and be fine. one suggestion for the product would be to put a rubber gasket a

## **Positional Index**

In [None]:
#CREATING POSITIONAL INDEXING

pos_index = {}
file_map = {}
def generate_positional_index(data:list):
  """Build a positional index from tokenized documents.

  Args:
    data: list of documents, each a list of tokens. A document's position
      in `data` is used as its document id.

  Returns:
    The module-level `pos_index` dict, rebuilt for `data`, shaped as
    {term: {doc_id: [position, position, ...]}}. Every occurrence of a
    term in a document is recorded, in increasing position order.
  """
  # Start from an empty index so that calling the function again does not
  # mix in positions from a previous call.
  pos_index.clear()
  for lineno, line in enumerate(data):
    for pos, term in enumerate(line):
      # Create the term entry and the per-document list on first sight,
      # then append. Repeated occurrences extend the list instead of
      # overwriting the earlier positions.
      pos_index.setdefault(term, {}).setdefault(lineno, []).append(pos)
  return pos_index

# Build the positional index from the word tokens and preview the first
# 20 terms with their {doc_id: [positions]} postings.
positional_index = generate_positional_index(l1)
count = 0
for count, i in enumerate(positional_index, start=1):
  if count > 20:
    break
  print(i, positional_index[i])



love {0: [0], 12: [66], 16: [31], 19: [2], 22: [87], 44: [26], 52: [132], 74: [83], 76: [60], 77: [12], 90: [42], 91: [49], 115: [38], 117: [1], 125: [115], 129: [137], 140: [1], 179: [1], 199: [63], 206: [56], 238: [55], 256: [59], 257: [11], 258: [28], 289: [46], 306: [14], 314: [15], 316: [24], 326: [104], 334: [31], 353: [62], 383: [40], 414: [14], 418: [24], 463: [46], 491: [13], 510: [7], 513: [50], 518: [39], 534: [72], 598: [26], 635: [69], 644: [16], 646: [2], 657: [11], 684: [18], 695: [80], 699: [32], 710: [103], 721: [52], 723: [16], 743: [0], 767: [37], 770: [80], 776: [35], 787: [1], 790: [44], 817: [101], 827: [33], 836: [71], 848: [38], 853: [7], 855: [133], 865: [167], 876: [76], 881: [89], 883: [24], 897: [11], 904: [79], 910: [0], 935: [34], 936: [1], 954: [12], 980: [5], 992: [47], 1036: [89], 1045: [29], 1046: [69], 1059: [6], 1063: [24], 1065: [107], 1066: [143], 1069: [92], 1098: [29], 1100: [88], 1101: [123], 1106: [81], 1109: [106], 1113: [77], 1114: [60], 1129

## **Phrase Query**

In [None]:
from IPython.display import clear_output
import ipywidgets as widgets

#FUNCTION TO PERFORM PHRASE QUERY
def phrase_query(phrase, index=None):
  """Return the ids of the documents that contain `phrase` as consecutive terms.

  Args:
    phrase: whitespace-separated query terms.
    index: positional index shaped {term: {doc_id: [positions]}}. Defaults
      to the notebook-level `positional_index`.

  Returns:
    A list of document ids, each listed once, in the order the documents
    appear in the first term's postings. Empty when the phrase is empty or
    any of its terms is not in the index.
  """
  if index is None:
    index = positional_index

  # Split the query into terms
  query_terms = phrase.split()
  if not query_terms:
    return []

  # A phrase cannot match if any of its terms was never indexed
  term_postings = [index.get(term) for term in query_terms]
  if any(postings is None for postings in term_postings):
    return []

  result = []
  # Every candidate document must contain the first term; for each of its
  # positions, the i-th following term must sit exactly i positions later.
  for doc_id, positions in term_postings[0].items():
      for pos in positions:
          if all(pos + offset in postings.get(doc_id, ())
                 for offset, postings in enumerate(term_postings[1:], start=1)):
              result.append(doc_id)
              # One occurrence is enough; stop so the document is not
              # added again for later occurrences of the phrase.
              break
  return result

phrase = input("Enter a phrase :")
# The reviews were lower-cased before indexing, so search in lower case too.
result = phrase_query(phrase.lower())
# Print the result list

data = []
print(f"Top 10 Documents containing the phrase '{phrase}':")
# Slice instead of range(10) so fewer than 10 matches does not raise IndexError.
for doc_id in result[:10]:
  data.append(str(doc_id) + ":" + str(df['Title'][doc_id]))
if not data:
  print("No documents contain this phrase")
# One checkbox per listed document, for the user to mark the relevant ones.
checkboxes = [widgets.Checkbox(value=False, description=label) for label in data]
output = widgets.VBox(children=checkboxes)
display(output)


Enter a phrase :good product
Top 10 Documents containing the phrase 'good product':


VBox(children=(Checkbox(value=False, description='751:Monster trucks'), Checkbox(value=False, description='111…

In [None]:
# Count the documents the user ticked as relevant and print their reviews.
True_positive = 0
print("Here are the documents which you found relevant : ")
for i, checkbox in enumerate(checkboxes):
    if checkbox.value:
        True_positive += 1
        print(f"{result[i]} : {df['Review'][result[i]]}")

#Calculation of metrics for relevance
#Precision = relevant documents / documents shown. The number of checkboxes
#is used as the denominator so the value stays correct when fewer than 10
#documents were shown; with no documents shown the precision is reported as 0.
precision = True_positive/len(checkboxes) if checkboxes else 0.0
print(f"The precision of the retrieved documents : {precision}")

Here are the documents which you found relevant : 
751 : my nephew loves monster trucks and he is only two. they are a little smaller than i hoped, but still safe for him. hotwheeles are usually pretty durable, so i think this is a pretty good product.
1117 : i was just about to give these to my dogs and realized that they are made in china. unfortunately i realized too late and cannot return them. in the trash they go. they could be a good product, but i'm not going to take a chance. i've heard too many scary stories about treats and food from china. my pets are not worth taking that risk with.
1369 : this product could be great if you just want to mix powdered drinks. however, i bought it for ice cream shakes/malts. this is not a good product for that. it is not powerful enough for ice cream and the mixer continues to hit the side of the metal cup making loud noises. i will stick to my blender.
2273 : i should have read the label closer. seems like a good product but keep in mind you