#Importing Libraries

In [300]:
import re
import pandas as pd 
import numpy as np
import math
import re
from collections import defaultdict
from IPython.display import clear_output
import ipywidgets as widgets

In [301]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [302]:
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#Pre-processing

In [303]:
df=pd.read_csv("data.csv")
df.head()

Unnamed: 0,review,rating
0,It was nice produt. I like it's design a lot. ...,5
1,awesome sound....very pretty to see this nd th...,5
2,awesome sound quality. pros 7-8 hrs of battery...,4
3,I think it is such a good product not only as ...,5
4,awesome bass sound quality very good bettary l...,5


In [304]:
df.drop('rating', axis=1, inplace=True)
doc_ids = [i for i in range(df.shape[0])]
df['Doc_Id'] = doc_ids

In [305]:
doc_ids = [i for i in range(df.shape[0])]
df['Doc_Id'] = doc_ids
df.head()

Unnamed: 0,review,Doc_Id
0,It was nice produt. I like it's design a lot. ...,0
1,awesome sound....very pretty to see this nd th...,1
2,awesome sound quality. pros 7-8 hrs of battery...,2
3,I think it is such a good product not only as ...,3
4,awesome bass sound quality very good bettary l...,4


In [306]:
df["review"] = df["review"].str.lower()
df.head()    

Unnamed: 0,review,Doc_Id
0,it was nice produt. i like it's design a lot. ...,0
1,awesome sound....very pretty to see this nd th...,1
2,awesome sound quality. pros 7-8 hrs of battery...,2
3,i think it is such a good product not only as ...,3
4,awesome bass sound quality very good bettary l...,4


In [307]:
# tokenizing words
reviews=df['review']
tokens=list()
lines = []
for line in reviews:
  line2 = line
  if line[-9:] == "read more":
    line2 = line[:-9]
  lines.append(line2)
  words=word_tokenize(line2)
  tokens.append(words)
df['review'] = lines
df['tokens']=tokens
df.head()

Unnamed: 0,review,Doc_Id,tokens
0,it was nice produt. i like it's design a lot. ...,0,"[it, was, nice, produt, ., i, like, it, 's, de..."
1,awesome sound....very pretty to see this nd th...,1,"[awesome, sound, ...., very, pretty, to, see, ..."
2,awesome sound quality. pros 7-8 hrs of battery...,2,"[awesome, sound, quality, ., pros, 7-8, hrs, o..."
3,i think it is such a good product not only as ...,3,"[i, think, it, is, such, a, good, product, not..."
4,awesome bass sound quality very good bettary l...,4,"[awesome, bass, sound, quality, very, good, be..."


In [308]:
#removing stop words
stoplist= stopwords.words('english')
stoplist=set(stoplist)
tokens_stpw_removed=list()
for i in tokens:
  output = [w for w in i if not w in stoplist]
  tokens_stpw_removed.append(output)
df['tokens']=tokens_stpw_removed
df.head()

Unnamed: 0,review,Doc_Id,tokens
0,it was nice produt. i like it's design a lot. ...,0,"[nice, produt, ., like, 's, design, lot, ., 's..."
1,awesome sound....very pretty to see this nd th...,1,"[awesome, sound, ...., pretty, see, nd, sound,..."
2,awesome sound quality. pros 7-8 hrs of battery...,2,"[awesome, sound, quality, ., pros, 7-8, hrs, b..."
3,i think it is such a good product not only as ...,3,"[think, good, product, per, quality, also, des..."
4,awesome bass sound quality very good bettary l...,4,"[awesome, bass, sound, quality, good, bettary,..."


In [309]:
# lemmatizing 
lemmatized = []
wordnet_lemmatizer = WordNetLemmatizer()
for line in df['tokens']:
  lemma_word=[];
  for w in line:
      word1 = wordnet_lemmatizer.lemmatize(w, pos = "n")
      word2 = wordnet_lemmatizer.lemmatize(word1, pos = "v")
      word3 = wordnet_lemmatizer.lemmatize(word2, pos = ("a"))
      lemma_word.append(word1)
  lemma_word= [word for word in lemma_word if word.isalnum()]
  lemmatized.append(lemma_word)

In [310]:
df['tokens']=lemmatized
df.head()

Unnamed: 0,review,Doc_Id,tokens
0,it was nice produt. i like it's design a lot. ...,0,"[nice, produt, like, design, lot, easy, carry,..."
1,awesome sound....very pretty to see this nd th...,1,"[awesome, sound, pretty, see, nd, sound, quali..."
2,awesome sound quality. pros 7-8 hrs of battery...,2,"[awesome, sound, quality, pro, hr, battery, li..."
3,i think it is such a good product not only as ...,3,"[think, good, product, per, quality, also, des..."
4,awesome bass sound quality very good bettary l...,4,"[awesome, bass, sound, quality, good, bettary,..."


#Constructing Inverted index

In [311]:
doc_freq = defaultdict(int)
for doc in lemmatized:
    for term in set(doc):
        doc_freq[term] += 1

N = len(df)
inv_doc_freq = {term: math.log(N / dfa) for term, dfa in doc_freq.items()}


tfidfs = []
for doc in lemmatized:
    tfidf = defaultdict(float)
    for term in doc:
        tfidf[term] += 1
    tfidf = {term: tf * inv_doc_freq[term] for term, tf in tfidf.items()}
    tfidfs.append(tfidf)


inverted_index = defaultdict(list)
for i, doc in enumerate(tfidfs):
    for term, weight in doc.items():
        inverted_index[term].append((i, weight))

for term, postings in inverted_index.items():
    print(f"{term}: {postings}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
student: [(124, 7.1284959456800365), (574, 7.1284959456800365), (753, 7.1284959456800365), (1007, 7.1284959456800365), (1761, 14.256991891360073), (5680, 7.1284959456800365), (7055, 7.1284959456800365), (9610, 7.1284959456800365)]
giving: [(124, 5.097063623186561), (130, 5.097063623186561), (133, 5.097063623186561), (157, 5.097063623186561), (189, 5.097063623186561), (203, 5.097063623186561), (240, 5.097063623186561), (244, 5.097063623186561), (308, 5.097063623186561), (360, 5.097063623186561), (405, 5.097063623186561), (434, 5.097063623186561), (475, 5.097063623186561), (793, 5.097063623186561), (892, 5.097063623186561), (971, 5.097063623186561), (1058, 5.097063623186561), (1133, 5.097063623186561), (1191, 5.097063623186561), (1209, 5.097063623186561), (1247, 5.097063623186561), (1339, 5.097063623186561), (1453, 5.097063623186561), (1489, 5.097063623186561), (1522, 5.097063623186561), (1676, 10.194127246373123), (1723, 5

# Boolean Retrieval

In [312]:
def and_query(l1, l2):
  query_terms = [l1,l2] 
  postings_lists = [inverted_index[term] for term in query_terms]

  doc_ids = set(postings_lists[0][i][0] for i in range(len(postings_lists[0])))
  for postings in postings_lists[1:]:
      doc_ids &= set(postings[i][0] for i in range(len(postings)))

  matching_docs = []
  for doc_id in doc_ids:
      tfidf = sum(postings[i][1] for i, postings in enumerate(postings_lists)
                  if doc_id in set(p[0] for p in postings))
      matching_docs.append((doc_id, tfidf))

  matching_docs.sort(key=lambda x: x[1], reverse=True)
  return matching_docs

In [313]:
def or_query(l1,l2):
  query_terms = [l1,l2]
  postings_lists = [inverted_index[term] for term in query_terms]

  doc_ids = set(postings_lists[0][i][0] for i in range(len(postings_lists[0])))
  for postings in postings_lists[1:]:
      doc_ids |= set(postings[i][0] for i in range(len(postings)))

  matching_docs = []
  for doc_id in doc_ids:
      tfidf = sum(postings[i][1] for i, postings in enumerate(postings_lists)
                  if doc_id in set(p[0] for p in postings))
      matching_docs.append((doc_id, tfidf))

  matching_docs.sort(key=lambda x: x[1], reverse=True)
  return matching_docs

In [314]:
def not_query(word):
  exclude_doc_ids = set(inverted_index[word][i][0] for i in range(len(inverted_index[word])))
  all_ids = [x for x in range(0,len(df))]
  result = []
  for doc_ids in all_ids :
      if doc_ids not in exclude_doc_ids:
        result.append(doc_ids)

  return result

And and Or query:<br>
Query format: <br>
word1 and word2 <br>
word1 or word2

In [316]:
word1,operation,word2 = input("Enter the query : ").split()
if word1 in inverted_index and word2 in inverted_index:
  if operation=='and':
    matching_docs = and_query(word1,word2)
    print(f"Number of documents retrieved : {len(matching_docs)}")
    for doc_id, tfidf in matching_docs:
      print(f"Docid {doc_id}: {df['review'][doc_id]} (TF-IDF: {tfidf})")
  else:
    matching_docs = or_query(word1,word2)
    print(f"Number of documents retrieved : {len(matching_docs)}")
    for doc_id, tfidf in matching_docs:
      print(f"Docid: {doc_id}: {df['review'][doc_id]} (TF-IDF: {tfidf})")
else:
  print("Entered word doesnt exist")

Enter the query : good and product
Number of documents retrieved : 1489
Docid 1: awesome sound....very pretty to see this nd the sound quality was too good i wish to take this product loved this product 😍😍😍 (TF-IDF: 3.1622200893340713)
Docid 8193: good product ☺️ (TF-IDF: 3.1622200893340713)
Docid 3: i think it is such a good product not only as per the quality but also the design is quite good . i m using this product from january ... in this pandamic situation it has became the most useful and helpful . overall the bass and the sound quality is pretty good and another thing that will give you such a sigh of relief that it will provide a wire that will help you in case of lacking charges. (TF-IDF: 3.1622200893340713)
Docid 4: awesome bass sound quality very good bettary long life  and i have a purchase rs.999  only really grateful product don't forget to like (TF-IDF: 3.1622200893340713)
Docid 8196: almost one year used still working fine good product (TF-IDF: 3.1622200893340713)
Doci

Not query: Just enter the word

In [317]:
word = input("Enter the word : ")
result = not_query(word)
print(f"The number of documents which do not contain the term '{word}' : {len(result)}")
print("The document IDs are :  ")
print(result)

Enter the word : good
The number of documents which do not contain the term 'good' : 6140
The document IDs are :  
[0, 5, 10, 11, 12, 14, 21, 24, 25, 26, 28, 29, 30, 31, 32, 35, 45, 46, 48, 50, 52, 57, 60, 61, 66, 69, 72, 82, 84, 86, 88, 93, 95, 96, 98, 99, 106, 107, 108, 111, 117, 119, 120, 123, 126, 129, 136, 137, 140, 144, 149, 153, 156, 158, 160, 161, 162, 163, 166, 168, 169, 172, 173, 176, 177, 188, 189, 198, 199, 201, 202, 203, 204, 205, 206, 208, 211, 212, 214, 227, 228, 229, 234, 237, 239, 241, 245, 246, 247, 248, 250, 252, 253, 254, 255, 258, 262, 264, 269, 270, 271, 274, 275, 276, 278, 279, 293, 295, 296, 297, 299, 302, 303, 306, 307, 308, 310, 311, 312, 315, 319, 323, 324, 326, 327, 328, 330, 332, 335, 342, 345, 346, 350, 351, 353, 354, 362, 363, 367, 371, 372, 373, 381, 382, 383, 384, 399, 401, 403, 404, 407, 409, 411, 413, 415, 417, 421, 423, 424, 429, 430, 432, 439, 443, 447, 450, 453, 454, 455, 458, 459, 460, 461, 464, 466, 473, 476, 478, 481, 482, 485, 486, 487, 490, 49

## **Wild card Query**
format:
pattern*

In [318]:
def wild_card_query(word):
  wildcard_query = word
  regex_pattern = re.compile(wildcard_query)
  matching_terms = [term for term in inverted_index.keys() if regex_pattern.match(term)]
  postings_lists = [inverted_index[term] for term in matching_terms]
  merged_postings = []
  for postings in postings_lists:
      merged_postings += postings
  return merged_postings

In [319]:
word = input("Enter a wild card query : ")
matching_docs = wild_card_query(word)
print(f"Number of documents retrieved : {len(matching_docs)}")
for doc_id, tfidf in matching_docs:
  print(f"Docid {doc_id}: {df['review'][doc_id]}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Docid 2451: not a good product.
Docid 2453: very good
Docid 2456: not a good product.
Docid 2458: very good
Docid 2462: good quality, battery backup,sound also very good
Docid 2465: very good product
Docid 2467: i am using it from 2 mouthsand it is too good
Docid 2468: awesome experience while using it..bit low voice is audible using mic and over all headset is good i am using arround 8months no issues with it..
Docid 2472: it interferes with wifi  basically, if your mobile is connected with wifi and also with the headphone via bluetooth then you might not get the proper wifi speed because both use similar microwave frequency so  we get slow internet speed and if you want to watch youtube video using wifi then it might buffer on higher resolutionsound wise its good doesn't have much noise cancellation in it but it has a good battery backup
Docid 2475: very good protect
Docid 2477: very good product in market ..sound quali

## **Positional Index**

In [320]:
def generate_positional_index(data:list):
  fileno=0
  lineno=-1
  for line in data:
    lineno+=1;
    for pos, term in enumerate(line):
      if term in pos_index:     
          if fileno in pos_index[term].keys():
            pos_index[term][0][lineno].append(pos)                
          else:
            pos_index[term][lineno] = [pos]
      else:
          pos_index[term] = {}  
          pos_index[term][lineno] = [pos]
      fileno += 1
  return pos_index

In [321]:
pos_index = {}
file_map = {}
positional_index=generate_positional_index(tokens)
count=0
for i in positional_index:
  count=count+1;
  if count<=20:
    print(i,positional_index[i])
  else:
    break;

it {0: [13], 2: [55], 3: [66], 5: [64], 7: [60], 8: [44], 11: [44], 13: [16], 14: [94], 15: [96], 16: [74], 19: [61], 21: [16], 22: [25], 23: [46], 24: [5], 25: [28], 28: [38], 29: [9], 30: [13], 31: [43], 32: [15], 34: [3], 35: [18], 37: [21], 39: [31], 40: [51], 41: [31], 42: [15], 43: [43], 45: [33], 46: [63], 47: [35], 48: [42], 50: [59], 51: [27], 52: [25], 53: [36], 54: [31], 55: [41], 56: [34], 58: [65], 59: [50], 60: [20], 61: [45], 62: [64], 63: [70], 64: [24], 65: [31], 68: [27], 69: [63], 71: [90], 73: [50], 75: [30], 76: [37], 77: [21], 78: [47], 79: [19], 80: [64], 82: [101], 84: [24], 85: [28], 86: [45], 88: [1], 89: [63], 90: [0], 91: [91], 92: [73], 94: [16], 95: [25], 96: [0], 97: [46], 99: [18], 100: [25], 101: [77], 102: [39], 103: [90], 104: [85], 106: [36], 108: [9], 109: [13], 110: [49], 111: [19], 115: [30], 116: [38], 118: [40], 119: [8], 121: [42], 125: [2], 127: [71], 129: [42], 131: [13], 132: [77], 133: [25], 135: [12], 136: [105], 137: [8], 139: [17], 140: 

#Precision and recall

In [322]:
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [323]:
query = "good product"
query_embedding = model.encode(query)
query_embedding = [query_embedding]

In [None]:
# reviews = df['review']
# reviews_embeddings = model.encode(reviews)

In [None]:
# reviews_embeddings_np = np.array(reviews_embeddings)
# np.save("embeddings.npy",reviews_embeddings_np)

In [324]:
reviews_embeddings = np.load("embeddings.npy")

In [325]:
reviews_embeddings.shape

(9976, 768)

In [337]:
sim = cosine_similarity(
    query_embedding,
    reviews_embeddings
)
sim = sim[0]

In [327]:
relevant= []
non_relevant = []
for i in range(len(sim)):
  if sim[i] > 0.7:
    relevant.append(i)
  else:
    non_relevant.append(i)

## **Phrase Query**

In [328]:
def phrase_query(phrase):
  query = phrase
  query_terms = query.split()
  result = []
  for doc_id in positional_index[query_terms[0]]:
      positions = positional_index[query_terms[0]][doc_id]
      for pos in positions:
          match = True
          for i in range(1, len(query_terms)):
              if doc_id not in positional_index[query_terms[i]]:
                  match = False
                  break
              if pos+i not in positional_index[query_terms[i]][doc_id]:
                  match = False
                  break
          if match:
              result.append(doc_id)
  return result

For the phrase query, enter the same query for which embedding was created previously<br>

In [341]:
phrase = input("Enter a phrase :")
result = phrase_query(phrase)
r = 10 if len(result) > 10 else len(result)
data = []

result_rel = [[doc_id,sim[doc_id]] for doc_id in result]
result_rel.sort(key=lambda x:x[1],reverse=True)
print("Select relevant docs")
print(f"Top 10 Documents containing the phrase '{phrase}':")
for i in range(r):
  print(f"{result_rel[i][0]}: {df['review'][result_rel[i][0]]}")

Enter a phrase :good product
Select relevant docs
Top 10 Documents containing the phrase 'good product':
7373: good product.   very nice
7491: this is a very good product this
6586: good product very nice
8378: this is a good product
3791: it's a good product
5659: it's a very good product
6819: really good product..👍
4989: very good product and durable
7214: it is a good product
5458: very very good product


# Precision and Recall

In [330]:
precision = len(set(relevant) & set(result)) / len(result)
precision

0.712784588441331

In [331]:
recall = len(set(relevant) & set(result)) / len(relevant)
recall

0.13154492566257273

In [332]:
print("Select relevant docs")
for i in range(r):
  temp = str(result_rel[i][0])+ ":" +str(df['review'][result_rel[i][0]])
  data.append(temp) 
checkboxes = [widgets.Checkbox(value=False, description=label) for label in data]
output = widgets.VBox(children=checkboxes)
display(output)

Select relevant docs


VBox(children=(Checkbox(value=False, description='7373:good product.   very nice'), Checkbox(value=False, desc…

In [345]:
print("Here are the documents which you found relevant : ")
relevant_feed = set()
for i in range(0, len(checkboxes)):
    if checkboxes[i].value == True:
        print(f"{result_rel[i][0]} : {df['review'][result_rel[i][0]]}")
        relevant_feed.add(result_rel[i][0])

Here are the documents which you found relevant : 
7373 : good product.   very nice
7491 : this is a very good product this
6586 : good product very nice
3791 : it's a good product


# Re ranking retrieved results based on relevance feedback.

In [346]:
re_ranked = [doc_id for doc_id in relevant_feed]
new_similarities = []
if len(relevant_feed) == 0:
  print("please select relevant docs and run this cell again")
else:
  for doc_id,sim in result_rel:
    # print(doc_id)
    if doc_id not in relevant_feed:
      new_sim = 0
      for rel in relevant_feed:
        new_sim += cosine_similarity(
            [reviews_embeddings[rel]],
            [reviews_embeddings[doc_id]]
        )
      new_sim = new_sim[0]
      new_sim /= len(relevant_feed)
      new_similarities.append([doc_id,new_sim])
  new_similarities.sort(reverse=True,key = lambda x:x[1])
  print("Top 10 Re ranked docs:")
  for doc_id,new_sim in new_similarities:
    re_ranked.append(doc_id)
  for i in range(r):
    print(f"{re_ranked[i]} : {df['review'][re_ranked[i]]}")

Top 10 Re ranked docs:
6586 : good product very nice
7491 : this is a very good product this
7373 : good product.   very nice
3791 : it's a good product
4576 : it's very good product
8378 : this is a good product
5659 : it's a very good product
5589 : it is very good product
7812 : it is very good product
7817 : it is very good product
