**Ryan Reed**

**COS 470 : Introduction to Information Retrieval**

**Assignment 5 - Parts 1 & 2**

In [1]:
# importing
from gensim.models import Word2Vec
import nltk
nltk.download("punkt")
from nltk.tokenize import word_tokenize
import re
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
!pip install python-terrier
!pip install --upgrade git+https://github.com/terrier-org/pyterrier.git
import pyterrier as pt
pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-terrier
  Downloading python-terrier-0.9.1.tar.gz (102 kB)
[K     |████████████████████████████████| 102 kB 7.7 MB/s 
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Collecting pyjnius>=1.4.2
  Downloading pyjnius-1.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 48.9 MB/s 
[?25hCollecting matchpy
  Downloading matchpy-0.5.5-py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 9.1 MB/s 
[?25hCollecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
Collecting deprecated
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting chest
  Downloading chest-0.2.3.tar.gz (9.6 kB)
Collecting nptyping==1.4.4
  Downloading nptyping-1.4.4-py3-none-any.whl (31 kB)
Collecting ir_datasets>=0.3.2
  Downloading ir_datasets-0.5.4-py3-none-any.whl (311 kB)
[K     |█

PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



In [2]:
import pandas as pd
from post_parser_record import PostParserRecord
post_reader = PostParserRecord("Posts_Coffee.xml")

# Part 1 - Word2Vec

In [3]:
# first we get the collection of all question titles, and their ids
question_titles = list()
question_ids = list()

for post in post_reader.map_questions:
  question_titles.append(post_reader.map_questions[post].title)
  question_ids.append(post_reader.map_questions[post].post_id)

# having obtained a list of question titles, we must now tokenize them into a
# list of lists of tokens
training_data = list()

for title in question_titles:
  # we quickly remove punctuation & convert to lowercase
  title = re.sub(r"[(,,.;@/>#//\\\/^`'’/_//=\"-:?*\[\]<!&$)]+\ *", " ", title.lower())
  title_tokens = word_tokenize(title)
  training_data.append(title_tokens)

# then we print the training data to ensure it's properly formatted
print(len(training_data))
print(training_data)

1370
[['how', 'should', 'i', 'store', 'whole', 'bean', 'coffee'], ['how', 'fine', 'should', 'i', 'grind', 'coffee', 'for', 'drip', 'pour', 'over', 'coffee'], ['does', 'the', 'hardness', 'of', 'water', 'matter', 'when', 'making', 'coffee'], ['what', 's', 'the', 'theory', 'behind', 'using', 'thin', 'spouted', 'kettles', 'when', 'making', 'drip', 'pour', 'over', 'coffee'], ['how', 'important', 'is', 'tamping', 'coffee', 'for', 'an', 'espresso', 'machine'], ['why', 'is', 'it', 'not', 'recommended', 'to', 'drink', 'coffee', 'immediately', 'after', 'it', 's', 'roasted'], ['what', 'factors', 'affect', 'espresso', 'crema'], ['why', 'do', 'barista', 's', 'keep', 'activating', 'the', 'doser', 'whilst', 'grinding', 'coffee'], ['how', 'should', 'i', 'store', 'ground', 'coffee'], ['how', 'long', 'can', 'i', 'store', 'ground', 'coffee', 'in', 'the', 'fridge'], ['what', 'is', 'the', 'process', 'to', 'reduce', 'bitterness', 'in', 'coffee'], ['how', 'does', 'the', 'temperature', 'of', 'the', 'water', '

In [4]:
# initializing Word2Vec model & training for 10 epochs
# the iter parameter is the number of epochs, apparently iter is deprecated
model = Word2Vec(sentences=training_data, window=5, min_count=1, workers=4, iter=10)



In [5]:
# now that we have the trained model, we need to obtain the vector representations for each title
# we need to condense the vectors into a vector representation list for all titles
# TODO, we need to get vector representations for all query titles

vector_representations = list()
for title in training_data: # we loop over each title
  # we want to hold the vectors for each token in a title
  vec_list = list()
  for token in title:  # we loop over each token
    # for each token we obtain the representation from the model
    vec_list.append(model.wv[token])
  # now we have a list of vectors for each token, to obtain a vector
  # representation of the question title, we find the average
  vec_rep = sum(vec_list) / len(vec_list)
  vector_representations.append(vec_rep)

In [6]:
# now we transform the query into a vector representation
query = "When does Coffee go off?"
query = re.sub(r"[(,,.;@/>#//\\\/^`'’/_//=\"-:?*\[\]<!&$)]+\ *", " ", query.lower())
query_data = word_tokenize(query)
query_vec_list = list()

for token in query_data:
  query_vec_list.append(model.wv[token])

query_vec_rep = sum(query_vec_list) / len(query_vec_list)
print(query_vec_rep)

[ 0.05653774  0.08089882 -0.02453135  0.20813361  0.5371747  -0.04375621
 -0.24423487  0.08312981 -0.6399589   0.07112528  0.06263952  0.08264133
 -0.06699814 -0.22661202  0.02633559 -0.11412251  0.05590806  0.00723216
  0.04997935  0.24299657  0.11772329  0.10264166  0.43222466  0.1555538
 -0.08532211  0.08987075 -0.16110787 -0.32384676  0.1400699   0.4014979
  0.24360482 -0.33742577  0.11683021 -0.24929604 -0.14297102  0.30791444
  0.22923169 -0.1041512  -0.0443842  -0.0775759   0.1978387   0.1815903
 -0.2760128  -0.41847888  0.14465392  0.38675123 -0.13620701  0.15677434
 -0.15515974 -0.23270996 -0.09903812 -0.47493753 -0.11990585  0.26034647
  0.09888896 -0.0352361   0.31940204  0.06623027  0.16808088 -0.1835509
  0.04023328  0.3597768   0.3428009  -0.24429314 -0.05647781 -0.24911396
 -0.17605269  0.05458714  0.33050174  0.3112673   0.0192652  -0.02218582
 -0.01432118 -0.03554038  0.2354103  -0.3116989   0.16538739 -0.19176686
 -0.38114315 -0.2547652  -0.05961838  0.0077024   0.038

In [7]:
#TODO find the 5 most similiar question titles to the query
cosine_sims = list()

for vector_representation in vector_representations:
  sim = cosine_similarity([query_vec_rep], [vector_representation])
  cosine_sims.append(sim)

In [8]:
title_sims = {question_ids[i]: cosine_sims[i][0][0] for i in range(len(question_titles))}
sorted_sims_titles = dict(sorted(title_sims.items(), key=lambda item: item[1], reverse=True))
print("Posts with the highest cosine similiarity: ", {id:sim for (id, sim) in [question for question in sorted_sims_titles.items()][:5]})

Posts with the highest cosine similiarity:  {123: 1.0, 4943: 0.999987, 5670: 0.9999868, 3133: 0.99998534, 5861: 0.99998486}


# Part 2 - Classification

In [9]:
# first we obtain a collection of the text of all question posts
q_ids = list()
q_text = list()
q_tags = list()

for post in post_reader.map_questions:
  q_text.append(post_reader.map_questions[post].body.lower().strip())
  q_ids.append(str(post_reader.map_questions[post].post_id))
  q_tags.append(post_reader.map_questions[post].tags)

df = pd.DataFrame({'docno': q_ids, 'text': q_text})

!rm -rf ./pd_index
pd_indexer = pt.DFIndexer("./pd_index")
indexref = pd_indexer.index(df["text"], df)

# we use the pyterrier implementation of bm25
def bm25(query):
  result = pt.BatchRetrieve(indexref, num_results =1000, wmodel="BM25").search(query)
  return result

In [10]:
# creating a list with the requested queries
queries = ["how to make espresso",
           "moka pot",
           "coffee caffeine"]
# creating a list with k for top-k
k_list = [1, 5, 10]

# creating a list to hold the bm25 search results for each query
# essentially q_results is a list of dataframes for each query
q_results = list()
for query in queries:
  q_results.append(bm25(query))

In [11]:
# function to get the top k records
def get_k(k, df):
  return df.nlargest(k, 'score')

# k results for query 1
q1_topk = list()
for k in k_list:
  q1_topk.append(get_k(k, q_results[0]))

# k results for query 2
q2_topk = list()
for k in k_list:
  q2_topk.append(get_k(k, q_results[1]))

# k results for query 3
q3_topk = list()
for k in k_list:
  q3_topk.append(get_k(k, q_results[2]))

# index should be the same for both ids and tags

# getting all of the docnos (ids) of the topk results
def getids(data):
  id_list = list()
  for x in data:
    id_list.append(list(x.docno))
  return id_list

In [12]:
# now that we have a list of lists of the docnos, we can get
# the tags from these
def gettags(data):
  tags_list = list()
  for idlist in data:
    tags = list()
    for id in idlist:
      id_ind = q_ids.index(id)
      tags.append(q_tags[id_ind])
    tags_list.append(tags)
  return tags_list

# now a function to process this nested list of tags, to find the most common
def top_tag(data):
  overall_tag_occ = list()
  for x in data:
    for taglist in x:
      tag_occ = dict()
      for tag in taglist:
        if tag not in tag_occ:
          tag_occ[tag] = 0
        tag_occ[tag] += 1
      tag_occ = dict(sorted(tag_occ.items(), key=lambda item: item[1], reverse=True))
    overall_tag_occ.append(list(tag_occ)[0])
  return overall_tag_occ

In [26]:
# query 1
q1_x = getids(q1_topk) # we get the ids
q1_y = gettags(q1_x) # we get the tags

# query 2
q2_x = getids(q2_topk) # we get the ids
q2_y = gettags(q2_x) # we get the tags

# query 3
q3_x = getids(q3_topk) # we get the ids
q3_y = gettags(q3_x) # we get the tags

# compiling all class results
top_tag_results = [top_tag(q1_y), top_tag(q2_y), top_tag(q3_y)]

# formatting a printed representation of results
for x in range(0,3):
  for y in range(0,3):
    print('Query:', queries[x], '| k:', k_list[y], '| Topical Class:', top_tag_results[x][y])

Query: how to make espresso | k: 1 | Topical Class: espresso
Query: how to make espresso | k: 5 | Topical Class: espresso
Query: how to make espresso | k: 10 | Topical Class: espresso
Query: moka pot | k: 1 | Topical Class: equipment
Query: moka pot | k: 5 | Topical Class: moka
Query: moka pot | k: 10 | Topical Class: moka
Query: coffee caffeine | k: 1 | Topical Class: health
Query: coffee caffeine | k: 5 | Topical Class: nespresso
Query: coffee caffeine | k: 10 | Topical Class: ground-coffee
