# Importation, Queries, and Text Processor

In [233]:
# importing & initializing necessities
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
import json
from collections import OrderedDict
import math
import scipy
from sklearn.metrics import ndcg_score, dcg_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [234]:
from post_parser_record import PostParserRecord
post_reader = PostParserRecord("Posts_Coffee.xml")
stop_words = set(stopwords.words('english'))

In [235]:
# this is a utility function designed to deal with raw text.
def text_processor (input_text):
  # remove punctuation & convert text to lowercase
  input_text = re.sub(r"[(,,.;@/>#//\\\/^`'’/_//=\"-:?*\[\]<!&$)]+\ *", " ", input_text.lower())
  # text tokenized into a list
  list_of_words = word_tokenize(input_text)
  # list comprehension to filter out any stopwords
  list_of_words_res = [x for x in list_of_words if (x not in stop_words) and (x != 'p')]
  return list_of_words_res

In [236]:
# queryset
queries = {
    "espresso",
    "turkish coffee",
    "making a decaffeinated coffee",
    "can I use the same coffee grounds twice?",
}


# TF-IDF Model

In [237]:
# TF-IDF
# 1) create TF such that tf_dict -> Term : (Doc : TF)
# 2) create IDF such that idf_dict -> Term : IDF
# 3) Handle query, return tf-idf score

In [238]:
# this function builds the term frequency dictionary.
def build_TF():
    tf_dict = dict() # this will hold the result
    for post in post_reader.map_questions: # going over questions
      question_text = post_reader.map_questions[post].title + " " + post_reader.map_questions[post].body # joining question text
      question_text = text_processor(question_text) # processing the question text
      currentPost_ID = post_reader.map_questions[post].post_id # getting the document id
      for term in question_text:
        if term not in tf_dict:
          tf_dict[term] = dict()
        if currentPost_ID not in tf_dict[term]:
          tf_dict[term][currentPost_ID] = 1/len(question_text)
        tf_dict[term][currentPost_ID] += 1/len(question_text)

    for post in post_reader.map_just_answers: # going over answers
      answer_text = post_reader.map_just_answers[post].body # joining question text
      answer_text = text_processor(answer_text) # processing the question text
      currentPost_ID = post_reader.map_just_answers[post].post_id # getting the document id
      for term in answer_text:
        if term not in tf_dict:
          tf_dict[term] = dict()
        if currentPost_ID not in tf_dict[term]:
          tf_dict[term][currentPost_ID] = 1/len(answer_text)
        tf_dict[term][currentPost_ID] += 1/len(answer_text)
    return OrderedDict(sorted(tf_dict.items()))

output_file = open("tf.json", "w")
json.dump(build_TF(), output_file)
output_file.close()

In [239]:
# these are helper functions to gather the total number of posts
def total_posts():
  total = 0
  for post in post_reader.map_questions:
    total += 1
  for post in post_reader.map_just_answers:
    total += 1
  return total
N = total_posts()

# we also load the tf dictionary back up for a convient way to find df
with open('tf.json') as tf_file:
  tf_data = json.load(tf_file)


In [240]:
# this function builds the inverse document frequency dictionary.
def inverse_df():
  idf_dict = dict()
  for term in tf_data:
    if term not in idf_dict:
      idf_dict[term] = 0
    idf_dict[term] = math.log2(N / len(tf_data[term].keys()))
  return OrderedDict(sorted(idf_dict.items()))

# writing out file
output_file = open("idf.json", "w")
json.dump(inverse_df(), output_file)
output_file.close()

# reading in file
with open('idf.json') as idf_file:
  idf_data = json.load(idf_file)

In [241]:
# this function computes a TF-IDF dictionary
def build_tf_idf():
  tf_idf_dict = dict()
  for term in tf_data:
    for doc in tf_data[term]:
      if term not in tf_idf_dict:
        tf_idf_dict[term] = dict()
      temp_dict = {doc: tf_data[term][doc]*idf_data[term]}
      tf_idf_dict[term].update(temp_dict)
  return tf_idf_dict

# writing out file
output_file = open("tf_idf.json", "w")
json.dump(build_tf_idf(), output_file)
output_file.close()

# reading in file
with open('tf_idf.json') as tf_idf_file:
  tf_idf_data = json.load(tf_idf_file)

In [242]:
# query handler
def query_handler_tfidf(query):
  query_terms = text_processor(query)
  list_of_docs = dict()
  for term in query_terms:
    if term in tf_idf_data:
      doc_ids = tf_idf_data[term]
      for doc in doc_ids:
        if doc not in list_of_docs:
          list_of_docs.update(doc_ids)
        else:
          list_of_docs[doc] += doc_ids[doc]
  return dict((list((dict(sorted(list_of_docs.items(), key=lambda element: element[1], reverse=True))).items())[:5]))

In [243]:
query_handler_tfidf("espresso")

{'3904': 1.4571853159493842,
 '4404': 1.311466784354446,
 '3168': 1.0928889869620382,
 '2867': 0.9714568772995894,
 '93': 0.9367619888246042}

# Vector Space Model

In [244]:
# getting all terms in collection, and all unique terms
def get_terms():
  terms = list()
  for post in post_reader.map_questions: # going over questions
      question_text = post_reader.map_questions[post].title + " " + post_reader.map_questions[post].body # joining question text
      question_text = text_processor(question_text) # processing the question text
      for term in question_text:
        terms.append(term)

  for post in post_reader.map_just_answers: # going over answers
    answer_text = post_reader.map_just_answers[post].body # joining question text
    answer_text = text_processor(answer_text) # processing the question text
    for term in answer_text:
      terms.append(term)
      
  return terms

terms = get_terms()
num_terms = len(terms)
num_uterms = len(set(terms))

print("Total number of terms:", num_terms, "\nTotal number of unique terms:", num_uterms)

# creating a dictionary for terms
term_dict = dict()
index = 0
for term in set(terms):
  term_dict[term] = index
  index += 1
# this function houses the bulk of the vsm algorithm
def vsm():
  doc_vectors = dict()
  for post in post_reader.map_questions: # going over the question posts
    vector = np.zeros(num_uterms)
    question_text = post_reader.map_questions[post].title + " " + post_reader.map_questions[post].body # joining question text
    question_text = text_processor(question_text) # processing the question text
    currentPost_ID = post_reader.map_questions[post].post_id # getting the document id
    for term in question_text:
      if term in term_dict:
        vector[term_dict[term]] += 1
    doc_vectors[currentPost_ID] = vector

  for post in post_reader.map_just_answers: # going over the answer posts
    vector = np.zeros(num_uterms)
    answer_text = post_reader.map_just_answers[post].body # getting answer text
    answer_text = text_processor(answer_text) # processing the answer text
    currentPost_ID = post_reader.map_just_answers[post].post_id # getting the document id
    for term in answer_text:
      if term in term_dict:
        vector[term_dict[term]] += 1
    doc_vectors[currentPost_ID] = vector
  return doc_vectors

vectors_dict = vsm()

Total number of terms: 344882 
Total number of unique terms: 18372


In [245]:
# query handling for vsm
def query_handler_vsm(query):
  query_terms = text_processor(query)
  q_vector = np.zeros(num_uterms)
  for term in query_terms:
    if term in term_dict:
      q_vector[term_dict[term]] += 1

  results_dict = dict()
  for doc in vectors_dict:
    results_dict[doc] = 1 - scipy.spatial.distance.cosine(q_vector, vectors_dict[doc])
  return dict((list(dict(sorted(results_dict.items(), key=lambda element: element[1], reverse=True)).items())[:5]))

In [246]:
print(query_handler_vsm("espresso"))

{2766: 0.6644105970267493, 4175: 0.6401843996644798, 3168: 0.6396021490668313, 26: 0.629940788348712, 5528: 0.6155870112510924}


# BM25 Model

In [247]:
#  average length of documents
avg_len = len(terms) / N

# this function computes a dictionary of the length of each document
def len_collection():
  len_dict = dict()
  for post in post_reader.map_questions: # going over questions
      question_text = post_reader.map_questions[post].title + " " + post_reader.map_questions[post].body # joining question text
      question_text = text_processor(question_text) # processing the question text
      currentPost_ID = post_reader.map_questions[post].post_id # getting the document id
      if currentPost_ID not in len_dict:
        len_dict[currentPost_ID] = 0
      len_dict[currentPost_ID] = len(question_text)
  for post in post_reader.map_just_answers: # going over answers
      answer_text = post_reader.map_just_answers[post].body # joining question text
      answer_text = text_processor(answer_text) # processing the question text
      currentPost_ID = post_reader.map_just_answers[post].post_id # getting the document id
      if currentPost_ID not in len_dict:
        len_dict[currentPost_ID] = 0
      len_dict[currentPost_ID] = len(answer_text)
  return len_dict
len_dict = len_collection()

In [248]:
# function to perform BM25
k1 = 1.2
b = 0.75

def query_handler_bm25(query):
  query_terms = text_processor(query)
  list_of_docs = dict()
  for term in query_terms:
    if term in tf_data:
      for doc in tf_data[term]:
        if doc not in list_of_docs:
          list_of_docs[doc] = 0
        idoc = int(doc)
        len_eq = len_dict[idoc] / avg_len
        top_eq = (k1 + 1) * tf_data[term][doc]
        bot_eq = k1 * ((1-b) + b * (len_eq)) + tf_data[term][doc]
        list_of_docs[doc] += idf_data[term] * (top_eq / bot_eq)
  return dict((list((dict(sorted(list_of_docs.items(), key=lambda element: element[1], reverse=True))).items())[:5]))


In [249]:
query_handler_bm25("espresso")

{'3904': 2.2994190498647242,
 '4404': 2.042602576406589,
 '3168': 1.6696453819078618,
 '93': 1.5540809001677207,
 '2867': 1.4977023874695319}

# Model Discussion (Question 2)

In [250]:
# TF-IDF results
print("TF-IDF Results:")
for query in queries:
  print("\nQuery :", query)
  results = query_handler_tfidf(query)
  i = 1
  for result in results:
    print(i, ":", result, "-", results[result])
    i += 1

TF-IDF Results:

Query : can I use the same coffee grounds twice?
1 : 3966 - 1.5942030004901218
2 : 3818 - 1.5158453532697278
3 : 2683 - 1.5004263534024675
4 : 4703 - 1.2632044610581064
5 : 3568 - 1.0716828097153608

Query : making a decaffeinated coffee
1 : 3225 - 1.726014751206339
2 : 97 - 0.9643953105943388
3 : 2867 - 0.8036627588286156
4 : 3321 - 0.6289534634310905
5 : 1656 - 0.6199684139535035

Query : turkish coffee
1 : 4486 - 1.7029595989046178
2 : 3369 - 1.4596796562039582
3 : 5690 - 0.69666529046098
4 : 2879 - 0.6192580359653156
5 : 4551 - 0.4541225597078981

Query : espresso
1 : 3904 - 1.4571853159493842
2 : 4404 - 1.311466784354446
3 : 3168 - 1.0928889869620382
4 : 2867 - 0.9714568772995894
5 : 93 - 0.9367619888246042


In [251]:
# VSM results
print("VSM Results:")
for query in queries:
  print("\nQuery :", query)
  results = query_handler_vsm(query)
  i = 1
  for result in results:
    print(i, ":", result, "-", results[result])
    i += 1

VSM Results:

Query : can I use the same coffee grounds twice?
1 : 2683 - 0.6565321642986127
2 : 1749 - 0.6002450479987809
3 : 3258 - 0.5454545454545455
4 : 5121 - 0.5388159060803248
5 : 2609 - 0.5151021148075838

Query : making a decaffeinated coffee
1 : 120 - 0.560448538317805
2 : 4193 - 0.501280411827603
3 : 2158 - 0.5003702332976757
4 : 3293 - 0.5
5 : 204 - 0.492365963917331

Query : turkish coffee
1 : 5094 - 0.7715167498104596
2 : 2522 - 0.7252406676228423
3 : 3074 - 0.7071067811865476
4 : 2379 - 0.6832312780114155
5 : 45 - 0.649519052838329

Query : espresso
1 : 2766 - 0.6644105970267493
2 : 4175 - 0.6401843996644798
3 : 3168 - 0.6396021490668313
4 : 26 - 0.629940788348712
5 : 5528 - 0.6155870112510924


In [252]:
# BM25 results
print("BM25 Results:")
for query in queries:
  print("\nQuery :", query)
  results = query_handler_bm25(query)
  i = 1
  for result in results:
    print(i, ":", result, "-", results[result])
    i += 1

BM25 Results:

Query : can I use the same coffee grounds twice?
1 : 3966 - 3.982112714577462
2 : 1749 - 3.384308239743852
3 : 2683 - 3.360534880339401
4 : 5121 - 2.3823629682447676
5 : 4149 - 2.2385955631336705

Query : making a decaffeinated coffee
1 : 204 - 5.884320197351069
2 : 3293 - 5.332624717914693
3 : 2897 - 3.244946523493822
4 : 3225 - 3.1908449708781115
5 : 120 - 2.6304449823904488

Query : turkish coffee
1 : 5182 - 6.372043626061505
2 : 5094 - 5.540444119684456
3 : 483 - 4.139037461760955
4 : 209 - 3.638937818625787
5 : 2522 - 3.333600118754522

Query : espresso
1 : 3904 - 2.2994190498647242
2 : 4404 - 2.042602576406589
3 : 3168 - 1.6696453819078618
4 : 93 - 1.5540809001677207
5 : 2867 - 1.4977023874695319


In [253]:
# TF-IDF relevancies
relevancies_tfidf = [
    [0, 0, 1, 0, 0],
    [1, 0, 0, 0, 0],
    [1, 0, 1, 0, 0],
    [1, 1, 1, 1, 1]
]
print("TF-IDF Precision at 5 :", ((1 + 1 + 2 + 5)/5)/5)

#computing tf-idf ndcg
ideal_order_relevancies = [sorted(item, reverse=True) for item in relevancies_tfidf]
ndcg_list = list()

for ideal, relevance in zip(ideal_order_relevancies, relevancies_tfidf):
  ndcg_list.append(ndcg_score(np.array([ideal]), np.array([relevance])))

print("TF-IDF nDCG at 5 :", (sum(ndcg_list) / len(ndcg_list)))

# VSM relevancies
relevancies_vsm = [
    [1, 1, 1, 0, 1],
    [1, 1, 0, 1, 1],
    [1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1]
]

print("\nVSM Precision at 5 :", ((4 + 4 + 5 + 5)/5)/5)

#computing vsm ndcg
ideal_order_relevancies = [sorted(item, reverse=True) for item in relevancies_vsm]
ndcg_list = list()

for ideal, relevance in zip(ideal_order_relevancies, relevancies_vsm):
  ndcg_list.append(ndcg_score(np.array([ideal]), np.array([relevance])))

print("VSM nDCG at 5 :", (sum(ndcg_list) / len(ndcg_list)))

# BM25 relevancies
relevancies_bm25 = [
    [0, 1, 1, 0, 0],
    [1, 0, 1, 1, 0],
    [1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1]
]
print("\nBM25 Precision at 5 :", ((2 + 3 + 5 + 5)/5)/5)

#computing bm25 ndcg
ideal_order_relevancies = [sorted(item, reverse=True) for item in relevancies_bm25]
ndcg_list = list()

for ideal, relevance in zip(ideal_order_relevancies, relevancies_bm25):
  ndcg_list.append(ndcg_score(np.array([ideal]), np.array([relevance])))

print("BM25 nDCG at 5 :", (sum(ndcg_list) / len(ndcg_list)))

TF-IDF Precision at 5 : 0.36
TF-IDF nDCG at 5 : 0.81409864757368

VSM Precision at 5 : 0.72
VSM nDCG at 5 : 0.9505098091139026

BM25 Precision at 5 : 0.6
BM25 nDCG at 5 : 0.9069427616901222


Of all the three models, the vector space model displayed the highest effectiveness. It had the highest precision, and nDCG, at cut 5. TF-IDF performed the worst, and had a precision of only 0.36. Despite the low precision at cut 5, it still performed quite well. Best Match 25 performed middle of the models, but was fairly close to surpassing the nDCG of the vector space model. Overall, all three models displayed no abnormalities in time efficiency, this was not specifically measured, but at a high view, all ran well below 2 seconds. Mentionable, as well as noted by way of Google Colab's UI. 

In terms of the low precision for the TF-IDF, the model seemed to run into issues with posts that had less specific keywords and a lower text length. An example for the query "Can I use the same coffee grounds twice?", was https://coffee.stackexchange.com/questions/3028/why-does-french-press-not-give-enough-caffeine-effect/3966#3966. The answer post that the model ranked as first, had non-specific keywords such as 'twice', 'use', and 'coffee', and was of a short length. Although not pertaining to this assignment, a potential solution to improve the TF-IDF algorithm would be to weight query terms differently, putting more weight onto the terms 'twice' and 'grounds'.