In [14]:
# Parse the posts file once; the functions below read questions from
# post_reader.map_questions and answers from post_reader.map_just_answers.
from post_parser_record import PostParserRecord
post_reader = PostParserRecord("Posts_Coffee.xml")

In [15]:
# imports
import math
import os
import re, string
from itertools import islice

import nltk
from nltk.corpus import stopwords
# word_tokenize lives in nltk.tokenize; nltk.corpus.reader.tagged only exposes
# it through an incidental star re-export.
from nltk.tokenize import word_tokenize
from scipy import spatial

# Tokenizer model and stop-word list needed by the preprocessing functions.
nltk.download('punkt')
nltk.download('stopwords')
# Stored as a set: it is only used for `in` checks, once per token, and a set
# answers those in constant time instead of scanning the whole list.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
def getPostTerms():
  """Collect every non-stop-word token from all questions and answers.

  A question contributes its title and body (joined by a space); an answer
  contributes its body only. Text is lowercased, HTML tags / newlines /
  &quot; entities are blanked out, punctuation is removed, and the result is
  tokenized with word_tokenize.

  Returns:
    list of str: all tokens, duplicates kept, questions first, then answers.
  """
  def tokenize(raw_text):
    # Replace tags, newlines and &quot; with spaces so neighbours don't fuse.
    cleaned = re.sub("<.*?>|\\n|&quot;", " ", raw_text.lower())
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))
    return [token for token in word_tokenize(cleaned) if token not in stop_words]

  collected = []

  # Questions: title + body
  questions = post_reader.map_questions
  for question_id in questions:
    post = questions[question_id]
    collected.extend(tokenize(post.title + " " + post.body))

  # Answers: body only
  answers = post_reader.map_just_answers
  for answer_id in answers:
    collected.extend(tokenize(answers[answer_id].body))

  return collected

In [17]:
def generateTermIndices(terms):
  """Map each distinct term to its position in a document vector.

  Args:
    terms: iterable of hashable terms. Duplicates are allowed and ignored
      after their first occurrence.

  Returns:
    dict: {term: index}, with indices 0 .. n-1 for n distinct terms, assigned
    in order of first appearance.
  """
  term_to_index = {}
  for term in terms:
    # Only unseen terms consume an index, so every index stays inside
    # range(len(result)) even when `terms` contains repeats.
    if term not in term_to_index:
      term_to_index[term] = len(term_to_index)
  return term_to_index

In [18]:
def generateDocVectors(indices, size):
  """Build a term-count vector for every question and every answer.

  Args:
    indices: {term: position} mapping passed through to generateSingleVector.
    size: length of each vector.

  Returns:
    dict: {doc_id: vector}. Question vectors are built from title + " " + body,
    answer vectors from the body alone.
  """
  questions = post_reader.map_questions
  vectors_by_id = {
      question_id: list(generateSingleVector(
          indices, size,
          questions[question_id].title + " " + questions[question_id].body))
      for question_id in questions
  }

  answers = post_reader.map_just_answers
  vectors_by_id.update(
      (answer_id, list(generateSingleVector(indices, size, answers[answer_id].body)))
      for answer_id in answers
  )

  return vectors_by_id

In [19]:
def generateSingleVector(indices, size, text):
  """Turn a piece of text into a raw term-count vector.

  Args:
    indices: {term: position} mapping; tokens missing from it are ignored.
    size: length of the returned vector.
    text: raw text, which may contain HTML markup.

  Returns:
    list of int: entry indices[t] holds how often token t occurs in `text`
    after lowercasing, markup/punctuation removal and stop-word filtering.
  """
  counts = [0] * size

  # Replace tags, newlines and &quot; with spaces, then drop punctuation.
  stripped = re.sub("<.*?>|\\n|&quot;", " ", text.lower())
  without_punctuation = stripped.translate(str.maketrans('', '', string.punctuation))

  for token in word_tokenize(without_punctuation):
    if token in stop_words:
      continue
    if token in indices:
      counts[indices[token]] += 1

  return counts

In [20]:
def getVSMScores(query, vectors, indices, size):
  """Score every document against a query by cosine similarity.

  Args:
    query: query text; vectorized with generateSingleVector.
    vectors: {doc_id: vector} mapping of document vectors.
    indices: {term: position} mapping used to vectorize the query.
    size: vector length.

  Returns:
    dict: {doc_id: similarity}. A document (or query) whose vector is all
    zeros gets similarity 0.0 rather than NaN.
  """
  # Create vector for query
  query_vector = generateSingleVector(indices, size, query)

  # Cosine similarity is undefined for an all-zero vector: scipy divides by
  # the zero norm and the resulting NaN breaks any later sort by score.
  # Treat "no vocabulary terms at all" as "no similarity".
  query_is_empty = not any(query_vector)

  doc_scores = {}
  for doc_id, doc_vector in vectors.items():
    if query_is_empty or not any(doc_vector):
      doc_scores[doc_id] = 0.0
    else:
      # scipy returns cosine *distance*; similarity is its complement.
      doc_scores[doc_id] = 1 - spatial.distance.cosine(query_vector, doc_vector)
  return doc_scores

In [21]:
def printScores(query, scores):
  """Print the query, then one line per document, then a blank line.

  Args:
    query: query text shown as the heading (followed by ": ").
    scores: {doc_id: score} mapping; doc ids are formatted as integers and
      scores with two decimals, in the mapping's iteration order.
  """
  print(query + ": ")
  for doc_id, score in scores.items():
    print('%d \t\t %.2f' % (doc_id, score))
  print()

In [22]:
# Every token found in the questions and answers, duplicates included:
all_terms = getPostTerms()

# Vocabulary: the distinct terms (its size is the vector length used below):
unique_terms = set(all_terms)

# Create dict of {term: position in a document vector}
term_index = generateTermIndices(unique_terms)

# Create one term-count vector per document (question or answer), keyed by document id
doc_vectors = generateDocVectors(term_index, len(unique_terms))

In [None]:
# Number of top-scoring documents kept per query.
TOP_K = 5

# Queries, in the order their results are stored in query_results.
queries = [
    "espresso",
    "turkish coffee",
    "making a decaffeinated coffee",
    "can i use the same coffee grounds twice",
]

# One {doc_id: rounded score} dict per query, in query order. This list is
# what createQrel receives as `results`, so every query must append to it.
query_results = []
for query in queries:
  scores = getVSMScores(query, doc_vectors, term_index, len(unique_terms))
  # Keep the TOP_K best matches, highest similarity first.
  ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
  scores = {doc: round(score, 2) for doc, score in islice(ranked, TOP_K)}
  printScores(query, scores)
  query_results.append(scores)

In [None]:
# relevance_docs = [{doc: 0, doc: 1, doc: 0, ...}, {doc: 0, doc: 1, ...}]
# results = [{doc: score, doc: score, doc: score}, {doc: score, doc: score, ...}]
def createQrel(relevance_docs, results, filename, output_dir="/content"):
  """Write relevance judgments and ranked results as two text files.

  Creates <output_dir>/<filename>.txt with one line per judged document
  ("<qid> 0 <doc> <relevance>") and <output_dir>/<filename>Results.txt with
  one line per retrieved document ("<qid> Q0 <doc> <rank> <score> TF-IDF").
  Query ids are Q001, Q002, ... following the order of the input lists.

  Args:
    relevance_docs: list with one {doc_id: relevance} dict per query.
    results: list with one {doc_id: score} dict per query.
    filename: base name (without extension) for both output files.
    output_dir: directory to write into; defaults to "/content".
  """
  qrel_filename = os.path.join(output_dir, filename + ".txt")
  qrelresults_filename = os.path.join(output_dir, filename + "Results.txt")

  with open(qrel_filename, 'w') as f:
    for number, relevance in enumerate(relevance_docs, start=1):
      # Zero-padded to three digits so ids keep a uniform width past query 9.
      question_str = "Q%03d" % number
      for doc in relevance:
        f.write("%s 0 %s %s\n" % (question_str, doc, relevance[doc]))

  with open(qrelresults_filename, 'w') as f:
    for number, query_results in enumerate(results, start=1):
      question_str = "Q%03d" % number
      # Order by score (stable for ties) so the rank column matches the score
      # column; the rank is the 1-based position, not a constant.
      ranked = sorted(query_results.items(), key=lambda item: item[1], reverse=True)
      for rank, (doc, score) in enumerate(ranked, start=1):
        # The trailing run tag is kept unchanged for existing consumers.
        f.write("%s Q0 %s %d %s TF-IDF\n" % (question_str, doc, rank, score))

In [None]:
# Hand-assigned relevance judgments: one {doc_id: judgment} dict per query,
# listed in the same order as the queries were run (createQrel numbers both
# lists by position). Presumably 1 marks a relevant document and 0 a
# non-relevant one -- confirm against the evaluation tool's expectations.
doc_relevance = [{2766: 0, 1574: 1, 2095: 0, 26: 1, 5528: 0}, 
                 {5094: 1, 3074: 0, 2379: 1, 1833: 1, 5095: 1}, 
                 {120: 0, 3746: 0, 2555: 1, 373: 0, 3293: 0}, 
                 {2683: 1, 1749: 1, 3258: 1, 5121: 0, 3663: 0}]

# Write the judgments and the per-query results (query_results) to
# CoffeePostsVSM.txt and CoffeePostsVSMResults.txt.
createQrel(doc_relevance, query_results, "CoffeePostsVSM")