In [15]:
from post_parser_record import PostParserRecord
post_reader = PostParserRecord("Posts_Coffee.xml")

In [16]:
# Imports
import math
from itertools import islice
import nltk
import os
from nltk.corpus import stopwords
from nltk.corpus.reader.tagged import word_tokenize
import re, string
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
def createDocIDF():
  total_docs = len(post_reader.map_questions) + len(post_reader.map_just_answers)
  for word in term_total_docs:
    doc_idf[word] = math.log(total_docs / float(term_total_docs[word]))

In [18]:
def createDocTF():
  for answer_id in doc_term_count:
    words = doc_term_count[answer_id]
    doc_dic = {}
    for word in words:
      doc_dic[word] = words[word] / doc_total_terms[answer_id]
    doc_tf[answer_id] = doc_dic.copy()

In [19]:
term_total_docs = {}    # {term:count, term:count, ...}.

def createIndex():
  # For each question
  for answer_id in post_reader.map_questions:
    # Get the text
    text = (post_reader.map_questions[answer_id].title + " " + post_reader.map_questions[answer_id].body)

    # Remove punctuations, make lowercase
    token_words = re.sub("<.*?>|\\n|&quot;", " ", text.lower())
    token_words = word_tokenize(token_words.translate(str.maketrans('', '', string.punctuation)))
    token_words = [word for word in token_words if word not in stop_words]

    # Save a dictionary with {term:count, term:count, term:count}
    doc_dic = {}
    count = 0;
    for word in token_words:
      count += 1;
      if word in doc_dic:
        doc_dic[word] += 1
      else:
        doc_dic[word] = 1;
      if word not in term_total_docs:
        term_total_docs[word] = 1
      else:
        term_total_docs[word] += 1;
    # Save dictionary to doc_term_count as {docid: {term:count, term:count}, ...}
    doc_total_terms[answer_id] = count;
    doc_term_count[answer_id] = doc_dic.copy()

  # For each question
  for answer_id in post_reader.map_just_answers:
    # Get the text
    text = (post_reader.map_just_answers[answer_id].body)

    # Remove punctuations, make lowercase
    token_words = re.sub("<.*?>|\\n|&quot;", " ", text.lower())
    token_words = word_tokenize(token_words.translate(str.maketrans('', '', string.punctuation)))
    token_words = [word for word in token_words if word not in stop_words]

    # Save a dictionary with {term:count, term:count, term:count}
    doc_dic = {}
    count = 0;
    for word in token_words:
      count += 1;
      if word in doc_dic:
        doc_dic[word] += 1
      else:
        doc_dic[word] = 1;

      if word not in term_total_docs:
        term_total_docs[word] = 1
      else:
        term_total_docs[word] += 1;
    doc_total_terms[answer_id] = count;
    # Save dictionary to doc_term_count as {docid: {term:count, term:count}, ...}
    doc_term_count[answer_id] = doc_dic.copy()

  createDocTF()
  createDocIDF()

In [20]:
def getAverageDocLength(documents):
  size = len(documents)
  count = 0
  for doc in documents:
    count += documents[doc]
  return count / size

In [21]:
def getBM25Score(query):
  scores = {}
  # Equation: IDF * (A / B)
  # A: (1.2 + 1) * tf
  # B: 1.2 * ((1 - 0.75) + 0.75 * (doc length / avg doc length)) + tf
  k1 = 1.2
  b = 0.75
  query_parse = query.split(" ")
  for word in query_parse:
    for doc_id in doc_tf:
      try:
        tf = doc_tf[doc_id][word]
      except:
        tf = 0
      try:
        idf = doc_idf[word]
      except:
        idf = 0
      doc_length = doc_total_terms[doc_id]
      A = (k1 + 1) * tf
      B = k1 * ((1 - b) + b * (doc_length/avg_doc_length)) + tf
      if doc_id in scores:
        scores[doc_id] = scores[doc_id] + (idf * (A / B))
      else:
        scores[doc_id] = (idf * (A/B))
  
  return scores

In [22]:
def printScores(query, scores):
  print(query + ": ")
  for doc in scores:
    print('%d \t\t %.2f' % (doc, scores[doc]))
  print()

In [23]:
# Create the TF Dictionary
doc_tf = {}             # {docid: {term:TERM_FREQ, term:TERM_FREQ}, docid: {term:TERM_FREQ}, ...}

# Create the IDF Dictionary
doc_idf = {}            # {term: idf_score, term: idf_score}

# Keeps track of term counts for each document
doc_term_count = {}     # {doc: {term:count, term:count}, doc:{term:count,...},...}

# Keeps track of total number of terms for each document
doc_total_terms = {}    # {doc: #_of_terms, doc: #_of_terms, ...}

# Index through posts
createIndex()

# Get average document length for BM25 calculations
avg_doc_length = getAverageDocLength(doc_total_terms)

In [24]:
query_results = []
# Now that we have TF and IDF, we need our query:
query = "espresso"
scores = getBM25Score(query)
scores = (dict(islice(sorted(scores.items(), key=lambda item: item[1], reverse=True), 5)))
for doc in scores:
  scores[doc] = round(scores[doc], 2)
printScores(query, scores)
query_results.append(scores)

query = "turkish coffee"
scores = getBM25Score(query)
scores = (dict(islice(sorted(scores.items(), key=lambda item: item[1], reverse=True), 5)))
for doc in scores:
  scores[doc] = round(scores[doc], 2)
printScores(query, scores)
query_results.append(scores)

query = "making a decaffeinated coffee"
scores = getBM25Score(query)
scores = (dict(islice(sorted(scores.items(), key=lambda item: item[1], reverse=True), 5)))
for doc in scores:
  scores[doc] = round(scores[doc], 2)
printScores(query, scores)
query_results.append(scores)

query = "can i use the same coffee grounds twice"
scores = getBM25Score(query)
scores = (dict(islice(sorted(scores.items(), key=lambda item: item[1], reverse=True), 5)))
for doc in scores:
  scores[doc] = round(scores[doc], 2)
printScores(query, scores)
query_results.append(scores)

espresso: 
4404 		 0.51
3904 		 0.49
2867 		 0.43
3168 		 0.38
4619 		 0.34

turkish coffee: 
5182 		 1.50
5094 		 1.13
483 		 1.05
4486 		 1.03
209 		 0.98

making a decaffeinated coffee: 
204 		 1.96
120 		 1.65
3293 		 1.08
3225 		 1.07
2897 		 1.05

can i use the same coffee grounds twice: 
2683 		 0.97
3966 		 0.87
3818 		 0.69
5582 		 0.66
4703 		 0.60



In [25]:
##### UNCOMMENT TO VIEW VALUES:
# Keeps track of {docid: {term:count, term:count,...}, docid: {term:count, term:count}, ...}
#doc_term_count

# Keeps track of {docid: total_term_count, docid: total_term_count, ...}
#doc_total_terms

# Keeps track of {term: #_of_docs, term: #_of_docs, ...}
#term_total_docs

# Keeps track of {docid: {term:TERM_FREQ, term:TERM_FREQ}, docid: {term:TERM_FREQ}, ...}
#doc_tf

# Keeps track of {term: idf_score, term: idf_score}
#doc_idf

In [26]:
# Relevance_docs = {{doc: 0, doc: 1, doc: 0, ...}, {doc: 0, doc: 1, ...}}
# Results = {{doc: score, doc: score, doc: score}, {doc: score, doc:score, ...}}
def createQrel(relevance_docs, results, filename):
  qrel_filename = os.path.join("/content", (filename + ".txt"))
  qrelresults_filename = os.path.join("/content", (filename + "Results.txt"))
  question = "Q00"
  count = 1;

  with open(qrel_filename, 'w') as f:
    for relevance in relevance_docs:
      question_str = question + str(count)
      count += 1
      for doc in relevance:
        line = question_str + " 0 " + str(doc) + " " + str(relevance[doc])
        f.write(line + "\n")
  
  count = 1
  with open(qrelresults_filename, 'w') as f:
    for query_results in results:
      question_str = question + str(count)
      count += 1
      for doc in query_results:
        line = question_str + " Q0 " + str(doc) + " 1 " + str(query_results[doc]) + " BM25"
        f.write(line + "\n")

In [27]:
# Relevance 
doc_relevance = [{4404: 0, 3904: 0, 2867: 0, 3168: 0, 4619: 0}, 
                 {5182: 1, 5094: 0, 483: 0, 4486: 0, 209: 1}, 
                 {204: 0, 120: 0, 3293: 0, 3225: 1, 2897: 0}, 
                 {2683: 1, 3966: 0, 3818: 0, 5582: 1, 4703: 0}]

createQrel(doc_relevance, query_results, "CoffeePostsBM25")