In [10]:
from post_parser_record import PostParserRecord
post_reader = PostParserRecord("Posts_Coffee.xml")

In [11]:
# imports
import math
from itertools import islice
import csv, os
import nltk
from nltk.corpus import stopwords
from nltk.corpus.reader.tagged import word_tokenize
import re, string
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def createDocIDF():
  total_docs = len(post_reader.map_questions) + len(post_reader.map_just_answers)
  for word in term_total_docs:
    doc_idf[word] = math.log(total_docs / float(term_total_docs[word]))

In [13]:
def createDocTF():
  for answer_id in doc_term_count:
    words = doc_term_count[answer_id]
    doc_dic = {}
    for word in words:
      doc_dic[word] = words[word] / doc_total_terms[answer_id]
    doc_tf[answer_id] = doc_dic.copy()

In [14]:
doc_term_count = {}     # {doc: {term:count, term:count}, doc:{term:count,...},...}
doc_total_terms = {}    # {doc: #_of_terms, doc: #_of_terms, ...}
term_total_docs = {}    # {term:count, term:count, ...}.

def createIndex():
  # For each question
  for answer_id in post_reader.map_questions:
    # Get the text
    text = (post_reader.map_questions[answer_id].title + " " + post_reader.map_questions[answer_id].body)

    # Remove punctuations, make lowercase
    token_words = re.sub("<.*?>|\\n|&quot;", " ", text.lower())
    token_words = word_tokenize(token_words.translate(str.maketrans('', '', string.punctuation)))
    token_words = [word for word in token_words if word not in stop_words]

    # Save a dictionary with {term:count, term:count, term:count}
    doc_dic = {}
    count = 0;
    for word in token_words:
      if word not in stop_words:
        count += 1;
        if word in doc_dic:
          doc_dic[word] += 1
        else:
          doc_dic[word] = 1;

        if word not in term_total_docs:
          term_total_docs[word] = 1
        else:
          term_total_docs[word] += 1;
    # Save dictionary to doc_term_count as {docid: {term:count, term:count}, ...}
    doc_total_terms[answer_id] = count;
    doc_term_count[answer_id] = doc_dic.copy()

  # For each question
  for answer_id in post_reader.map_just_answers:
    # Get the text
    text = (post_reader.map_just_answers[answer_id].body)

    # Remove punctuations, make lowercase
    token_words = re.sub("<.*?>|\\n|&quot;", " ", text.lower())
    token_words = word_tokenize(token_words.translate(str.maketrans('', '', string.punctuation)))
    token_words = [word for word in token_words if word not in stop_words]

    # Save a dictionary with {term:count, term:count, term:count}
    doc_dic = {}
    count = 0;
    for word in token_words:
      if word not in stop_words:
        count += 1;
        if word in doc_dic:
          doc_dic[word] += 1
        else:
          doc_dic[word] = 1;

        if word not in term_total_docs:
          term_total_docs[word] = 1
        else:
          term_total_docs[word] += 1;
    doc_total_terms[answer_id] = count;
    # Save dictionary to doc_term_count as {docid: {term:count, term:count}, ...}
    doc_term_count[answer_id] = doc_dic.copy()

  createDocTF()
  createDocIDF()

In [15]:
# Create the TF Dictionary
doc_tf = {}

# Create the IDF Dictionary
doc_idf = {}

# Index through posts
createIndex()

In [16]:
def getTFIDFScore(query):
  query_parse = query.split(" ")
  word_scores = {}     # {word: {docid: tf-idf, docid: tf-idf}, word: {docid: tf-idf}}
  for word in query_parse:
    if word in doc_idf:
      idf = doc_idf[word] # Get our current term IDF
      # For every document:
      for doc_id in doc_tf:
        # Check if the term exists in the document:
        if word in doc_tf[doc_id]:
          if word in word_scores:
            curr_score = word_scores[word]
            curr_score[doc_id] = idf * doc_tf[doc_id][word]
          else:
            word_scores[word] = {doc_id: (idf * doc_tf[doc_id][word])}
        else:
          if word in word_scores:
            curr_score = word_scores[word]
            curr_score[doc_id] = 0
          else:
            word_scores[word] = {doc_id: 0}
  # at this point, we need to sum all like doc_ids to get final scores
  doc_scores = {} # {docid: score, docid: score, ...}
  for word_docs in word_scores: 
    word_tfidfs = word_scores[word_docs] #word_tfidfs = {docid: tf-idf, docid: tf-idf...}
    for doc_id in word_tfidfs:
      score = word_tfidfs[doc_id]
      if doc_id in doc_scores:
        doc_scores[doc_id] = doc_scores[doc_id] + score
      else:
        doc_scores[doc_id] = score

  return doc_scores

In [17]:
def printScores(query, scores):
  print(query + ": ")
  for doc in scores:
    print('%d \t\t %.2f' % (doc, scores[doc]))
  print()

In [28]:
query_results = []
# Now that we have TF and IDF, we need our query:
query = "espresso"
scores = getTFIDFScore(query)
scores = (dict(islice(sorted(scores.items(), key=lambda item: item[1], reverse=True), 5)))
for doc in scores:
  scores[doc] = round(scores[doc], 2)
printScores(query, scores)
query_results.append(scores)

query = "turkish coffee"
scores = getTFIDFScore(query)
scores = (dict(islice(sorted(scores.items(), key=lambda item: item[1], reverse=True), 5)))
for doc in scores:
  scores[doc] = round(scores[doc], 2)
printScores(query, scores)
query_results.append(scores)

query = "making a decaffeinated coffee"
scores = getTFIDFScore(query)
scores = (dict(islice(sorted(scores.items(), key=lambda item: item[1], reverse=True), 5)))
for doc in scores:
  scores[doc] = round(scores[doc], 2)
printScores(query, scores)
query_results.append(scores)

query = "can i use the same coffee grounds twice"
scores = getTFIDFScore(query)
scores = (dict(islice(sorted(scores.items(), key=lambda item: item[1], reverse=True), 5)))
for doc in scores:
  scores[doc] = round(scores[doc], 2)
printScores(query, scores)
query_results.append(scores)

print(query_results)

espresso: 
4404 		 0.14
2867 		 0.13
3168 		 0.12
3904 		 0.12
3800 		 0.11

turkish coffee: 
5182 		 0.38
483 		 0.29
5094 		 0.28
209 		 0.27
4486 		 0.25

making a decaffeinated coffee: 
204 		 0.55
120 		 0.52
2897 		 0.31
3225 		 0.27
3293 		 0.22

can i use the same coffee grounds twice: 
2683 		 0.35
3966 		 0.23
5582 		 0.19
3818 		 0.18
2585 		 0.17

[{4404: 0.14, 2867: 0.13, 3168: 0.12, 3904: 0.12, 3800: 0.11}, {5182: 0.38, 483: 0.29, 5094: 0.28, 209: 0.27, 4486: 0.25}, {204: 0.55, 120: 0.52, 2897: 0.31, 3225: 0.27, 3293: 0.22}, {2683: 0.35, 3966: 0.23, 5582: 0.19, 3818: 0.18, 2585: 0.17}]


In [40]:
# Relevance_docs = {{doc: 0, doc: 1, doc: 0, ...}, {doc: 0, doc: 1, ...}}
# Results = {{doc: score, doc: score, doc: score}, {doc: score, doc:score, ...}}
def createQrel(relevance_docs, results, filename):
  qrel_filename = os.path.join("/content", (filename + ".txt"))
  qrelresults_filename = os.path.join("/content", (filename + "Results.txt"))
  question = "Q00"
  count = 1;

  with open(qrel_filename, 'w') as f:
    for relevance in relevance_docs:
      question_str = question + str(count)
      count += 1
      for doc in relevance:
        line = question_str + " 0 " + str(doc) + " " + str(relevance[doc])
        f.write(line + "\n")
  
  count = 1
  with open(qrelresults_filename, 'w') as f:
    for query_results in results:
      question_str = question + str(count)
      count += 1
      for doc in query_results:
        line = question_str + " Q0 " + str(doc) + " 1 " + str(query_results[doc]) + " TF-IDF"
        f.write(line + "\n")

In [41]:
# Relevance 
doc_relevance = [{4404: 0, 2867: 0, 3168: 0, 3904: 0, 3800: 0}, 
                 {5182: 1, 483: 0, 5094: 0, 209: 1, 4486: 0}, 
                 {204: 0, 120: 0, 2897: 0, 3225: 1, 3293: 0}, 
                 {2683: 1, 3966: 0, 5582: 1, 3818: 0, 2585: 0}]

createQrel(doc_relevance, query_results, "CoffeePostsTFIDF")

In [37]:
##### UNCOMMENT TO VIEW VALUES:
# Keeps track of {docid: {term:count, term:count,...}, docid: {term:count, term:count}, ...}
#doc_term_count

# Keeps track of {docid: total_term_count, docid: total_term_count, ...}
#doc_total_terms

# Keeps track of {term: #_of_docs, term: #_of_docs, ...}
#term_total_docs

# Keeps track of {docid: {term:TERM_FREQ, term:TERM_FREQ}, docid: {term:TERM_FREQ}, ...}
#doc_tf

# Keeps track of {term: idf_score, term: idf_score}
#doc_idf