Ryan Reed - COS 470 - Assignment 2 - Question 3

In [193]:
# importing
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from post_parser_record import PostParserRecord
from collections import defaultdict
import re

# Parse the Stack Exchange dump once; the resulting reader object is what
# InvertedIndex() walks (via its map_questions / map_just_answers attributes).
post_reader = PostParserRecord("Posts_Coffee.xml")
# English stop words held in a set so that the membership test performed for
# every token in text_processor() is a constant-time lookup.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [194]:
# utility function to clean input text.
# kept separate from the indexing code for readability.
def text_processor (input_text):
  """Normalize raw post text into a set of unique index terms.

  Steps, in order: lowercase the text, replace runs of punctuation with a
  single space, tokenize with ``word_tokenize``, then drop stop words.

  Args:
    input_text: the raw text (title and/or body) of one post.

  Returns:
    A ``set`` of token strings; a set is used so that each term is reported
    at most once per post.
  """
  # Replace each run of punctuation (plus any spaces that follow it) with one space.
  # The hyphen is escaped on purpose: written unescaped between the double quote
  # and the colon it forms a character *range* that also covers the digits 0-9,
  # which silently stripped every number from the text (e.g. "v60" became "v").
  input_text = re.sub(r"[\"#$%&'’()*+,\-./:;<=>?@\[\]!]+\ *", " ", input_text.lower())
  # text tokenized into a list
  list_of_words = word_tokenize(input_text)
  # Drop stop words. A bare 'p' is dropped as well -- presumably the residue of
  # HTML <p> tags once the angle brackets have been replaced by spaces above.
  list_of_words_res = [x for x in list_of_words if (x not in stop_words) and (x != 'p')]
  return set(list_of_words_res)

In [195]:
# Builds the inverted index (a plain dict used as the hash table) over every
# question and answer exposed by the reader object.
def InvertedIndex(reader_object):
  """Map each processed term to the sorted list of post ids containing it.

  Args:
    reader_object: object exposing ``map_questions`` and ``map_just_answers``,
      each a mapping whose values carry ``post_id`` and ``body`` (questions
      also carry ``title``).

  Returns:
    dict of ``term -> sorted list of post ids``, covering questions and
    answers together.
  """
  postings = {}

  # Questions are indexed on title and body together.
  question_map = reader_object.map_questions
  for key in question_map:
    question = question_map[key]
    # text_processor already returns a set, so each term is seen once per post
    for term in text_processor(question.title + " " + question.body):
      postings.setdefault(term, set()).add(question.post_id)

  # Answers have no title; only the body is indexed.
  answer_map = reader_object.map_just_answers
  for key in answer_map:
    answer = answer_map[key]
    for term in text_processor(answer.body):
      postings.setdefault(term, set()).add(answer.post_id)

  # Question and answer ids were mixed into the same sets, so every posting
  # list is sorted once at the end.
  return {term: sorted(post_ids) for term, post_ids in postings.items()}

In [196]:
# Build the index once over the parsed posts; query() and the TSV export cell
# both read this module-level dictionary.
inverted_index = InvertedIndex(post_reader)

In [2]:
# this is for error checking, ensuring that the correct postings are found
# without any issues.

# print(inverted_index['espresso'])

In [198]:
# this function handles both querying and computing the similarity score.
# operators are not required by the assignment, so a two-term query is
# always an explicit AND.
def query(input_terms):
  """Run a one- or two-term query against the module-level ``inverted_index``.

  Args:
    input_terms: list of one or two already-normalized term strings.

  Returns:
    A one-row ``pandas.DataFrame`` with the columns ``Query``, ``PostIDs``
    (at most the first ten matching post ids, or ``'N/A'`` when a term is not
    in the index) and ``Similarity Score`` (``'N/A'`` for single-term
    queries, ``0.0`` when a term of a two-term query is missing).
    For any other number of terms an error is printed and ``0`` is returned.
  """
  num_terms = len(input_terms)

  # single-term query
  if num_terms == 1:
    term = input_terms[0]
    # A term missing from the index yields an 'N/A' row rather than falling
    # through to the "incorrect number of terms" error below.
    post_ids = inverted_index[term][:10] if term in inverted_index else 'N/A'
    return pd.DataFrame({'Query': [term], 'PostIDs': [post_ids], 'Similarity Score': ['N/A']})

  # two-term query, explicit AND for this assignment
  if num_terms == 2:
    term1, term2 = input_terms
    query_string = term1 + " AND " + term2

    # If EITHER term is missing from the collection (e.g. persian AND coffee),
    # there is nothing to intersect; checked up front so that the order of the
    # two terms does not matter.
    if (term1 not in inverted_index) or (term2 not in inverted_index):
      return pd.DataFrame({'Query': [query_string], 'PostIDs': ['N/A'], 'Similarity Score': [0.0]})

    postings1 = inverted_index[term1]
    postings2 = inverted_index[term2]
    # set intersection instead of testing membership in a list inside a loop
    shared_ids = sorted(set(postings1) & set(postings2))

    # Similarity score: document count is taken to be the number of documents
    # each term appears in independently, so the score is
    # 1 - (larger count - smaller count) / larger count.
    smaller, larger = sorted((len(postings1), len(postings2)))
    similarity_score = (1 - (larger - smaller) / larger) if larger else 0.0

    return pd.DataFrame({'Query': [query_string], 'PostIDs': [shared_ids[:10]],
                         'Similarity Score': [similarity_score]})

  # number of arguments must be 1 or 2, anything else is reported as an error.
  print("Error: Incorrect number of terms for Query.")
  return 0

In [210]:
# queries
# Each query is run separately and returns its own one-row dataframe; the rows
# are then concatenated into a single results table. Every row keeps its
# original index label, which is why each row of the output below is labelled 0.

q1 = query(["espresso"])
q2 = query(["turkish", "coffee"])
q3 = query(["persian", "coffee"])
queries_result = pd.concat([q1, q2, q3])
queries_result

Unnamed: 0,Query,PostIDs,Similarity Score
0,espresso,"[2, 5, 7, 9, 10, 17, 22, 26, 27, 30]",
0,turkish AND coffee,"[42, 45, 81, 106, 165, 209, 216, 349, 365, 419]",0.036829
0,persian AND coffee,,0.0


In [209]:
# writing indexes to tsv
import csv

# newline='' is what the csv module expects of the file object; without it
# every row is followed by a blank line on platforms that translate '\n'.
# The encoding is pinned so terms with non-ASCII characters are written the
# same way regardless of the platform default.
with open('indexes.tsv', 'w', newline='', encoding='utf-8') as csvfile:
  writer = csv.writer(csvfile, delimiter='\t')

  # shallow copy of the index, kept under its original name
  indexes_dict = dict(inverted_index)
  # one row per term: the term, then its posting list as a single cell
  writer.writerows(indexes_dict.items())

In [1]:
# Kendall's tau rank correlation between the two evaluation measures.
# Both lists were compiled by hand and hold 11 values each; the values are
# paired by position (presumably one pair per evaluated system/run -- confirm
# against the source of the numbers).
import scipy.stats as stats
MRR_list = [0.6103, 0.4339 , 0.1003 ,0.0854, 0.8640, 0.2811, 0.4829, 0.0237, 0.4196, 0.3857, 0.2432]
P_10_list = [0.3566, 0.1818, 0.0584 ,0.0519, 0.5479, 0.1883, 0.2974, 0.0039, 0.2403 , 0.2390 , 0.1390]

# kendalltau returns the correlation coefficient and its p-value; only the
# coefficient is reported here.
tau, pvalue = stats.kendalltau(MRR_list, P_10_list)
print(tau)



0.8909090909090909
