## Exercise 1

In this exercise we will understand the functioning of TF/IDF ranking. 

Implement the vector space retrieval model, based on the code framework provided below.

For testing we have provided a simple document collection with 5 documents in file bread.txt:

  DocID | Document Text
  ------|------------------
  1     | How to Bake Breads Without Baking Recipes
  2     | Smith Pies: Best Pies in London
  3     | Numerical Recipes: The Art of Scientific Computing
  4     | Breads, Pastries, Pies, and Cakes: Quantity Baking Recipes
  5     | Pastry: A Book of Best French Pastry Recipes

Now, for the query $Q = ``baking''$, find the top ranked documents according to the TF/IDF rank.

For further testing, use the collection __epfldocs.txt__, which contains recent tweets mentioning EPFL.

Compare the results also to the results obtained from the reference implementation using the scikit-learn library.

In [4]:
# Loading of libraries and documents

from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string
from nltk.corpus import stopwords
import math
from collections import Counter
nltk.download('stopwords')
nltk.download('punkt')

# Tokenize, stem a document
stemmer = PorterStemmer()
def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    return " ".join([stemmer.stem(word.lower()) for word in tokens])

# Read a list of documents from a file. Each line in a file is a document
with open("epfldocs.txt") as f:
    content = f.readlines()
original_documents = [x.strip() for x in content] 
documents = [tokenize(d).split() for d in original_documents]

[nltk_data] Downloading package stopwords to /home/yawen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/yawen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# TF/IDF code

# create the vocabulary
vocabulary = set([item for sublist in documents for item in sublist])
vocabulary = [word for word in vocabulary if word not in stopwords.words('english')]
vocabulary.sort()

print(vocabulary)

['0', '0203', '0206', '032017', '06', '1', '10', '100', '1000', '100kg', '1011', '1012', '1030', '10h', '10pm', '10th', '11', '1112', '11am', '12', '120', '1200', '12000', '1200cet', '1215h', '13', '1315', '1315th', '14h', '15', '150', '15billion', '15h', '15h15', '15th', '16', '16h', '17', '170', '1717', '17h', '18', '18h00', '18h30', '18th', '19052017', '1946', '1980', '1989', '1er', '1st', '1ère', '2', '20', '2015', '2016', '20162017', '2017', '2018', '2018hpeworkshop', '2020', '20242028', '2030agenda', '21', '2101', '21st', '227', '22march', '23rd', '23rdthing', '24', '247', '24heuresch', '25042017', '2526', '25kmh', '25novemb', '26', '26th', '27', '28082017', '299', '2eme', '2nd', '3', '30', '3001', '30th', '31', '31052017', '32', '35', '359', '3d', '3dprint', '3dprintabl', '3dprinter', '3e', '3ème', '4', '40', '400k', '42', '42born2cod', '45', '45th', '4k', '4person', '4pm', '4th', '5', '50', '500', '50000', '5060', '51', '55', '5d', '5g', '5gppp', '5pm', '5ten', '5th', '5ème', '

In [18]:
# compute term occurence in documents
def term_occurence(term, documents):
    occ = 0
    for doc in documents:
        if term in doc:
            occ = occ + 1
    return occ

# compute IDF, storing idf values in a dictionary
def idf_values(vocabulary, documents):
    idf = {}
    num_documents = len(documents)
    for i, term in enumerate(vocabulary):
        # nb documents having the term
#         ni = term_occurence(term, documents)
        ni = sum(term in document for document in documents)
        if ni != 0:
            idf[term] = max(0, math.log(num_documents/ni, math.e))
        else:
            idf[term] = 0

    return idf

# Function to generate the vector for a document (with normalisation)
def vectorize(document, vocabulary, idf):
    vector = [0]*len(vocabulary)
    # term-frequency in document
    counts = Counter(document)
    # most common elements for a list
    max_count = counts.most_common(1)[0][1]
    for i,term in enumerate(vocabulary):
        # tf = freq / max-freq
        tf = counts[term] / max_count
        # = tf * idf
        vector[i] = tf * idf[term]
    return vector

# Compute IDF values and vectors
idf = idf_values(vocabulary, documents)
# print("IDFs are: \n",  idf)
# print("Document vectors are:")
document_vectors = [vectorize(s, vocabulary, idf) for s in documents]
# print(document_vectors)

# Function to compute cosine similarity
def cosine_similarity(v1,v2):
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    if sumxy == 0:
        result = 0
    elif sumxx == 0 or sumyy == 0:
        result = 0
    else:
            result = sumxy / math.sqrt(sumxx*sumyy)
    return result

import numpy as np

def cosine_sim(v1, v2):
    dot = np.dot(v1, v2)
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)
    return dot/(norm_v1 * norm_v2)
    

# computing the search result (get the topk documents)
def search_vec(query, topk=10):
    q = query.split()
    q = [stemmer.stem(w) for w in q]
    query_vector = vectorize(q, vocabulary, idf)
    scores = [[cosine_similarity(query_vector, document_vectors[d]), d] for d in range(len(documents))]
    scores.sort(key=lambda x: -x[0])
    doc_ids = []
    for i in range(topk):
#         print(original_documents[scores[i][1]])
        doc_ids.append(scores[i][1]) 
    return doc_ids
# HINTS
# natural logarithm function
#     math.log(n,math.e)
# Function to count term frequencies in a document
#     Counter(document)
# most common elements for a list
#     counts.most_common(1)

In [19]:
search_vec('computer science', 5)

[4, 838, 795, 30, 89]

In [106]:
# Reference code using scikit-learn
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 1, stop_words = 'english')
features = tf.fit_transform(original_documents)
npm_tfidf = features.todense()
new_features = tf.transform(['computer science'])

cosine_similarities = linear_kernel(new_features, features).flatten()
related_docs_indices = cosine_similarities.argsort()[::-1]
topk = 5
for i in range(topk):
    print(related_docs_indices[i])
    print(original_documents[related_docs_indices[i]])

4
Exciting News: "World University Rankings 2016-2017 by subject: computer science" No1 @ETH &amp; @EPFL on No8. Congrats https://t.co/ARSlXZoShQ
838
New computer model shows how proteins are controlled "at a distance" https://t.co/zNjK3bZ6mO  via @EPFL_en #VDtech https://t.co/b9TglXO4KD
795
An interview with Patrick Barth, a new @EPFL professor who combines protein #biophysics with computer modeling https://t.co/iJwBaEbocj
420
Exposure Science Film Hackathon 2017 applications open! Come join our Scicomm-film-hacking event! #Science #scicomm https://t.co/zwtKPlh6HT
300
Le mystère Soulages éblouit la science @EPFL  https://t.co/u3uNICyAdi



## Exercise 2: Evaluate retrieval results

In this exercise, we consider the scikit reference code as an “oracle” that supposedly gives the correct result. Your exercise is to compare the above tf-idf retrieval model with this oracle for the following queries "computer science", "IC school", "information systems".



In [21]:
from operator import itemgetter

# Retrieval oracle 
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 1, stop_words = 'english')
features = tf.fit_transform(original_documents)
npm_tfidf = features.todense()

# Return all document ids that that have cosine similarity with the query larger than a threshold
def search_vec_sklearn(query, features, threshold=0.1):
    new_features = tf.transform([query])
    cosine_similarities = linear_kernel(new_features, features).flatten()
    related_docs_indices, cos_sim_sorted = zip(*sorted(enumerate(cosine_similarities), key=itemgetter(1), 
                                                       reverse=True))
    doc_ids = []
    for i, cos_sim in enumerate(cos_sim_sorted):
#         print(cos_sim_sorted)
        if cos_sim < threshold:
            break
        doc_ids.append(related_docs_indices[i])
    return doc_ids

In [85]:
ret_ids = search_vec_sklearn('computer science', features)
print(ret_ids)
# for i, v in enumerate(ret_ids):
#     print(original_documents[v])

[4, 838, 795, 420, 300, 810, 713, 426, 730, 778, 131, 904, 616, 201, 1056, 600, 764, 358, 837, 524, 250, 443, 969, 49, 210, 1054]


In [14]:
queries = ["computer science", "IC school", "information systems"]

## Exercise 2.1: Compute the precision and recall at k

In [106]:
def compute_recall_at_k(predict, gt, k):
    # predict: documents of my own funcs
    # k: int. top k docs recall
    # gt: ground truth
#     tp = float(np.sum(predict[:k] == gt[:k])) FALSE!!
    # We need to use a set cuz there might be recall with slight differences =>
    # we don't care about order of retrieval
    tp = len(set(predict[:k]).intersection(set(gt[:k])))
    fn = len(gt) - tp
    recall_k = float(tp) / float(tp + fn)
    if recall_k == 0 :
        print("tp: {}, fn: {}".format(tp, fn))
    return recall_k

In [107]:
def compute_precision_at_k(predict, gt, k):
    # predict: documents of my own funcs
    # k: int. top k docs precision
    # gt: ground truth
#     tp = float(np.sum(predict[:k] == gt[:k]))
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    tp = len(set(predict[:k]).intersection(set(gt[:k])))
    fp = k - tp
    precision_k = tp / float(tp + fp)
    if precision_k == 0 :
        print("tp: {}, fn: {}".format(tp, fp))
    return precision_k
    

## Exercise 2.2: Compute the MAP score

In [111]:
# MISSING!
def compute_interpolated_precisions(prec_rec):
    max_prec = []
    current_rec = prec_rec[-1][1] # last (precision, recall)
    for k, (prec, rec) in enumerate(prec_rec):
        max_precision = prec
        for j, (prec_, rec_) in enumerate(prec_rec[k+1:]): # K+1!!!
#             if prec_ > max_precision:
            if prec_ > max_precision and rec_ >= rec:
                max_precision = prec_
        max_prec.append(max_precision)
    return max_prec

In [117]:
def compute_map(queries):
    map_score = 0
    prec_rec_dict = []
    for i, query in enumerate(queries):
#         precision_for_query = 0
        predict = search_vec(query, 10) # my own func
        gt = search_vec_sklearn(query, features) # relevant doc ids
        # to compute interpolated precision
        prec_rec = []
#         for k in range(1, len(gt)):
        for k in range(1, len(gt) + 1):
            precision_k = compute_precision_at_k(predict, gt, k)
            recall_k = compute_recall_at_k(predict, gt, k)
            print('k, precision: {}, recall: {}'.format(k, precision_k, recall_k))

#  FALSE!!           precision_for_query = precision_for_query + precision_k
            prec_rec.append((precision_k, recall_k))
    
    # Need to compute the interpolated precision!!
        precs_int = compute_interpolated_precisions(prec_rec)
        # Sum up the interpolated precision
#         map_score += sum(precs_int) / len(original_documents) #
        map_score += sum(precs_int)/len(gt) # ONLY RELEVANT DOCS!
        prec_rec_dict.append(prec_rec)
    map_score = map_score / len(queries)
    return map_score, prec_rec_dict

In [118]:
map_score, prec_rec_dict = compute_map(queries)

k, precision: 1, recall: 1.0
k, precision: 2, recall: 1.0
k, precision: 3, recall: 1.0
k, precision: 4, recall: 0.75
k, precision: 5, recall: 0.6
k, precision: 6, recall: 0.6666666666666666
k, precision: 7, recall: 0.5714285714285714
k, precision: 8, recall: 0.625
k, precision: 9, recall: 0.6666666666666666
k, precision: 10, recall: 0.7
k, precision: 11, recall: 0.6363636363636364
k, precision: 12, recall: 0.5833333333333334
k, precision: 13, recall: 0.5384615384615384
k, precision: 14, recall: 0.5
k, precision: 15, recall: 0.4666666666666667
k, precision: 16, recall: 0.4375
k, precision: 17, recall: 0.4117647058823529
k, precision: 18, recall: 0.3888888888888889
k, precision: 19, recall: 0.3684210526315789
k, precision: 20, recall: 0.35
k, precision: 21, recall: 0.3333333333333333
k, precision: 22, recall: 0.3181818181818182
k, precision: 23, recall: 0.30434782608695654
k, precision: 24, recall: 0.2916666666666667
k, precision: 25, recall: 0.28
k, precision: 26, recall: 0.269230769230

In [119]:
map_score

0.5874681113152389