# Information Retrieval and Web Analytics

# Part 2: Indexing and evaluation


In [1]:
# mount google drive if using google collab, else skip
# we are not using it because it is more comfortable to use jupyter lab

BASEDIR = '.'

try:
    from google.colab import drive
    drive.mount('/content/drive')
    BASEDIR = 'drive/MyDrive'
    
except ModuleNotFoundError:
    pass

Mounted at /content/drive


In [62]:
# required imports for the notebook

import json
import csv
import math
import numpy as np
from array import array

from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pandas as pd

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
# open results from last practice
tweets = pd.read_csv(f'{BASEDIR}/data/processed_tweets.csv')
tweets = tweets.reset_index()  # make sure indexes pair with number of rows

### Inverted index

In [73]:
#create inverted index
def create_index(tweets):
    """
    Impleent the inverted index
    
    Argument:
    collection of tweets
    
    Returns:
    index - the inverted index containing terms as keys and the corresponding 
    list of tweets these keys appears in (and the positions) as values.
    tf - normalized term frequency for each term in each tweet
    idf - inverse document frequency of each term
    """
    
    index = [] 
    
    tf = []         # term frequencies of terms in tweets 
    df = []         # tweet frequencies of terms in the document collection
    idf = []
    
    N = len(tweets)
    
    for index, tweet in tweets.iterrows(): 
        tweet_id = tweet['id']        # get the id of the tweet
        terms = tweet['full_text']    # get the tweet content
                
        termdictTweet = {}

        for position, term in enumerate(terms): # terms in the tweet
            try:
                # if the term is already in the index for the current tweet
                # append the position to the corrisponding list
                termdictTweet[term][tweet_id].append(position)  
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                termdictTweet[term]=[tweet_id, array('I',[position])] #'I' indicates unsigned int (int in python)
        
        
        # normalize term frequencies
        # Compute the denominator to normalize term frequencies
        # norm is the same for all terms of a tweet.
        norm = 0
        for term, posting in termdictTweet.items(): 
            # posting is a list containing tweet_id and the list of positions for current term in current tweet: 
            # posting ==> [tweet_id, [list of positions]] 
            # you can use it to inferr the frequency of current term.
            norm += len(posting[1])**2
        
        norm = math.sqrt(norm)

        # calculate the tf (dividing the term frequency by the above computed norm) and df weights
        for term, posting in termdictTweet.items():     
            # append the tf for current term (tf = term frequency in current tweet/norm)
            tf[term].append(np.round(len(posting[1])/norm ,4))  
            # increment the document frequency of current term (number of tweets containing the current term)
            df[term] += 1  # increment df for current term
        
        # Compute idf 
        for term in df:
            idf[term] = np.round(np.log(float(N/df[term])),4)
        
        # merge the current tweet index with the main index
        for termpage, postingpage in termdictTweet.items():
            index[termpage].append(postingpage)
                      
                    
    return index, tf, idf


In [74]:
inverted_index, tf, idf= create_index(tweets)

TypeError: ignored

### 5 Text queries

### Ranking results

### Evaluation

In [94]:
# useful functions for the evaluation
def precision_k (y_true, y_score, k=10):
    order = y_score.argsort()[::-1]
    y_true = y_true.take(order)
    relevant = np.sum(y_true[:k])
    return float(relevant/k)

def recall_k (y_true, y_score, k=10):
    order = y_score.argsort()[::-1]
    y_true = y_true.take(order)
    relevant = np.sum(y_true[:k])
    total_relevant = np.sum(y_true[:])
    return float(relevant/total_relevant)

def f1_score(precision, recall):
    return (2*precision*recall)/(precision+recall)

def avg_precision_at_k(doc_score, y_score, k=10):
    gtp = np.sum(doc_score == 1)
    order = np.argsort(y_score)[::-1]
    doc_score = np.take(doc_score, order[:k])
    if gtp == 0:
        return 0
    n_relevant_at_i = 0
    prec_at_i = 0
    for i in range(len(doc_score)):
        if doc_score[i] == 1:
            n_relevant_at_i += 1
            prec_at_i += n_relevant_at_i / (i + 1)
    return prec_at_i / gtp

def map_at_k(search_res, k=10):
    avp = []
    for q in search_res["query_id"].unique():
        curr_data = search_res[search_res["query_id"] == q]
        avp.append(avg_precision_at_k(np.array(curr_data["is_relevant"]), 
                   np.array(curr_data["predicted_relevance"]), k))
    return np.sum(avp) / len(avp), avp

def rr_at_k(doc_score, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    doc_score = np.take(doc_score, order[:k])
    if np.sum(doc_score) == 0:
        return 0
    return 1 / (np.argmax(doc_score == 1) + 1)

def dcg_at_k(doc_score, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    doc_score = np.take(doc_score, order[:k])
    gain = 2 ** doc_score - 1
    discounts = np.log2(np.arange(len(doc_score)) + 2)
    return np.sum(gain / discounts)


def ndcg_at_k(doc_score, y_score, k=10):
    dcg_max = dcg_at_k(doc_score, doc_score, k)
    if not dcg_max:
        return 0
    return np.round(dcg_at_k(doc_score, y_score, k) / dcg_max, 4)

#### Baseline


In [95]:
# read the new csv file as a dataframe
with open(f'{BASEDIR}/data/evaluation_gt.csv', 'r') as file:
    ev_array = file.readlines()
    ev_array = [row.rstrip().split(',') for row in ev_array]
df = pd.DataFrame(ev_array[1:], 
             columns=[ev_array[0]])

In [96]:
current_query_res = df[df["query_id"] == 0]
k = 10
print("==> Precision@{}: {}\n".format(k, precision_k(current_query_res["label"], current_query_res["label"], k)))


AttributeError: ignored

#### Defined queries


In [80]:
df

Unnamed: 0,doc,query_id,label
0,doc_12,1,1
1,doc_9,1,1
2,doc_18,1,1
3,doc_45,1,1
4,doc_501,1,1
5,doc_52,1,1
6,doc_82,1,1
7,doc_100,1,1
8,doc_122,1,1
9,doc_165,1,1


### Tweet representation