# Information Retrieval and Web Analytics

# Part 2: Indexing and evaluation


In [50]:
# mount google drive if using google collab, else skip
# we are not using it because it is more comfortable to use jupyter lab

BASEDIR = '.'

try:
    from google.colab import drive
    drive.mount('/content/drive')
    BASEDIR = 'drive/MyDrive'
    
except ModuleNotFoundError:
    pass

In [51]:
# required imports for the notebook

import json
import csv
import math
import numpy as np
from array import array
from collections import defaultdict, Counter
import functools


from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pandas as pd

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rafaelbardisarodes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
import time

RED = "\033[91m"
WHITE = "\033[0m"


def benchmark(func):
    """
    Decorador que te mide el tiempo que tarda la funcion en ejecutarse.
    Se puede usar como cualquier funcion, e.g. benchmark(func),
    pero al ser un decorador la gracia que tiene es que al hacer
    @benchmark
    def func():...
    cada vez que uses func() estaras usando benchmark(func)()
    :param func: la funcion que quieres testear
    :return: la funcion original envuelta por el codigo de testeo
    """
    def inner(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        end = time.perf_counter()
        print(f'Time taken for {RED}{func.__name__}{WHITE}: {end - start:.4f}')
        return result

    return inner

In [53]:
# open results from last practice
tweets = pd.read_csv(f'{BASEDIR}/data/processed_tweets.csv')
tweets = tweets.reset_index()  # make sure indexes pair with number of rows

### Inverted index

In [54]:
# reuse of the function shown in class to transform text into lowercase and erase stop words in queries
def build_terms(line):
    """
    Preprocess the line removing stop words, stemming,
    transforming in lowercase and return the tokens of the text.
    
    Argument:
    line -- string (text) to be preprocessed
    
    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    line = line.lower()
    line = line.split()  # Tokenize the text to get a list of terms
    line = [x for x in line if x not in stop_words]  # eliminate the stopwords
    line = [stemmer.stem(word) for word in line] # perform stemming (HINT: use List Comprehension)
    return line


@benchmark
def create_index(tweets):
    """
    Implement the inverted index
    
    Argument:
    lines -- collection of Wikipedia articles
    
    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of documents where these keys appears in (and the positions) as values.
    """
    index = defaultdict(list)
    title_index = {}  # dictionary to map page titles to page ids

    for tweet in tweets.itertuples(index=True):  # Remember, lines contain all documents from file
        tweet_text = tweet.full_text
        
        # tweet id
        tweet_id = int(tweet.id.split("_")[1])

        terms = str(tweet_text).split(" ")  # page_title + page_text

        title_index[tweet_id] = tweet.user  ## we do not need to apply get terms to title because it used only to print titles and not in the index
        
        ## ===============================================================        
        ## create the index for the current page and store it in current_page_index (current_page_index)
        ## current_page_index ==> { ‘term1’: [current_doc, [list of positions]], ...,‘term_n’: [current_doc, [list of positions]]}

        ## Example: if the curr_doc has id 1 and his text is "web retrieval information retrieval":

        ## current_page_index ==> { ‘web’: [1, [0]], ‘retrieval’: [1, [1,4]], ‘information’: [1, [2]]}

        ## the term ‘web’ appears in document 1 in positions 0, 
        ## the term ‘retrieval’ appears in document 1 in positions 1 and 4
        ## ===============================================================

        current_page_index = {}

        for position, term in enumerate(terms): # terms contains page_title + page_text. Loop over all terms
            try:
                # if the term is already in the index for the current page (current_page_index)
                # append the position to the corresponding list
                current_page_index[term][1].append(position)
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_page_index[term] = [tweet_id, array('I', [position])]  #'I' indicates unsigned int (int in Python)

        # merge the current page index with the main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)
    return index, title_index


In [55]:
index, title_index= create_index(tweets)

Time taken for [91mcreate_index[0m: 1.5668


### 5 Text queries

In [56]:
# def query(text)?:
# build terms(query)

# index = tremendo index
# foreach term in query
# index = index[term in entry] <- boolean mask, conjunctive AND

# return index <- the doc ids
# or
# return tweets[original_text][index] <- get original text of tweets containing all elements in query

def query(text, tweet_index=""):
    """
    search for a given text in the tweet collection using the
    inverted index we previously computed
    :param text: the query text
    :param tweet_index: inverted index of the collection, named as such because context of practice
    :return: list of tweet ids containing all (treated) terms in the query
    """
    
    # necessary step since same treatment applied to tweets
    terms = build_terms(text)
    
    # select tweet index, defaults to global index but can be specified
    tweet_index = tweet_index if tweet_index else index
    
    plausible_ids = []
    for query_term in terms:
        # tweet_index[query_term] is list of tweet ids containing query term + position(s) in text, could be useful in the future
        # plausible_ids[query_term] = tweet_index[query_term]
        
        # using sets is convenient for using reduce
        plausible_ids.append(set(term_pos[0] for term_pos in tweet_index[query_term]))
        
    # reduce list of sets to intersection of all
    relevant_ids = functools.reduce(lambda a, b: a.intersection(b), plausible_ids)
                             
    return relevant_ids
        

In [57]:
query("keeps us")

{1, 484, 762, 2045, 3377, 3996}

### Ranking results

In [113]:
# rank(query):
# query = build_terms(query)

# scores = {}
# length = {}
# foreach term in query:
# w_q = TF-IDF(term, query), docids = query(term)
# foreach docid in docids:
# scores[docid] += TF-IDF(term, tweets.full_text[docid]) * w_q
# ::
# foreach docid:
# scores[d] /= len(tweets.full_text[docid])

# sort scores, return top K


# relevant documents = query(query)
# foreach document in relevant_documents
# TF-IDF(document, query)



# TF-IDF(document, query):

# len(query(term)) is df(term if term is one word 


def tf_idf(term_freq, document_freq, collection_len):
    if term_freq == 0 or document_freq == 0:
        return 0
    return (1 + math.log(term_freq)) * math.log(collection_len/document_freq)


def cosine_score(query_text, collection_index="", collection="", k=10):
               
    collection_index = collection_index if collection_index else index
    collection = collection if collection else {tweet.id: tweet.full_text for tweet in tweets.itertuples(index=True)}
    collection_len = len(collection)
           
    scores = {doc_id: 0 for doc_id in collection.keys()}
    length = {doc_id: len(str(document).split(" ")) for doc_id, document in collection.items()}
    
    query_terms = build_terms(query_text) # necessary step since same treatment applied to tweets
    
    # dictionary of frequency of each term in the query
    query_frequencies = Counter(query_terms)
        
    for term in query_terms:
        
        # query of a term returns the set of documents containing the term
        document_freq = len(query(term, tweet_index=collection_index))
        
        query_weight = tf_idf(query_frequencies[term], document_freq, collection_len)
        
        # hasta aqui esta bien probablemente, despues pasa algo raro
        for doc_id, document in collection.items():
            # counter of distinct terms in document
            term_frequencies = Counter(str(document).split(" "))
            document_weight = tf_idf(term_frequencies[term], document_freq, collection_len)
            scores[doc_id] += query_weight * document_weight

            
    for doc_id, doc_len in length.items():
        scores[doc_id] = scores[doc_id] / doc_len
        
    doc_ids_sorted = sorted(scores, key=scores.get, reverse=True)[:k]
    return {doc_id: scores[doc_id] for doc_id in doc_ids_sorted}


In [116]:
cosine_score("keep us posted")

{'doc_1069': 7.227550714690679,
 'doc_433': 6.396429655820483,
 'doc_1873': 5.420663036018009,
 'doc_3377': 5.391766993819088,
 'doc_1320': 5.037112527693429,
 'doc_2677': 5.037112527693429,
 'doc_1546': 4.336530428814408,
 'doc_1': 3.8512621384422054,
 'doc_2640': 3.6137753573453395,
 'doc_3175': 2.5185562638467145}

In [80]:

A = {"f": 3, "fds": 67, "d": 2}

B = sorted(A, key=A.get, reverse=True)[:2]
{key: A[key] for key in B}

{'fds': 67, 'f': 3}

In [75]:
test = Counter(build_terms("keeps us"))
len(query("help"))
len(tweets.full_text)
for idx, tweet in enumerate(tweets.full_text):
    print(idx, end="\r")
test["a"]
{tweet.id: tweet.full_text for tweet in tweets.itertuples(index=True)}

3999

{'doc_1': 'keep spin us 7 pm…go away already.',
 'doc_2': 'heart go affect wish everyon road current brave condit safe travels. 💙',
 'doc_3': 'kissimme neighborhood michigan ave.',
 'doc_4': 'one tree backyard scare poltergeist tree it’ storm windi like this.',
 'doc_5': 'pray everyon affect associ winknews. sympathi anim abusers, liars, condon it.',
 'doc_6': 'ace handyman servic hope everyon safe hurricane. damag caus hurrican first priority! call schedul appoint one multi-skil craftsmen today! 📞813-565-2022',
 'doc_7': 'storm surg issu georgetown, sc',
 'doc_8': 'thought students, teachers, parents, communities, suffer wake hurrican ian.',
 'doc_9': 'brace make landfal within hour via',
 'doc_10': 'piss god send florida south carolina!? cult anger god pay sins.',
 'doc_11': "today' edit smoke eater photo essay also, anim bleps.",
 'doc_12': "offici made landfal 2:15 pm near georgetown, sc. ian made landfal cat 1 sustain wind 85 mph. ian' 3rd landfall.",
 'doc_13': 'togeth rais $20,0

### Evaluation

In [None]:
# useful functions for the evaluation
def precision_k (y_true, y_score, k=10):
    order = y_score.argsort()[::-1]
    y_true = y_true.take(order)
    relevant = np.sum(y_true[:k])
    return float(relevant/k)

def recall_k (y_true, y_score, k=10):
    order = y_score.argsort()[::-1]
    y_true = y_true.take(order)
    relevant = np.sum(y_true[:k])
    total_relevant = np.sum(y_true[:])
    return float(relevant/total_relevant)

def f1_score(precision, recall):
    return (2*precision*recall)/(precision+recall)

def avg_precision_at_k(doc_score, y_score, k=10):
    gtp = np.sum(doc_score == 1)
    order = np.argsort(y_score)[::-1]
    doc_score = np.take(doc_score, order[:k])
    if gtp == 0:
        return 0
    n_relevant_at_i = 0
    prec_at_i = 0
    for i in range(len(doc_score)):
        if doc_score[i] == 1:
            n_relevant_at_i += 1
            prec_at_i += n_relevant_at_i / (i + 1)
    return prec_at_i / gtp

def map_at_k(search_res, k=10):
    avp = []
    for q in search_res["query_id"].unique():
        curr_data = search_res[search_res["query_id"] == q]
        avp.append(avg_precision_at_k(np.array(curr_data["is_relevant"]), 
                   np.array(curr_data["predicted_relevance"]), k))
    return np.sum(avp) / len(avp), avp

def rr_at_k(doc_score, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    doc_score = np.take(doc_score, order[:k])
    if np.sum(doc_score) == 0:
        return 0
    return 1 / (np.argmax(doc_score == 1) + 1)

def dcg_at_k(doc_score, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    doc_score = np.take(doc_score, order[:k])
    gain = 2 ** doc_score - 1
    discounts = np.log2(np.arange(len(doc_score)) + 2)
    return np.sum(gain / discounts)


def ndcg_at_k(doc_score, y_score, k=10):
    dcg_max = dcg_at_k(doc_score, doc_score, k)
    if not dcg_max:
        return 0
    return np.round(dcg_at_k(doc_score, y_score, k) / dcg_max, 4)

#### Baseline


In [95]:
# read the new csv file as a dataframe
with open(f'{BASEDIR}/data/evaluation_gt.csv', 'r') as file:
    ev_array = file.readlines()
    ev_array = [row.rstrip().split(',') for row in ev_array]
df = pd.DataFrame(ev_array[1:], 
             columns=[ev_array[0]])

In [96]:
current_query_res = df[df["query_id"] == 0]
k = 10
print("==> Precision@{}: {}\n".format(k, precision_k(current_query_res["label"], current_query_res["label"], k)))


AttributeError: ignored

#### Defined queries


In [80]:
df

Unnamed: 0,doc,query_id,label
0,doc_12,1,1
1,doc_9,1,1
2,doc_18,1,1
3,doc_45,1,1
4,doc_501,1,1
5,doc_52,1,1
6,doc_82,1,1
7,doc_100,1,1
8,doc_122,1,1
9,doc_165,1,1


### Tweet representation