# Information Retrieval and Web Analytics

# Part 3: Ranking


In [1]:
# mount google drive if using google collab, else skip
# we are not using it because it is more comfortable to use jupyter lab

BASEDIR = '.'

try:
    from google.colab import drive
    drive.mount('/content/drive')
    BASEDIR = 'drive/MyDrive'
    
except ModuleNotFoundError:
    pass

In [2]:
# required imports for the notebook

import json
import csv
import math
import numpy as np
from array import array
from collections import defaultdict, Counter
import functools
from sklearn.manifold import TSNE


from nltk.stem import PorterStemmer

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
import pandas as pd
import rank_bm25

from utils import *

# queries for testing
queries = [
    "keep us posted",
    "ian update",
    "disney world",
    "climate change",
    "hit state"
]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rafaelbardisarodes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# open results from last practice
tweets = pd.read_csv(f'{BASEDIR}/data/processed_tweets.csv')
tweets = tweets.reset_index()  # make sure indexes pair with number of rows

### Content from practice part 2

In [4]:
# reuse of the function shown in class to transform text into lowercase and erase stop words in queries
def build_terms(line):
    """
    Preprocess the line removing stop words, stemming,
    transforming in lowercase and return the tokens of the text.
    
    Argument:
    line -- string (text) to be preprocessed
    
    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    line = str(line).lower()
    
    # tremendo pero aligual rompe algo despues
    line = remove_punctuation(line)
    
    line = line.split()  # Tokenize the text to get a list of terms
    line = [x for x in line if x not in stop_words]  # eliminate the stopwords
    line = [stemmer.stem(word) for word in line] # perform stemming (HINT: use List Comprehension)
    return line


@benchmark
def create_index(tweets):
    """
    Implement the inverted index
    
    Argument:
    lines -- collection of Wikipedia articles
    
    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of documents where these keys appears in (and the positions) as values.
    """
    index = defaultdict(list)
    title_index = {}  # dictionary to map page titles to page ids

    for tweet in tweets.itertuples(index=True):  # Remember, lines contain all documents from file
        tweet_text = tweet.full_text
        
        # tweet id
        tweet_id = int(tweet.id.split("_")[1])

        terms = str(tweet_text).split(" ")  # page_title + page_text

        title_index[tweet_id] = tweet.user  ## we do not need to apply get terms to title because it used only to print titles and not in the index
        
        ## ===============================================================        
        ## create the index for the current page and store it in current_page_index (current_page_index)
        ## current_page_index ==> { ‘term1’: [current_doc, [list of positions]], ...,‘term_n’: [current_doc, [list of positions]]}

        ## Example: if the curr_doc has id 1 and his text is "web retrieval information retrieval":

        ## current_page_index ==> { ‘web’: [1, [0]], ‘retrieval’: [1, [1,4]], ‘information’: [1, [2]]}

        ## the term ‘web’ appears in document 1 in positions 0, 
        ## the term ‘retrieval’ appears in document 1 in positions 1 and 4
        ## ===============================================================

        current_page_index = {}

        for position, term in enumerate(terms): # terms contains page_title + page_text. Loop over all terms
            try:
                # if the term is already in the index for the current page (current_page_index)
                # append the position to the corresponding list
                current_page_index[term][1].append(position)
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_page_index[term] = [tweet_id, array('I', [position])]  #'I' indicates unsigned int (int in Python)

        # merge the current page index with the main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)
    return index, title_index


In [5]:
index, title_index= create_index(tweets)

Time taken for [91mcreate_index[0m: 0.1900


### 5 Text queries

In [6]:
# def query(text)?:
# build terms(query)

# index = tremendo index
# foreach term in query
# index = index[term in entry] <- boolean mask, conjunctive AND

# return index <- the doc ids
# or
# return tweets[original_text][index] <- get original text of tweets containing all elements in query

def query(text, tweet_index=""):
    """
    search for a given text in the tweet collection using the
    inverted index we previously computed
    :param text: the query text
    :param tweet_index: inverted index of the collection, named as such because context of practice
    :return: list of tweet ids containing all (treated) terms in the query
    """
    
    # necessary step since same treatment applied to tweets
    terms = build_terms(text)
    
    # select tweet index, defaults to global index but can be specified
    tweet_index = tweet_index if tweet_index else index
    
    plausible_ids = []
    for query_term in terms:
        # tweet_index[query_term] is list of tweet ids containing query term + position(s) in text, could be useful in the future
        # plausible_ids[query_term] = tweet_index[query_term]
        
        # using sets is convenient for using reduce
        plausible_ids.append(set(term_pos[0] for term_pos in tweet_index[query_term]))
        
    # reduce list of sets to intersection of all
    relevant_ids = functools.reduce(lambda a, b: a.intersection(b), plausible_ids) if plausible_ids else []
                             
    return relevant_ids
        

In [7]:
query("")

[]

### Ranking results: TF-IDF + cosine score

In [8]:
# rank(query):
# query = build_terms(query)

# scores = {}
# length = {}
# foreach term in query:
# w_q = TF-IDF(term, query), docids = query(term)
# foreach docid in docids:
# scores[docid] += TF-IDF(term, tweets.full_text[docid]) * w_q
# ::
# foreach docid:
# scores[d] /= len(tweets.full_text[docid])

# sort scores, return top K


# relevant documents = query(query)
# foreach document in relevant_documents
# TF-IDF(document, query)



# TF-IDF(document, query):

# len(query(term)) is df(term) if term is one word 


def tf_idf(term_freq, document_freq, collection_len):
    if term_freq == 0 or document_freq == 0:
        return 0
    return (1 + math.log(term_freq)) * math.log(collection_len/document_freq)


def doc_score(doc_id, collection_index=index, collection=""):
    """
    vector de scores para el documento dado, es lo que hay que usar para
    la document length
    
    tremendo usarlo como {doc_id: doc_score(doc_id)} para todos los ids
    :param doc_id: document id que mirar
    :params: se supone que así será más flexible pero los defaults van finos asi que na
    :return: diccionario de terms y pesos, util para normalizar documentos
    """
    result={}
    
    collection = collection if collection else {tweet.id: tweet.full_text for tweet in tweets.itertuples(index=True)}
    collection_len = len(collection)
    
    document = str(collection[doc_id]).split(" ")
    term_frequencies = Counter(document)
    
    for term in document:
        document_freq = len(query(term, tweet_index=collection_index))
        result[term] = tf_idf(term_frequencies[term], document_freq, collection_len)
    return result


@benchmark
def collection_vectors(collection="", collection_index=index):
    """
    multi diccionario de documentos, terms y sus valores tf-idf
    """
    document_vectors = {}
    
    collection = collection if collection else {tweet.id: tweet.full_text for tweet in tweets.itertuples(index=True)}
    for doc_id, document in collection.items():
        document_vectors[doc_id] = doc_score(doc_id, collection_index=collection_index, collection=collection)
        
    return document_vectors


In [9]:
document_lengths = collection_vectors()

Time taken for [91mcollection_vectors[0m: 13.2206


In [10]:
def cosine_score(query_text, collection_index=index, collection="", lengths=document_lengths, k=10):
    """
    computes cosine score of all documents in a collection against a query and ranks them
    accordingly
    """

    collection = collection if collection else {tweet.id: tweet.full_text for tweet in tweets.itertuples(index=True)}
    collection_len = len(collection)
           
    scores = {doc_id: 0 for doc_id in collection.keys()}
    
    # esto seguramente este mal
    # length = {doc_id: len(str(document).split(" ")) for doc_id, document in collection.items()}
    
    query_terms = build_terms(query_text) # necessary step since same treatment applied to tweets
    
    # dictionary of frequency of each term in the query
    query_frequencies = Counter(query_terms)
    
    for term in query_terms:
        # query of a term returns the set of documents containing the term
        document_freq = len(query(term, tweet_index=collection_index))
        
        query_weight = tf_idf(query_frequencies[term], document_freq, collection_len)
        
        """
        for term in query_terms:
        
        # query of a term returns the set of documents containing the term
        document_freq = len(query(term, tweet_index=collection_index))
        
        query_weight = tf_idf(query_frequencies[term], document_freq, collection_len)
        
        # hasta aqui esta bien probablemente, despues pasa algo raro
        for doc_id, document in collection.items():
            # counter of distinct terms in document
            term_frequencies = Counter(str(document).split(" "))
            document_weight = tf_idf(term_frequencies[term], document_freq, collection_len)
            scores[doc_id] += query_weight * document_weight
        """
            
        for doc_id, document in collection.items():

            term_frequencies = Counter(str(document).split(" "))
            document_weight = tf_idf(term_frequencies[term], document_freq, collection_len)

            doc_vec = list(lengths[doc_id].values())
            scores[doc_id] = query_weight * document_weight
            
    scores = {doc_id: score/np.linalg.norm(list(lengths[doc_id].values())) for doc_id, score in scores.items()}
        
    # if k is 0 return whole doc id list
    k = k if k else collection_len
    
    doc_ids_sorted = sorted(scores, key=scores.get, reverse=True)[:k]
    return {doc_id: scores[doc_id] for doc_id in doc_ids_sorted}

In [11]:
def our_score(query_text, collection_index=index, collection="", lengths=document_lengths, k=10):
    """
    uhh multiply tf idf score by log popularity of tweet so our ranking is sensitive to tweet popularity
    """
    collection = collection if collection else {tweet.id: tweet.full_text for tweet in tweets.itertuples(index=True)}
    
    # tf-idf and cosine score
    base_score = cosine_score(query_text, collection_index=index, collection=collection, lengths=document_lengths, k=0)
    
    return base_score

In [12]:
our_score("keep us posted")

  scores = {doc_id: score/np.linalg.norm(list(lengths[doc_id].values())) for doc_id, score in scores.items()}


{'doc_1546': 2.3995671065397546,
 'doc_1069': 2.248496421335798,
 'doc_1873': 1.9930095562822965,
 'doc_2330': 1.8829233229707707,
 'doc_866': 1.631200350199638,
 'doc_2640': 1.4893358764902744,
 'doc_558': 1.4580932450433082,
 'doc_91': 1.2663704245846188,
 'doc_1390': 1.2356896416202623,
 'doc_2863': 1.2321737061746545,
 'doc_3700': 1.2268644274196678,
 'doc_3870': 1.1861659465221739,
 'doc_3923': 1.1847728686276873,
 'doc_2387': 1.1451752376155806,
 'doc_1008': 1.1181990613896085,
 'doc_2643': 1.114729526185563,
 'doc_2862': 1.0764840789644972,
 'doc_2415': 1.0726214892060202,
 'doc_342': 1.070452944240281,
 'doc_3259': 1.0633127804861142,
 'doc_1427': 1.0618494113841714,
 'doc_114': 1.048422548058951,
 'doc_2294': 0.9871546693794904,
 'doc_1245': 0.9668915051119629,
 'doc_2842': 0.9631461246650799,
 'doc_2160': 0.9284837746364702,
 'doc_2578': 0.9211371192238395,
 'doc_1187': 0.8918884604357993,
 'doc_625': 0.8911903622796723,
 'doc_3456': 0.870185300258331,
 'doc_2384': 0.86873381

In [13]:
tweets[tweets['id'] == 'doc_3870']['full_text']

3869    sorri post much storm. hope understand. here’ ...
Name: full_text, dtype: object

### Ranking results: our score + cosine similarity 

### Ranking results: BM25

In [14]:
from rank_bm25 import BM25Okapi

corpus = [
    "Hello there good man!",
    "It is quite windy in London",
    "How is the weather today?"
]

bm_collection = {tweet.id: build_terms(tweet.full_text) for tweet in tweets.itertuples(index=True)}
bm_docs = bm_collection.values()
#bm_collection = [build_terms(tweet.full_text) for tweet in tweets.itertuples(index=True)]

bm25 = BM25Okapi(bm_docs)

In [15]:
query_test = "keep us"
retrieved_docs = bm25.get_top_n(build_terms(query_test), list(bm_collection.values()), n=20)

bm_doc_ids = bm_collection.keys()

for doc in retrieved_docs:
    print(f"{RED}{list(bm_doc_ids)[list(bm_docs).index(doc)]}:{WHITE}\t{doc}")

[91mdoc_3377:[0m	['thank', 'plea', 'keep', 'us', 'post']
[91mdoc_1:[0m	['keep', 'spin', 'us', 'pm…go', 'away', 'alreadi']
[91mdoc_433:[0m	['keep', 'swimmingjust', 'keep', 'swim']
[91mdoc_2045:[0m	['hurrican', 'ian', 'left', 'devast', 'impact', 'mani', 'part', 'florida', 'join', 'us', 'keep', 'impact', 'storm', 'thought']
[91mdoc_2677:[0m	['keep', 'pray', 'path']
[91mdoc_1320:[0m	['keep', 'friend', 'safe']
[91mdoc_3996:[0m	['cfrd', 'carrboro', 'public', 'work', 'servic', 'place', 'stand', 'respond', 'best', 'thing', 'help', 'us', 'stay', 'home', 'abl', 'help', 'keep']
[91mdoc_2925:[0m	['🌊⛈🌴sinc', 'underwat', 'due', 'keep', 'rebuild']
[91mdoc_3175:[0m	['can’t', 'keep', 'buy', 'muln', 'today', '🌀💨🌧']
[91mdoc_2776:[0m	['best', 'place', 'keep', 'make', 'landfal', 'home', 'state', 'sc', 'keep', 'famili', 'low', 'countri', 'prayer', '🙏🏻', '🌀', '🌬', '🌧']
[91mdoc_2046:[0m	['plea', 'share', 'lifesav', 'tip', 'keep', 'anim', 'safe']
[91mdoc_2114:[0m	['keep', 'close', 'eye'

### Top-20 list of documents