# Information Retrieval and Web Analytics

# Part 1: Text Processing


In [1]:
# mount google drive if using google collab, else skip
# we are not using it because it is more comfortable to use jupyter lab

BASEDIR = '.'

try:
    from google.colab import drive
    drive.mount('/content/drive')
    BASEDIR = 'drive/MyDrive'
    
except ModuleNotFoundError:
    pass

In [2]:
# required imports for the notebook

import json
import csv

from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pandas as pd

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rafaelbardisarodes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# read the json file as a dataframe
df = pd.read_json(f'{BASEDIR}/data/tw_hurricane_data.json',lines=True)

In [4]:
# create a dataframe with the features wanted
tweets = df[['id','full_text', 'user','created_at','entities', 'favorite_count', 'retweet_count']].copy()

In [5]:
# correct different features inside the dataframe
# create lists for the features we want to modify
url=[]
hashtags=[]
user=[]
# iterate through the whole dataset
for ele in range(len(tweets)):
    hashtags.append([hashtag['text'] for hashtag in tweets['entities'][ele]['hashtags']])  # extract the hashtags
    user.append(tweets['user'][ele]['name'])                    # extract user names
    try:
        url.append(tweets['entities'][ele]['media'][0]["expanded_url"]) # extract url if this exists
    except: 
        url.append('')
# assign this lists to columns to dataframe
tweets['url'] = url
tweets['hashtags'] = hashtags
tweets['user'] = user

In [6]:
# drop the column used before that would not be needed after
tweets.drop(['entities'], axis=1, inplace=True)

In [7]:
def remove_punctuation(text):

    """
    Removes the characters:
    !\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~0123456789
    from the text.
    """

    chars_to_remove = "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~0123456789"

    tr = str.maketrans("", "", chars_to_remove)

    return text.translate(tr)

In [8]:
# reuse of the function shown in class to transform text into lowercase and erase stop words...
def build_terms(line):
    """
    Preprocess the article text (title + body) removing stop words, stemming,
    transforming in lowercase and return the tokens of the text.
    
    Argument:
    line -- string (text) to be preprocessed
    
    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    line = line.lower()
    
    # tremendo pero aligual rompe algo despues
    line = remove_punctuation(line)
    
    line = line.split()  # Tokenize the text to get a list of terms
    line = [x for x in line if x not in stop_words]  # eliminate the stopwords
    line = [stemmer.stem(word) for word in line] # perform stemming (HINT: use List Comprehension)
    return line

In [9]:
# use the function above to correct the tweet and also convert into lowercase the hastags and usernames
for ele in range(len(tweets)):
    text = build_terms(tweets['full_text'][ele])
    text = [word for word in text if word.startswith('#')==False]
    text = [word for word in text if word.startswith('@')==False]
    text = [word for word in text if word.startswith('http')==False]
    tweets['full_text'][ele] = ' '.join(text)
    tweets['user'][ele] = tweets['user'][ele].lower()
    # hashtags may be more than one, so apply lowercase function to all its elements
    tweets['hashtags'][ele] = list(map(str.lower, tweets['hashtags'][ele]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['full_text'][ele] = ' '.join(text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['user'][ele] = tweets['user'][ele].lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['hashtags'][ele] = list(map(str.lower, tweets['hashtags'][ele]))


In [10]:
# get dictionary to map tweet ids to doc ids
# we know the ids file is a list of [doc_id \t tweet_id]
with open(f'{BASEDIR}/data/tweet_document_ids_map.csv', 'r') as id_file:
    ids = csv.reader(id_file, delimiter="\t")
    dict_ids = {id_to_id[1]: id_to_id[0] for id_to_id in list(ids)}

In [11]:
# map tweet ids with doc ids
for ele in range(len(tweets)):
    tweet_id = str(tweets["id"][ele])
    tweets["id"][ele] = dict_ids[tweet_id]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets["id"][ele] = dict_ids[tweet_id]


In [12]:
# get the head of the dataframe to visualize our result
tweets.head()

Unnamed: 0,id,full_text,user,created_at,favorite_count,retweet_count,url,hashtags
0,doc_1,keep spin us pm…go away alreadi hurricaneian,suz👻,2022-09-30 18:39:08+00:00,0,0,https://twitter.com/suzjdean/status/1575918182...,[hurricaneian]
1,doc_2,heart go affect hurricaneian wish everyon road...,lytx,2022-09-30 18:39:01+00:00,0,0,,[hurricaneian]
2,doc_3,kissimme neighborhood michigan ave hurricaneian,christopher heath,2022-09-30 18:38:58+00:00,0,0,https://twitter.com/CHeathWFTV/status/15759181...,[hurricaneian]
3,doc_4,one tree backyard scare poltergeist tree it’ s...,alex ✨,2022-09-30 18:38:57+00:00,0,0,,"[scwx, hurricaneian]"
4,doc_5,ashleyruizwx stephan lilmizzheidi mrsniffl win...,tess 💋,2022-09-30 18:38:53+00:00,0,0,,[hurricaneian]


In [13]:
# save result in a new csv file
tweets.to_csv(f'{BASEDIR}/data/processed_tweets.csv')

In [14]:
def create_index(lines):
    """
    Implement the inverted index
    
    Argument:
    lines -- collection of Wikipedia articles
    
    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of documents where these keys appears in (and the positions) as values.
    """
    index = defaultdict(list)
    title_index = {}  # dictionary to map page titles to page ids

    # ight esto no es correcto, tenemos tremenda lista de tweets, no documentos
    for line in lines:  # Remember, lines contain all documents from file
        line_arr = line.split("|")
        page_id = int(line_arr[0])

        terms = build_terms(''.join(line_arr[1:]))  # page_title + page_text

        title = line_arr[1]
        title_index[page_id] = title  ## we do not need to apply get terms to title because it used only to print titles and not in the index
        
        ## ===============================================================        
        ## create the index for the current page and store it in current_page_index (current_page_index)
        ## current_page_index ==> { ‘term1’: [current_doc, [list of positions]], ...,‘term_n’: [current_doc, [list of positions]]}

        ## Example: if the curr_doc has id 1 and his text is "web retrieval information retrieval":

        ## current_page_index ==> { ‘web’: [1, [0]], ‘retrieval’: [1, [1,4]], ‘information’: [1, [2]]}

        ## the term ‘web’ appears in document 1 in positions 0, 
        ## the term ‘retrieval’ appears in document 1 in positions 1 and 4
        ## ===============================================================

        current_page_index = {}

        for position, term in enumerate(terms): # terms contains page_title + page_text. Loop over all terms
            try:
                # if the term is already in the index for the current page (current_page_index)
                # append the position to the corresponding list
                current_page_index[term][1].append(position)
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_page_index[term] = [page_id, array('I', [position])]  #'I' indicates unsigned int (int in Python)

        # merge the current page index with the main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)
    return index, title_index