# Information Retrieval and Web Analytics

# Part 1: Text Processing


In [1]:
# mount google drive if using google collab, else skip
# we are not using it because it is more comfortable to use jupyter lab

try:
    from google.colab import drive
    drive.mount('/content/drive')
    
except ModuleNotFoundError:
    pass

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# required imports for the notebook

import json
import csv

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import pandas as pd

In [3]:
# read the json file as a dataframe
df = pd.read_json('drive/MyDrive/data/tw_hurricane_data.json',lines=True)

In [4]:
# create a dataframe with the features wanted
tweets = df[['id','full_text', 'user','created_at','entities', 'favorite_count', 'retweet_count']].copy()

In [5]:
# correct different features inside the dataframe
# create lists for the features we want to modify
url=[]
has=[]
user=[]
# iterate through the whole dataset
for ele in range(len(tweets)):
  has.append(tweets['entities'][ele]['hashtags'][0]["text"])  # extract the hashtags
  user.append(tweets['user'][ele]['name'])                    # extract user names
  try:
    url.append(tweets['entities'][ele]['media'][0]["expanded_url"]) # extract url if this exists
  except: 
    url.append('')
# assign this lists to columns to dataframe
tweets['url'] = url
tweets['hashtags'] = has
tweets['user'] = user

In [6]:
# drop the column used before that would not be needed after
tweets.drop(['entities'], axis=1)

Unnamed: 0,id,full_text,user,created_at,favorite_count,retweet_count,url,hashtags
0,1575918182698979328,So this will keep spinning over us until 7 pm…...,Suz👻,2022-09-30 18:39:08+00:00,0,0,https://twitter.com/suzjdean/status/1575918182...,HurricaneIan
1,1575918151862304768,Our hearts go out to all those affected by #Hu...,Lytx,2022-09-30 18:39:01+00:00,0,0,,HurricaneIan
2,1575918140839673873,Kissimmee neighborhood off of Michigan Ave. \n...,Christopher Heath,2022-09-30 18:38:58+00:00,0,0,https://twitter.com/CHeathWFTV/status/15759181...,HurricaneIan
3,1575918135009738752,I have this one tree in my backyard that scare...,alex ✨,2022-09-30 18:38:57+00:00,0,0,,scwx
4,1575918119251419136,@AshleyRuizWx @Stephan89441722 @lilmizzheidi @...,Tess 💋,2022-09-30 18:38:53+00:00,0,0,,HurricaneIan
...,...,...,...,...,...,...,...,...
3995,1575856268022992896,"The CFRD, @CarrboroPD , Carrboro Public Works,...",Carrboro Fire-Rescue,2022-09-30 14:33:06+00:00,2,0,,CarrboroSafe
3996,1575856245650919424,Why isn’t @OsceolaCountyFl listed on the @fema...,BaconBitsNews,2022-09-30 14:33:01+00:00,0,0,,Kissimmee
3997,1575856228886089728,So it really wasn't #HurricaneIan that flooded...,@jganyfl,2022-09-30 14:32:57+00:00,16,8,https://twitter.com/jganyfl1/status/1575856228...,HurricaneIan
3998,1575856226139017216,Damage in my area in Punta Gorda...a thread. I...,CJ Haddad,2022-09-30 14:32:56+00:00,2,1,https://twitter.com/haddad_cj/status/157585622...,HurricaneIan


In [7]:
# reuse of the function shown in class to transform text into lowercase and erase stop words...
def build_terms(line):
    """
    Preprocess the article text (title + body) removing stop words, stemming,
    transforming in lowercase and return the tokens of the text.
    
    Argument:
    line -- string (text) to be preprocessed
    
    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    line = line.lower()
    line = line.split()  # Tokenize the text to get a list of terms
    line = [x for x in line if x not in stop_words]  # eliminate the stopwords
    line = [stemmer.stem(word) for word in line] # perform stemming (HINT: use List Comprehension)
    return line

In [8]:
# use the function above to correct the tweet and also convert into lowercase the hastags and usernames
for ele in range(len(tweets)):
  text = build_terms(tweets['full_text'][ele])
  text = [word for word in text if word.startswith('#')==False]
  text = [word for word in text if word.startswith('@')==False]
  text = [word for word in text if word.startswith('http')==False]
  tweets['full_text'][ele] = ' '.join(text)
  tweets['user'][ele] = tweets['user'][ele].lower()
  tweets['hashtags'][ele] = tweets['hashtags'][ele].lower()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [9]:
# get dictionary to map tweet ids to doc ids
# we know the ids file is a list of [doc_id \t tweet_id]
with open("drive/MyDrive/data/tweet_document_ids_map.csv", "r") as id_file:
    ids = csv.reader(id_file, delimiter="\t")
    dict_ids = {id_to_id[1]: id_to_id[0] for id_to_id in list(ids)}

In [10]:
# map tweet ids with doc ids
for ele in range(len(tweets)):
    tweet_id = str(tweets["id"][ele])
    tweets["id"][ele] = dict_ids[tweet_id]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [11]:
# get the head of the dataframe to visualize our result
tweets.head()

Unnamed: 0,id,full_text,user,created_at,entities,favorite_count,retweet_count,url,hashtags
0,doc_1,keep spin us 7 pm…go away already.,suz👻,2022-09-30 18:39:08+00:00,"{'hashtags': [{'text': 'HurricaneIan', 'indice...",0,0,https://twitter.com/suzjdean/status/1575918182...,hurricaneian
1,doc_2,heart go affect wish everyon road current brav...,lytx,2022-09-30 18:39:01+00:00,"{'hashtags': [{'text': 'HurricaneIan', 'indice...",0,0,,hurricaneian
2,doc_3,kissimme neighborhood michigan ave.,christopher heath,2022-09-30 18:38:58+00:00,"{'hashtags': [{'text': 'HurricaneIan', 'indice...",0,0,https://twitter.com/CHeathWFTV/status/15759181...,hurricaneian
3,doc_4,one tree backyard scare poltergeist tree it’ s...,alex ✨,2022-09-30 18:38:57+00:00,"{'hashtags': [{'text': 'scwx', 'indices': [122...",0,0,,scwx
4,doc_5,pray everyon affect associ winknews. sympathi ...,tess 💋,2022-09-30 18:38:53+00:00,"{'hashtags': [{'text': 'HurricaneIan', 'indice...",0,0,,hurricaneian
