In [1]:
import pandas as pd
import numpy as np
import re
from textblob import TextBlob

Features in the datasets :

- user_verified : whether the user is verified or not (converted to 0-1; already in the base dataset)
- user_statuses_count : already in the base dataset
- user_followers_count : already in the base dataset
- user_friends_count : already in the base dataset
- hour : hour of the tweet
- day : day of the tweet 
- month : month of the tweet
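- weekday : day of the week of the tweet (0 = Monday, ..., 6 = Sunday)
- weekend : 1 if the tweet was posted on a Saturday or Sunday, 0 otherwise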
- friends_followers_ratio = user_friends_count/user_followers_count
- has_hashtags : if the tweet has hashtags or not
- has_mentions : if the tweet has mentions or not
- has_urls : if the tweet has an url or not
- number_of_urls : the number of urls of a tweet
- number_of_mentions : the number of mentions of a tweet
- number_of_hashtags : the number of hashtags of a tweet
- urls_popularity : the popularity of the urls of a tweet. If the tweet has urlA and urlB, urls_popularity = max(number of occurrences of urlA in the database, number of occurrences of urlB in the database)
- hashtags_popularity : popularity of the hashtags of a tweet. The definition is similar as above
- mentions_popularity : same as above
- polarity : the polarity of a tweet computed using the textBlob library. It is a scalar between -1 and 1 that represents the positivity-negativity of the text of the tweet.
- subjectivity : the subjectivity of a tweet computed using textBlob library. It is a scalar between 0 and 1 that tells us how much the tweet is subjective

New features coming soon : the number of followers and friends of the users mentioned in a tweet. Must use the twitter API to compute this :( . 

We load both datasets and merge them because to compute the popularity of a given hashtag, url, or mention, all the data is needed.

We will then load and merge the training and test set, and split them at the end.

In [71]:
# Load the training set (df1) and the evaluation set (df2) from the local data/ folder.
df1 = pd.read_csv("data/train.csv")
df2 = pd.read_csv("data/evaluation.csv")

In [74]:
# Stack the training rows (df1) on top of the evaluation rows (df2).
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and, like append, keeps each frame's
# original index labels.
df = pd.concat([df1, df2])

In [66]:
# Preview the first ten rows of the frame.
df.head(10)

Unnamed: 0,id,timestamp,retweet_count,user_verified,user_statuses_count,user_followers_count,user_friends_count,user_mentions,urls,hashtags,text
0,0,1588696955143,0.0,False,68460,1101,1226,,,,Smh I give up
1,1,1588464948124,0.0,False,309,51,202,,,,"Most of us are Human Beings, but I think you m..."
2,2,1588634673360,0.0,False,3241,1675,2325,,,,"Old dirty tricks Trump, at it again...like we ..."
3,3,1588433158672,0.0,False,32327,667,304,,,,Seriously..... I worked 86 hours my last check...
4,4,1588582751599,0.0,False,581,42,127,,,,May ALMIGHTY ALLAH have mercy on us all. Only ...
5,5,1588434563287,0.0,False,7214,503,1126,,,,They couldn’t care less.
6,6,1588692966869,2.0,False,372,738,472,,twitter.com/i/web/status/1…,Ethiopia,Extremely valid points being made here 👇🏾 #Eth...
7,7,1588316892450,1.0,False,2085,3808,153,,twitter.com/i/web/status/1…,,COVID-19 dominated the discussion Tuesday at a...
8,8,1588625905286,0.0,False,17765,11666,40,,,,BC now has 112 patients on ventilators. 17 of...
9,9,1588604315931,0.0,False,3086,66,241,,,,a COVID-19 vaccine would be pretty lit ngl


In [4]:
# Cast the boolean user_verified flag to an integer column (False -> 0, True -> 1).

df["user_verified"] = df["user_verified"].astype(int)

In [5]:
# The timestamps appear to be Unix epoch values in milliseconds (e.g. 1588696955143),
# so we integer-divide by 1000 to get whole seconds. Note that `//` is a floor division,
# not a modulo. The original author reported getting wrong dates in 1970 without this step.

df['date']  =(df['timestamp']).astype(np.int64) // 10**3

In [6]:
# unit='s' tells pandas to interpret the integers as seconds elapsed since the Unix epoch
# (1970-01-01), which converts them into proper datetime values.

df['date']= pd.to_datetime(df['date'], unit='s')

In [7]:
# Calendar features extracted from the tweet's datetime column.
tweet_time = df["date"].dt
df["hour"] = tweet_time.hour
df["day"] = tweet_time.day
df["month"] = tweet_time.month

# Day of the week as an integer: Monday is 0, Sunday is 6.
df["weekday"] = tweet_time.weekday

# Weekend flag: 1 for Saturday (5) or Sunday (6), 0 for any other day.
df["weekend"] = np.where(df["weekday"].isin([5, 6]), 1, 0)

In [8]:
# user_friends_count divided by user_followers_count, element-wise.
# NOTE(review): no guard against a zero denominator -- pandas yields inf (or NaN for 0/0)
# wherever user_followers_count is 0; confirm downstream steps handle those values.
df["friends_followers_ratio"] = df["user_friends_count"]/df["user_followers_count"]

In [9]:
# Binary presence flags: 0 when the raw column is missing (NaN), 1 when it holds a value.
df["has_hashtags"] = np.where(df["hashtags"].isna(), 0, 1)
df["has_mentions"] = np.where(df["user_mentions"].isna(), 0, 1)
df["has_urls"] = np.where(df["urls"].isna(), 0, 1)

In [10]:
# Counts the comma-separated entries of a raw cell (a missing cell counts as zero).

def counter(word):
    """Return how many comma-separated items `word` holds.

    A missing value (as detected by pd.isna) gives 0. Otherwise the result is
    the number of commas plus one, so an empty string still counts as 1.
    """
    return 0 if pd.isna(word) else word.count(',') + 1

In [11]:
# Number of items per tweet, obtained by running counter on every raw cell.
df["number_of_urls"] = df["urls"].map(counter)
df["number_of_mentions"] = df["user_mentions"].map(counter)
df["number_of_hashtags"] = df["hashtags"].map(counter)

Now we will compute hashtags, urls, and mentions popularity. We computed it only on the given dataset. However it may be smarter to use it on ALL the data (both training and test).

We :

- Turn the urls, hashtags, and mentions into lists.
- We create dictionaries that will help us store the number of occurrences of urls, hashtags, and mentions.
- We finally compute the popularity (popularity = number of occurrences in the dataset) of the hashtags, urls, and mentions of a tweet, and then we take the maximum.

In [12]:
# An auxiliary function that splits a raw comma-separated cell into its items
# (i.e. useful to get the list of hashtags, urls or mentions of a tweet).

def word_cut(word):
    """Split a comma-separated string into a list of its items.

    Parameters
    ----------
    word : str or missing value
        Raw cell content such as "a,b,c", or a missing value (NaN / None).

    Returns
    -------
    list of str
        The items in their original order; an empty list for a missing value.
        Whitespace is not stripped and empty items are kept
        ("a,,b" -> ["a", "", "b"]).
    """
    if pd.isna(word):
        return []
    # str.split replaces the former character-by-character scan that grew a
    # NumPy array with np.append (one full copy per item). It also makes the
    # return type a plain list in every case, instead of a list for missing
    # values and an ndarray otherwise.
    return word.split(',')
    

In [46]:
# Parse each raw comma-separated column into a per-tweet collection of items.
# These *_list columns are temporary helpers; they are dropped once the
# popularity features have been computed.

df["urls_list"] = df["urls"].apply(word_cut)
df["hashtags_list"] = df["hashtags"].apply(word_cut)
df["mentions_list"] = df["user_mentions"].apply(word_cut)

In [47]:
# Occurrence counters, one per kind of element: each maps an item to its count.

urls_pop = {}
hashtags_pop = {}
mentions_pop = {}

In [48]:
# Updates the occurrence count of a given element in one of the dictionaries defined above.

def stock(x, name):
    """Record one occurrence of `x` in the counter selected by `name`.

    Parameters
    ----------
    x : hashable
        The url, hashtag or mention to count.
    name : str
        Which counter to update: "urls" -> urls_pop, "hashtags" -> hashtags_pop,
        "mentions" -> mentions_pop. Any other value leaves every counter untouched.

    Returns
    -------
    None
        The selected module-level dictionary is updated in place.
    """
    if name == "urls":
        counts = urls_pop
    elif name == "hashtags":
        counts = hashtags_pop
    elif name == "mentions":
        counts = mentions_pop
    else:
        return
    # The first sighting must count as 1. It used to be stored as 0, which made
    # every count one too low and made an item seen exactly once look the same
    # as "no item at all" (popularity 0).
    counts[x] = counts.get(x, 0) + 1
 

In [49]:
# Feeds every element of a collection to stock, for the counter chosen by name.

def fill_dico(x, name):
    """Call stock(element, name) once for each element of `x`, in order."""
    for element in x:
        stock(element, name)



In [50]:
# Run fill_dico over every tweet's parsed list to populate the three occurrence counters.
# Series.apply is used here only for its side effect; fill_dico returns None, which is
# why the last line displays a Series of None values.

df["urls_list"].apply(fill_dico, args = ("urls",))
df["hashtags_list"].apply(fill_dico, args = ("hashtags",))
df["mentions_list"].apply(fill_dico, args = ("mentions",))

0         None
1         None
2         None
3         None
4         None
          ... 
665772    None
665773    None
665774    None
665775    None
665776    None
Name: mentions_list, Length: 665777, dtype: object

In [52]:
# Finally, a function that, for a given collection of urls, hashtags, or mentions,
# returns the highest popularity found among its elements.

def compute_pop(x, name):
    """Return the maximum stored count among the elements of `x`.

    `name` selects the counter to read: "urls" -> urls_pop,
    "hashtags" -> hashtags_pop, "mentions" -> mentions_pop.
    The result is 0 when `x` is empty or `name` is none of those values.
    An element missing from the selected counter raises KeyError.
    """
    if name == "urls":
        counts = urls_pop
    elif name == "hashtags":
        counts = hashtags_pop
    elif name == "mentions":
        counts = mentions_pop
    else:
        return 0

    best = 0
    for element in x:
        best = max(best, counts[element])
    return best
        

In [53]:
# Popularity features: for each tweet, the highest occurrence count among its
# urls, hashtags and mentions respectively (0 when the tweet has none).

df["urls_popularity"] = df["urls_list"].apply(compute_pop, args = ("urls",))
df["hashtags_popularity"] = df["hashtags_list"].apply(compute_pop, args = ("hashtags",))
df["mentions_popularity"] = df["mentions_list"].apply(compute_pop, args = ("mentions",))

In [58]:
# Remove the temporary parsed-list columns; only the derived features are kept.

df = df.drop(columns=["urls_list", "hashtags_list", "mentions_list"])

In [14]:
# Simple function that uses TextBlob to return the (polarity, subjectivity) of a tweet.

# Polarity > 0 -> positive
# Polarity < 0 -> negative

# Subjectivity measures how subjective the text is
def sentiment(text):
    """Return TextBlob's sentiment for `text`, or None if it cannot be computed.

    Parameters
    ----------
    text : str
        The tweet text to analyse.

    Returns
    -------
    The `sentiment` attribute of TextBlob(text), indexable as
    [0] = polarity and [1] = subjectivity, or None when TextBlob raises
    (for instance on a non-string value such as NaN).
    """
    try:
        return TextBlob(text).sentiment
    except Exception:
        # Best-effort fallback, kept from the original. Catching Exception
        # instead of using a bare `except:` lets KeyboardInterrupt and
        # SystemExit propagate, so this long-running step can be interrupted.
        return None

In [None]:
# This might take a little time, but on a good computer it shouldn't take more than a few minutes.
# sentiment() is evaluated once per tweet and both features are read from that single
# result, instead of running the whole analysis twice.
# When sentiment() returns None (the sentiment could not be computed), both features
# become NaN rather than raising "TypeError: 'NoneType' object is not subscriptable".

tweet_sentiments = df["text"].apply(sentiment)
df['polarity']     = tweet_sentiments.apply(lambda s: s[0] if isinstance(s, tuple) else np.nan)
df['subjectivity'] = tweet_sentiments.apply(lambda s: s[1] if isinstance(s, tuple) else np.nan)

In [None]:
# Split the combined frame back into its training and evaluation parts by position.
# The boundary is taken from the number of training rows instead of the hard-coded
# 665777, so the split stays correct if the input files change. Re-running this cell
# is safe because df1 keeps the same length after the split.
n_train = len(df1)
assert len(df) == n_train + len(df2), "df is not the row-wise concatenation of df1 and df2"
df1 = df.iloc[:n_train, :]
df2 = df.iloc[n_train:, :]

In [59]:
# Write both feature-enriched frames to disk; index=False leaves the DataFrame index
# out of the CSV files.
df1.to_csv('data/partially_treated_train_data.csv', index = False)
df2.to_csv('data/partially_treated_evaluation_data.csv', index = False)