In [9]:
import pandas as pd
import numpy as np
import re
import json 
from textblob import TextBlob
import tweepy 
from google.colab import drive
from nltk.tokenize import word_tokenize
from langdetect import detect 
import pickle

Features in the datasets :

- user_veritied : if the user is verified or not (converted to 0-1. Already in the base dataset)
- user_statuses_count : already in the base dataset
- user_followers_count : already in the base dataset
- user_friends_count : already in the base dataset
- hour : hour of the tweet
- day : day of the tweet 
- month : month of the tweet
- weekday : if the tweet is a weekend or not
- friends_followers_ratio = user_friends_count/user_followers_count
- has_hashtags : if the tweet has hastags or not
- has_mentions : if the tweet has mentions or not
- has_urls : if the tweet has an url or not
- number_of_urls : the number of urls of a tweet
- number_of_mentions : the number of mentions of a tweet
- number_of_hashtags : the number of hashtags of a tweet
- urls_popularity : the popularity of the urls of a tweet. If the tweet has urlA and urlB, urls_popularity = max(number of occurences of urlA in the database, number of occurences of urlB in the database)
- hashtags_popularity : popularity of the hashtags of a tweet. The definition is similar as above
- mentions_popularity : same as above
- polarity : the polarity of a tweet computed using the textBlob library. It is a scalar between -1 and 1 that represents the positivity-negativity of the text of the tweet.
- subjectivity : the subjectivity of a tweet computed using textBlob library. It is a scalar between 0 and 1 that tells us how much the tweet is subjective

New features coming soon : the number of followers and friends of the users mentioned in a tweet. Must use the twitter API to compute this :( . 

We load both datasets and merge them because to compute the popularity of a given hashtag, url, or mention, all the data is needed.

We will then load and merge the training and test set, and split them at the end.

In [17]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
df1 = pd.read_csv("gdrive/MyDrive/train.csv")
df2 = pd.read_csv("gdrive/MyDrive/evaluation.csv")

In [None]:
df = df1.append(df2, ignore_index = True)

In [None]:
df.head(10)

Unnamed: 0,id,timestamp,retweet_count,user_verified,user_statuses_count,user_followers_count,user_friends_count,user_mentions,urls,hashtags,text
0,0,1588696955143,0.0,False,68460,1101,1226,,,,Smh I give up
1,1,1588464948124,0.0,False,309,51,202,,,,"Most of us are Human Beings, but I think you m..."
2,2,1588634673360,0.0,False,3241,1675,2325,,,,"Old dirty tricks Trump, at it again...like we ..."
3,3,1588433158672,0.0,False,32327,667,304,,,,Seriously..... I worked 86 hours my last check...
4,4,1588582751599,0.0,False,581,42,127,,,,May ALMIGHTY ALLAH have mercy on us all. Only ...
5,5,1588434563287,0.0,False,7214,503,1126,,,,They couldn’t care less.
6,6,1588692966869,2.0,False,372,738,472,,twitter.com/i/web/status/1…,Ethiopia,Extremely valid points being made here 👇🏾 #Eth...
7,7,1588316892450,1.0,False,2085,3808,153,,twitter.com/i/web/status/1…,,COVID-19 dominated the discussion Tuesday at a...
8,8,1588625905286,0.0,False,17765,11666,40,,,,BC now has 112 patients on ventilators. 17 of...
9,9,1588604315931,0.0,False,3086,66,241,,,,a COVID-19 vaccine would be pretty lit ngl


In [None]:
# Just convert False to 0 and True to 1.

df["user_verified"] = df["user_verified"].astype(int)

In [None]:
# I don't know why I am supposed to take the modulo, but it works...
# If I don't do that, I get a wrong date in 1970

df['date']  =(df['timestamp']).astype(np.int64) // 10**3

In [None]:
# unit = 's' puts the date in unix format. Necessary to get the good format.

df['date']= pd.to_datetime(df['date'], unit='s')

In [None]:
df["hour"] = df["date"].dt.hour
df["day"] = df["date"].dt.day
df["month"] = df["date"].dt.month

# 0 for Monday
df["weekday"] = df["date"].dt.weekday

# If the day is a weekend or not
df["weekend"] = np.where(np.logical_or(df["weekday"] == 5, df["weekday"] == 6), 1, 0)

In [None]:
df["friends_followers_ratio"] = df["user_friends_count"]/df["user_followers_count"]

In [None]:
df["has_hashtags"] = np.where(pd.notnull(df["hashtags"]), 1, 0)
df["has_mentions"] = np.where(pd.notnull(df["user_mentions"]), 1, 0)
df["has_urls"] = np.where(pd.notnull(df["urls"]), 1, 0)

In [None]:
# Stupid to use this fonction

def counter(word):
    if pd.isna(word):
        return 0
    number = 1
    for s in word:
        if s == ',':
            number += 1
    return number

In [None]:
# Counts the number of urls, mentions, and hashtags in a tweet

df["number_of_urls"] = df["urls"].apply(counter)
df["number_of_mentions"] = df["user_mentions"].apply(counter)
df["number_of_hashtags"] = df["hashtags"].apply(counter)

Now we will compute hashtags, urls, and mentions popularity. We computed it only on the given dataset. However it may be smarter to us it on ALL the data (both training and test).

We :

- Turn the urls, hashtags, and mentions into lists.
- We create dictionnaries that will help us stock the number of occurences of urls, hashtags, and tweets.
- We finally compute the popularity (popularity = number of occurences in the dataset) of the hashtags, urls, and mentions of a tweet, and the we take the maximum.

In [None]:
# An auxiliary function that, given a text, separates it
# with commas (ie useful to get a list of hashtags, urls, mentions)

def word_cut(word):
    if pd.isna(word):
        return []
    word_array = word.split(",")
    return word_array
    

In [None]:
# We create a new column with the parsed elements. We will delete it in the end.

df["urls_list"] = df["urls"].apply(word_cut)
df["hashtags_list"] = df["hashtags"].apply(word_cut)
df["mentions_list"] = df["user_mentions"].apply(word_cut)

In [None]:
# Dictionnaries that will contain the number of occurences of elements

urls_pop = dict()
hashtags_pop = dict()
mentions_pop = dict()

In [None]:
# Updates the counting of the occurences of a given element in one of the dictionaries defined above

def stock(x, name):
    if name == "urls":
        if x in urls_pop.keys():
            urls_pop[x] += 1
        else :
            urls_pop[x] = 1
    if name == "hashtags":
        if x in hashtags_pop.keys():
            hashtags_pop[x] += 1
        else :
            hashtags_pop[x] = 1
    if name == "mentions":
        if x in mentions_pop.keys():
            mentions_pop[x] += 1
        else :
            mentions_pop[x] = 1
 

In [None]:
# An auxiliary function that uses the function above on a array.

def fill_dico(x, name):
  if x !=[]:
    for i in x:
        stock (i, name)



In [None]:
# We apply the function above on the lists of urls, hashtags, and mentions

df["urls_list"].apply(fill_dico, args = ("urls",))
df["hashtags_list"].apply(fill_dico, args = ("hashtags",))
df["mentions_list"].apply(fill_dico, args = ("mentions",))

In [None]:
# Finally, a fonction that for a given array of urls, hashtags, or mentions, computes
# the maximum popularity of the elements of the array.

def compute_pop(x, name):
    pop = 0
    if name == "urls":
        for i in x:
            pop = max(pop, urls_pop[i])
            
    if name == "hashtags":
        for i in x:
            pop = max(pop, hashtags_pop[i])
            
    if name == "mentions":
        for i in x:
            pop = max(pop, mentions_pop[i])
    
    return pop
        

In [None]:
# We finally compute the popularity of the tweets

df["urls_popularity"] = df["urls_list"].apply(compute_pop, args = ("urls",))
df["hashtags_popularity"] = df["hashtags_list"].apply(compute_pop, args = ("hashtags",))
df["mentions_popularity"] = df["mentions_list"].apply(compute_pop, args = ("mentions",))

Now we will use the twitter API to find the number of friends and followers of users mentioned in a tweet

In [None]:
# I had to create a pro twitter account to use tweepy.

consumer_key = # Put your consumer key here
consumer_secret = # Put your consumer secret here
access_token = # Put you access token here
access_token_secret = # Put your secret access token here


auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

In [None]:
# Two dictonnaries that will store all the mentions, as well as their number of friends and followers.

mention_followers = dict()
mention_friends = dict()

# We define some functions to fill them

def stock_mention_followers(x):
            mention_followers[x] = 0

def fill_mention_followers(x):
    for i in x:
        stock_mention_followers(i)

def stock_mention_friends(x):
            mention_friends[x] = 0

def fill_mention_friends(x):
    for i in x:
        stock_mention_friends(i)

df["mentions_list"].apply(fill_mention_followers)
df["mentions_list"].apply(fill_mention_friends)

In [None]:
# A function that will go through a list of mentions, collect their numbers of followers,
# and put them in the dictionnary

count = 0
def get_mentions_followers(mentions):
  global count
  if mentions == []:
    return 0
  
  for mention in mentions:
    if mention_followers[mention] == 0:
      count += 1
      if (count % 100 == 0):
        print(count)
      try:
        user = api.get_user(mention)
      except:
        return float('NaN')
      mention_followers[mention] = user.followers_count
  return 0

In [None]:
df["mentions_list"].apply(get_mentions_followers)

In [None]:
# A function that will go through a list of mentions, collect their numbers of friends,
# and put them in the dictionnary

count = 0
def get_mentions_friends(mentions):
  global count
  if mentions == []:
    return 0
  
  for mention in mentions:
    if mention_friends[mention] == 0:
      count += 1
      if (count % 100 == 0):
        print(count)
      try:
        user = api.get_user(mention)
      except:
        return float('NaN')
      mention_friends[mention] = user.friends_count
  return 0

In [None]:
df["mentions_list"].apply(get_mentions_friends)

In [None]:
# Two functions that given a list of mentions, will extract their number of friends and followers
# using the two dictionnaries

def compute_mention_followers(mentions):
  if mentions == []:
    return float('Nan')
  popularity = 0
  for mention in mentions:
    if pd.isna(mention_followers[mention]):
      pop = 0
    else :
      pop = mention_followers[mention]
    popularity = max(popularity, pop)
  return popularity

def compute_mention_friends(mentions):
  if mentions == []:
    return float('Nan')
  popularity = 0
  for mention in mentions:
    if pd.isna(mention_friends[mention]):
      pop = 0
    else :
      pop = mention_friends[mention]
    popularity = max(popularity, pop)
  return popularity

In [None]:
mention_followers

{'AberdeenCC': 45227,
 'OuestFrance': 605025,
 'hope2shine': 12415,
 'MoorePresence': 288,
 'LemusteleSUR': 18004,
 'MikeQuindazzi': 157234,
 ' antgrasso': 173960,
 ' Fisher85M': 88867,
 'InovioPharma': 16515,
 ' gatesfoundation': 2118165,
 'NASCAR': 3444976,
 'Cuse': 141027,
 ' RociFortuny': 419,
 'KilshannigGAA': 3291,
 ' KilshannigLGFC': 1674,
 'dacolon': 12519,
 ' julito77': 37014,
 ' latinorebels': 59767,
 'CMO_England': 281961,
 ' BethRigby': 328443,
 'VilledeNice': 99512,
 'DrJaneMunro': 6303,
 ' FreerMary': 4570,
 ' codachange': 26700,
 'tvhousehunter': 159,
 ' scottzolak': 191599,
 'nobby15': 9305,
 'AshaRangappa_': 649925,
 ' rgoodlaw': 134546,
 'MinofHealthUG': 237287,
 ' GovUganda': 93055,
 'officialDannyT': 415435,
 'foresightinst': 1648,
 'thedailybeast': 1311835,
 'Oprah': 43438688,
 ' JuliaRoberts': 490040,
 ' DeepakChopra': 3275807,
 ' NaomiCampbell': 721642,
 'GovUgwuanyi': 0,
 'MontefioreID': 806,
 'kylegriffin1': 993828,
 'NYCMayor': 1532490,
 'CDCFlu': 891829,
 'Na

In [None]:
# We use the two functions above to fill two new columns

df["mentions_max_followers"] = df["mentions_list"].apply(compute_mention_followers)
df["mentions_max_friends"] = df["mentions_list"].apply(compute_mention_friends)

In [None]:
# We drop the parsed urls, hashtags, and mentions

df = df.drop(columns=["urls_list", "hashtags_list", "mentions_list"])

Now we will do some sentiment analysis. I found that most (if not all) NLP libraries computed something given only one language. The problem is that our tweets are in many different languages, and even contain emojis. So we decided to use an emolex dictionnary : an dictionary of words that gives a score to each word. We used another one for emojis. For each tweet, we take the sentiment of each word.

In [None]:
# We load the emoji lexicon

f = open('gdrive/MyDrive/emotion-emoji.json',) 
emoji_lexicon = json.load(f) 
f.close() 

In [None]:
# A function that computes the sentiment for a given tweet.

def text_analysis(text):
  # We define the sentiment as a dictionnary
  text_sentiment = dict()
  text_sentiment['fear'] = 0
  text_sentiment['anger'] = 0
  text_sentiment['joy'] = 0
  text_sentiment['disgust'] = 0
  text_sentiment['sadness'] = 0
  text_sentiment['surprise'] = 0
  text_sentiment['positive'] = 0
  text_sentiment['negative'] = 0

  # Now we parse the text into different words
  parsed_text = word_tokenize(text)

  for word in parsed_text:

    # If the word is a emoji (ie is in the emoji lexicon)
    if word in emoji_lexicon.keys():

      word_sentiment = emoji_lexicon[word]
      # The sentiments defined in the emoji lexicon are not the same as in the emolex lexicon.
      # We had to adapt the sentiment a little

      text_sentiment['fear'] += float(word_sentiment['fear'])
      text_sentiment['anger'] += float(word_sentiment["anger"])
      text_sentiment['joy'] += float(word_sentiment['joy'])
      text_sentiment['disgust'] += float(word_sentiment['disgust'])
      text_sentiment['sadness'] += float(word_sentiment['sadness'])
      text_sentiment['surprise'] += float(word_sentiment['surprise'])

      # Here, since there wasn't any "negative" or "positive" score in the emoji lexicon,
      # we decided to make it out of the other scores.

      text_sentiment['positive'] +=  max(1, float(word_sentiment['happiness']) +
                                         float(word_sentiment['joy']) + float(word_sentiment['love']))
      text_sentiment['negative'] += max(1,
                                        float(word_sentiment['boredom']) + float(word_sentiment['disappointment']) +
                                        float(word_sentiment['worry']) + float(word_sentiment['disgust']) +
                                        float(word_sentiment['anger']))
      
    # If the word is not an emoji, we fill blindly the dictionnary  
    else :
      # The function NRCLex, from the NRC Library, finds a given word
      # in the emolex lexicon (very useful, since we otherwise would have to
      # look into it with our own code. Not easy since there are different
      # languages, and it is really big)
      word_sentiment = NRCLex(word.lower()).affect_frequencies 
      text_sentiment['fear'] += word_sentiment['fear']
      text_sentiment['anger'] += word_sentiment["anger"]
      text_sentiment['joy'] += word_sentiment['joy']
      text_sentiment['disgust'] += word_sentiment['disgust']
      text_sentiment['sadness'] += word_sentiment['sadness']
      text_sentiment['surprise'] += word_sentiment['surprise']
      text_sentiment['positive'] += word_sentiment['positive']
      text_sentiment['negative'] += word_sentiment['negative']

  # Finally, we just add the length of the tweet (in terms of words) as a feature
  return list(text_sentiment.values()) + [len(parsed_text)]
    


In [None]:
# We decided to fill an array first, with the sentiment of each tweet, and
# then save it, so as not to take any risks and losoe what we just gathered.

sentiment_array = np.array([[0 for i in range(9)] for j in range(len(df))])
count = 0
def compute_sentiment(text):
  global count
  if count%1000 == 0:
    print(count)
  temp_array = text_analysis(text)
  for i in range(9):
    sentiment_array[count][i] = temp_array[i]
  count += 1


In [None]:
# We fill the array we just defined with the sentiments computed

df["text"].apply(compute_sentiment)

In [None]:
with open('gdrive/MyDrive/sentiment_array.npy', 'wb') as f:
  np.save(f, sentiment_array)


In [None]:
# Now we fill new columns with what we just computed

df['fear'] = df["id"].apply(lambda x : sentiment_array[x][0])
df['anger'] = df["id"].apply(lambda x : sentiment_array[x][1])
df['joy'] = df["id"].apply(lambda x : sentiment_array[x][2])
df['disgust'] = df["id"].apply(lambda x : sentiment_array[x][3])
df['sadness'] = df["id"].apply(lambda x : sentiment_array[x][4])
df['surprise'] = df["id"].apply(lambda x : sentiment_array[x][5])
df['positive'] = df["id"].apply(lambda x : sentiment_array[x][6])
df['negative'] = df["id"].apply(lambda x : sentiment_array[x][7])
df['text_size'] = df["id"].apply(lambda x : sentiment_array[x][8])

Now, we'll compute one last feature : the language of a tweet.

In [4]:
# A function that determines the language of a tweet, using the langdetect library

# Since we want numbers as features, we'll just make a dictionnary that will map
# initials of languages to numbers.

lang_dict = dict()
count = 1
c = 0
def detection(text):
  global count
  global c
  c+=1
  if c%1000 == 0:
    print(c)
  err = 0
  try:
    lang = detect(text)
  except:
    err = 1
    lang = float("Nan")
  if err == 0:
    if lang not in lang_dict.keys():
      lang_dict[lang] = count
      count += 1
    number = lang_dict[lang]
  else :
    number = 0
  return number

In [None]:
df["language"] = df_text["text"].apply(detection)

In [6]:
df1 = df.iloc[:665777,:]
df2 = df.iloc[665777:,:]

In [7]:
df1.to_csv('gdrive/MyDrive/train_final.csv', index = False)
df2.to_csv('gdrive/MyDrive/evaluation_final.csv', index = False)

In [10]:
f = open("language_dictionnary.pickle", "wb")


pickle.dump(lang_dict, f)

In [11]:
lang_dict

{'af': 19,
 'ar': 31,
 'bg': 54,
 'bn': 49,
 'ca': 4,
 'cs': 40,
 'cy': 14,
 'da': 13,
 'de': 11,
 'el': 43,
 'en': 2,
 'es': 6,
 'et': 15,
 'fa': 35,
 'fi': 12,
 'fr': 3,
 'gu': 46,
 'he': 52,
 'hi': 10,
 'hr': 23,
 'hu': 24,
 'id': 22,
 'it': 25,
 'ja': 28,
 'kn': 37,
 'ko': 41,
 'lt': 29,
 'lv': 38,
 'mk': 55,
 'ml': 47,
 'mr': 39,
 'ne': 45,
 'nl': 7,
 'no': 17,
 'pa': 51,
 'pl': 33,
 'pt': 20,
 'ro': 9,
 'ru': 50,
 'sk': 34,
 'sl': 27,
 'so': 26,
 'sq': 30,
 'sv': 1,
 'sw': 8,
 'ta': 36,
 'te': 42,
 'th': 18,
 'tl': 16,
 'tr': 5,
 'uk': 53,
 'ur': 32,
 'vi': 21,
 'zh-cn': 44,
 'zh-tw': 48}