# Exploratory Data Analysis

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import re

## Import Data

In [2]:
tweets = pd.read_csv('../data/TW.csv')
print(tweets.shape)
tweets.sample(2)

(6786, 19)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet_id,creation_date,full_text,mentions,entities_hashtags,user_name,user_screen_name,user_id,location,description,protected,followers_count,friends_count,profile_created_at,retweet_count,favourite_count,search_key
4266,5,5,1374087273239736322,Mon Mar 22 19:55:09 +0000 2021,RT @thevegansnuts: Ello gowjess friends I made...,['thevegansnuts'],['vegan'],Steph Good.Ⓥ 🌱🌼🐷🐮🦡🦊🐇🐑🐔🐤💚,StephGood_,815947928376328192,N. Wales,"Grandma 💕 - 🌱 I love animals, so I DON'T EAT T...",False,1714,2276,Mon Jan 02 15:48:40 +0000 2017,1,0,veganfoodshare
1922,65,65,1374098776537841669,Mon Mar 22 20:40:52 +0000 2021,#weighlossfood #healthychoices #holistichealth...,[],"['weighlossfood', 'healthychoices', 'holistich...",Cami Onolfo,miracletea,1495626662,"Miami Beach, FL","Author of Miracle Herbs & Plants, Certified He...",False,333,464,Sun Jun 09 13:35:36 +0000 2013,0,0,healthyfood


In [11]:
retweets = pd.read_csv('../data/RT.csv')
print(retweets.shape)
retweets.sample(2)

(1120, 20)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet_id,creation_date,full_text,mentions,entities_hashtags,user_name,user_screen_name,user_id,location,description,protected,followers_count,friends_count,profile_created_at,retweet_count,favourite_count,RT_of_ID,search_key
1020,6,6,1374148220931211265,Mon Mar 22 23:57:20 +0000 2021,RT @30seconds: The food we eat is what powers ...,['30seconds'],[],Erling,chiclona,18380803,,,False,1444,4210,Thu Dec 25 23:24:49 +0000 2008,2,0,1374145404208623616,vegetarian
363,13,13,1374072901218009088,Mon Mar 22 18:58:03 +0000 2021,RT @OnRealFood: The sixth day of my #Vegan #Fa...,['OnRealFood'],"['Vegan', 'Fasting', 'recipes', 'healthylifest...",Homemade Wit,HomemadeWit,3290256723,,Home of the original #political #comic strip #...,False,157,154,Tue May 19 17:38:05 +0000 2015,2,0,1374072709076942851,healthyfood


In [12]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6786 entries, 0 to 6785
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          6786 non-null   int64 
 1   Unnamed: 0.1        6786 non-null   int64 
 2   tweet_id            6786 non-null   int64 
 3   creation_date       6786 non-null   object
 4   full_text           6786 non-null   object
 5   mentions            6786 non-null   object
 6   entities_hashtags   6786 non-null   object
 7   user_name           6785 non-null   object
 8   user_screen_name    6786 non-null   object
 9   user_id             6786 non-null   int64 
 10  location            4652 non-null   object
 11  description         6075 non-null   object
 12  protected           6786 non-null   bool  
 13  followers_count     6786 non-null   int64 
 14  friends_count       6786 non-null   int64 
 15  profile_created_at  6786 non-null   object
 16  retweet_count       6786

## Pre-Processing the data

In [13]:
# Drop all the unnamed columns due to concatenation of the data
tweets = tweets.drop([col for col in tweets.columns if 'Unnamed' in col], axis=1)
retweets = retweets.drop([col for col in retweets.columns if 'Unnamed' in col], axis=1)

In [14]:
tweets.head(2)

Unnamed: 0,tweet_id,creation_date,full_text,mentions,entities_hashtags,user_name,user_screen_name,user_id,location,description,protected,followers_count,friends_count,profile_created_at,retweet_count,favourite_count,search_key
0,1374308392186036225,Tue Mar 23 10:33:48 +0000 2021,Did it ever happen to you?🏨😂 #fortnite #fortni...,[],"['fortnite', 'fortniteclips', 'memes', 'gaming...",HNoel526,HNoel526,1373921658440986627,,,False,1,1,Mon Mar 22 08:57:14 +0000 2021,0,0,bhfyp
1,1374307966250229767,Tue Mar 23 10:32:07 +0000 2021,#memorycare #seniorliving #alzheimerscaregiver...,[],"['memorycare', 'seniorliving', 'alzheimerscare...",CharnwoodCaremark,CareCharnwood,1289130724059119621,"Charnwood, UK",Licensed home care agency providing a comprehe...,False,193,433,Fri Jul 31 09:28:32 +0000 2020,0,0,bhfyp


In [48]:
def preprocess_twitter(df):
    pass

In [49]:
# Set dates to datetime
tweets['creation_date'] = pd.to_datetime(tweets['creation_date'])
tweets['profile_created_at'] = pd.to_datetime(tweets['profile_created_at'])

retweets['creation_date'] = pd.to_datetime(retweets['creation_date'])
retweets['profile_created_at'] = pd.to_datetime(retweets['profile_created_at'])

In [50]:
# Create new column if it is retweet
tweets['is_retweet'] = tweets['full_text'].str.contains(pat=r'^RT', regex=True)
retweets['is_retweet'] = True # All are retweets

In [51]:
# Change mentions to a list not string
tweets['mentions'] = np.where(tweets['mentions'] == '[]', np.nan, tweets['mentions'])
tweets['mentions'] = tweets['mentions'].str.strip('[]').str.split(',')

retweets['mentions'] = np.where(retweets['mentions'] == '[]', np.nan, retweets['mentions'])
retweets['mentions'] = retweets['mentions'].str.strip('[]').str.split(',')

In [52]:
tweets.loc[:4, 'mentions']

0                NaN
1                NaN
2    ['BrwnStoneMG']
3                NaN
4                NaN
Name: mentions, dtype: object

In [53]:
# Change hashtags column
tweets['entities_hashtags'] = np.where(tweets['entities_hashtags'] == '[]', np.nan, tweets['entities_hashtags'])
retweets['entities_hashtags'] = np.where(retweets['entities_hashtags'] == '[]', np.nan, retweets['entities_hashtags'])

In [27]:
tweets[tweets['entities_hashtags'].isnull() == False].iloc[[1,2]]

Unnamed: 0,tweet_id,creation_date,full_text,mentions,entities_hashtags,user_name,user_screen_name,user_id,location,description,protected,followers_count,friends_count,profile_created_at,retweet_count,favourite_count,search_key,is_retweet
1,1374307966250229767,2021-03-23 10:32:07+00:00,#memorycare #seniorliving #alzheimerscaregiver...,,"['memorycare', 'seniorliving', 'alzheimerscare...",CharnwoodCaremark,CareCharnwood,1289130724059119621,"Charnwood, UK",Licensed home care agency providing a comprehe...,False,193,433,2020-07-31 09:28:32+00:00,0,0,bhfyp,False
2,1374307807051141126,2021-03-23 10:31:29+00:00,RT @BrwnStoneMG: Keegan-Michael Key is celebra...,['BrwnStoneMG'],"['keyandpeele', 'happybirthday', 'birthdaycake...",Sam Traspe,djsammyt0917,135435635,,I am Sam :-),False,497,1045,2010-04-21 08:53:13+00:00,2,0,bhfyp,True


In [54]:
tweets['entities_hashtags'] = tweets['entities_hashtags'].str.findall(r"'(\w+)'")
retweets['entities_hashtags'] = retweets['entities_hashtags'].str.findall(r"'(\w+)'")

In [55]:
tweets.head(2)

Unnamed: 0,tweet_id,creation_date,full_text,mentions,entities_hashtags,user_name,user_screen_name,user_id,location,description,protected,followers_count,friends_count,profile_created_at,retweet_count,favourite_count,search_key,is_retweet
0,1374308392186036225,2021-03-23 10:33:48+00:00,Did it ever happen to you?🏨😂 #fortnite #fortni...,,"[fortnite, fortniteclips, memes, gaming, ps, f...",HNoel526,HNoel526,1373921658440986627,,,False,1,1,2021-03-22 08:57:14+00:00,0,0,bhfyp,False
1,1374307966250229767,2021-03-23 10:32:07+00:00,#memorycare #seniorliving #alzheimerscaregiver...,,"[memorycare, seniorliving, alzheimerscaregiver...",CharnwoodCaremark,CareCharnwood,1289130724059119621,"Charnwood, UK",Licensed home care agency providing a comprehe...,False,193,433,2020-07-31 09:28:32+00:00,0,0,bhfyp,False


In [64]:
retweets.loc[0, 'tweet_id'] == retweets.loc[0, 'RT_of_ID']

False

In [66]:
pd.concat([tweets, retweets], axis=0)

Unnamed: 0,tweet_id,creation_date,full_text,mentions,entities_hashtags,user_name,user_screen_name,user_id,location,description,protected,followers_count,friends_count,profile_created_at,retweet_count,favourite_count,search_key,is_retweet,RT_of_ID
0,1374308392186036225,2021-03-23 10:33:48+00:00,Did it ever happen to you?🏨😂 #fortnite #fortni...,,"[fortnite, fortniteclips, memes, gaming, ps, f...",HNoel526,HNoel526,1373921658440986627,,,False,1,1,2021-03-22 08:57:14+00:00,0,0,bhfyp,False,
1,1374307966250229767,2021-03-23 10:32:07+00:00,#memorycare #seniorliving #alzheimerscaregiver...,,"[memorycare, seniorliving, alzheimerscaregiver...",CharnwoodCaremark,CareCharnwood,1289130724059119621,"Charnwood, UK",Licensed home care agency providing a comprehe...,False,193,433,2020-07-31 09:28:32+00:00,0,0,bhfyp,False,
2,1374307807051141126,2021-03-23 10:31:29+00:00,RT @BrwnStoneMG: Keegan-Michael Key is celebra...,['BrwnStoneMG'],"[keyandpeele, happybirthday, birthdaycake, cak...",Sam Traspe,djsammyt0917,135435635,,I am Sam :-),False,497,1045,2010-04-21 08:53:13+00:00,2,0,bhfyp,True,
3,1374307495720673285,2021-03-23 10:30:15+00:00,“Non-Fungible Tokens” Driving Investment Crazy...,,"[cryptoartist, cryptocollectibles, bhfyp, nft,...",Andy Wood,clickandywood,59716459,"England, United Kingdom",Image punk and visual arts. Citizen Musk Coll...,False,15838,3710,2009-07-24 07:10:22+00:00,0,1,bhfyp,False,
4,1374306170546970627,2021-03-23 10:24:59+00:00,Check out Milk Mathi on Aslidesi.\nLink Below\...,,"[lunch, vegan, healthysnacks, chips, streetfoo...",Aslidesi.com,aslidesimarket,1226025759698874368,Delhi,"https://t.co/o2HFcwJiut, The global market of ...",False,237,68,2020-02-08 06:11:47+00:00,0,0,bhfyp,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,1374289148228509698,2021-03-23 09:17:20+00:00,RT @moondeeey: [MELON]\n\n'We Play' liked song...,['moondeeey'],,d 🛹,ks___dd,1317311063335956481,daileeepanda,just @APINK_2011 @_Weeekly ❤️,False,70,116,2020-10-17 03:47:00+00:00,1,0,yummy,True,1.374289e+18
1116,1374288407967399936,2021-03-23 09:14:24+00:00,RT @Yummy_Lummy: I need to do a Dexter rewatch...,['Yummy_Lummy'],,Gary 🖖,garydlum,178183112,The mirror universe,is hungry for Kelpien ganglia to eat with my g...,False,1938,1391,2010-08-14 02:34:10+00:00,1,0,yummy,True,1.374288e+18
1117,1374289608268148742,2021-03-23 09:19:10+00:00,RT @Carolynd07: Yummy yummy \n\nMAYMAY BigMac7...,"['Carolynd07', 'maymayentrata07', 'Barber_Ed...",[MayWard],Lyn ❤ MWTeamSolo,Carolynd07,841497720296620033,Hong Kong,@maymayentrata07\n@Barber_edward_\n\nFAN accou...,False,655,471,2017-03-14 03:54:25+00:00,2,0,yummy,True,1.374288e+18
1118,1374288697596669954,2021-03-23 09:15:33+00:00,RT @Carolynd07: Yummy yummy \n\nMAYMAY BigMac7...,"['Carolynd07', 'maymayentrata07', 'Barber_Ed...",[MayWard],Louie,mlouiegdm,1372271617,,😀,False,316,268,2013-04-22 14:25:07+00:00,2,0,yummy,True,1.374288e+18


In [65]:
pd.merge(left=tweets, right=retweets, left_on='tweet_id', right_on='RT_of_ID', how='left')

Unnamed: 0,tweet_id_x,creation_date_x,full_text_x,mentions_x,entities_hashtags_x,user_name_x,user_screen_name_x,user_id_x,location_x,description_x,...,description_y,protected_y,followers_count_y,friends_count_y,profile_created_at_y,retweet_count_y,favourite_count_y,RT_of_ID,search_key_y,is_retweet_y
0,1374308392186036225,2021-03-23 10:33:48+00:00,Did it ever happen to you?🏨😂 #fortnite #fortni...,,"[fortnite, fortniteclips, memes, gaming, ps, f...",HNoel526,HNoel526,1373921658440986627,,,...,,,,,NaT,,,,,
1,1374307966250229767,2021-03-23 10:32:07+00:00,#memorycare #seniorliving #alzheimerscaregiver...,,"[memorycare, seniorliving, alzheimerscaregiver...",CharnwoodCaremark,CareCharnwood,1289130724059119621,"Charnwood, UK",Licensed home care agency providing a comprehe...,...,,,,,NaT,,,,,
2,1374307807051141126,2021-03-23 10:31:29+00:00,RT @BrwnStoneMG: Keegan-Michael Key is celebra...,['BrwnStoneMG'],"[keyandpeele, happybirthday, birthdaycake, cak...",Sam Traspe,djsammyt0917,135435635,,I am Sam :-),...,,,,,NaT,,,,,
3,1374307495720673285,2021-03-23 10:30:15+00:00,“Non-Fungible Tokens” Driving Investment Crazy...,,"[cryptoartist, cryptocollectibles, bhfyp, nft,...",Andy Wood,clickandywood,59716459,"England, United Kingdom",Image punk and visual arts. Citizen Musk Coll...,...,,,,,NaT,,,,,
4,1374306170546970627,2021-03-23 10:24:59+00:00,Check out Milk Mathi on Aslidesi.\nLink Below\...,,"[lunch, vegan, healthysnacks, chips, streetfoo...",Aslidesi.com,aslidesimarket,1226025759698874368,Delhi,"https://t.co/o2HFcwJiut, The global market of ...",...,,,,,NaT,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7610,1374285493303713801,2021-03-23 09:02:49+00:00,RT @jisungskrrt: i miss the your behavior is s...,['jisungskrrt'],,yeonmi,yeonmi58133383,1347868764373741570,,CHANGLIX MADE A SONG https://t.co/BuChk6BvFf,...,,,,,NaT,,,,,
7611,1374285443013873665,2021-03-23 09:02:37+00:00,@IsssCb1 Yummy,['IsssCb1'],,爱穿内裤子,lovesexyundies,1168544690485940224,"Kuala Lumpur City, Kuala Lumpu",FOLLOW ME if you are underwear lover too... CH...,...,,,,,NaT,,,,,
7612,1374285397434396679,2021-03-23 09:02:26+00:00,"RT @MamboCakeHouse: It's a moist, delicious, y...",['MamboCakeHouse'],,The good gal💥 😍❤️🍷,Pliam_,1221418848135581701,"Kampala, Uganda","I love, I hate, I cry and I laugh I'm human. \...",...,,,,,NaT,,,,,
7613,1374285382762729482,2021-03-23 09:02:22+00:00,RT @MicahChatterton: @Em__Dash__ What is this ...,"['MicahChatterton', 'Em__Dash__']",,Lime Link,lime_link,992797816618561536,,Helping podcasters. Retweeting #podernfamily.\...,...,,,,,NaT,,,,,


In [1]:
import sys

In [2]:
sys.path.append('..')

In [3]:
from transform.read_transform import read_transform

In [4]:
data = read_transform(path_tweets='../data/TW.csv', path_retweets='../data/RT.csv', join_method='concat')

Cols after loop: 
 tweet_id creation_date full_text mentions entities_hashtags user_name user_screen_name user_id location description protected followers_count friends_count profile_created_at retweet_count favourite_count search_key is_retweet


In [5]:
data.sample(2)

Unnamed: 0,tweet_id,creation_date,full_text,mentions,entities_hashtags,user_name,user_screen_name,user_id,location,description,protected,followers_count,friends_count,profile_created_at,retweet_count,favourite_count,search_key,is_retweet,RT_of_ID
6203,1374136827876106240,2021-03-22 23:12:04+00:00,RT @MatthewModine: Here is ONE SIMPLE thing yo...,['MatthewModine'],"[AmazonRainforest, vegan, vegetarian, Brazil]","Antifa e ""Fora Bolsonaro, Fora!""",joaoaliano,17003486,,,False,403,1377,2008-10-27 18:13:49+00:00,108,0,vegetarian,True,
1690,1374158636088459265,2021-03-23 00:38:44+00:00,RT @lovynlife: Heart Warming Rescued Babies 💕\...,['lovynlife'],"[govegan, vegan, choices]",Lars Ferdsson,LarsFerdsson,1372530509163728899,"Amsterdam, The Netherlands",A Man of Conviction,False,6,110,2021-03-18 12:50:19+00:00,159,0,govegan,True,


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7906 entries, 0 to 1119
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            7906 non-null   int64              
 1   creation_date       7906 non-null   datetime64[ns, UTC]
 2   full_text           7906 non-null   object             
 3   mentions            5831 non-null   object             
 4   entities_hashtags   4030 non-null   object             
 5   user_name           7904 non-null   object             
 6   user_screen_name    7906 non-null   object             
 7   user_id             7906 non-null   int64              
 8   location            5380 non-null   object             
 9   description         7082 non-null   object             
 10  protected           7906 non-null   bool               
 11  followers_count     7906 non-null   int64              
 12  friends_count       7906 non-null 