In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Load the feeling-word lexicon from the project's data folder.
feeling_words = pd.read_csv(filepath_or_buffer='../data/feeling_words')

In [3]:
# Load the collected tweets using pandas' pure-Python parsing engine.
twitter_tweets = pd.read_csv(
    filepath_or_buffer='../data/twitter_tweets',
    engine='python',
)

### Checking out the dataframes 👀

In [4]:
# Peek at the first five rows of the lexicon.
feeling_words.head(n=5)

Unnamed: 0,words,sub_category,main_category
0,shy,confused,unpleasant
1,afflicted,hurt,unpleasant
2,vulnerable,helpless,unpleasant
3,fascinated,interested,pleasant
4,certain,good/strong,pleasant


In [5]:
# Peek at the last five rows of the lexicon.
feeling_words.tail(n=5)

Unnamed: 0,words,sub_category,main_category
235,comforted,love,pleasant
236,doubtful,afraid/confused,unpleasant
237,alone,helpless,unpleasant
238,gay,happy,pleasant
239,keen,positive,pleasant


In [6]:
# (number of rows, number of columns) of the lexicon frame.
feeling_words.shape

(240, 3)

In [7]:
# Column names, non-null counts and dtypes of the lexicon frame.
feeling_words.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   words          240 non-null    object
 1   sub_category   240 non-null    object
 2   main_category  240 non-null    object
dtypes: object(3)
memory usage: 5.8+ KB


In [8]:
# Peek at the first five tweets.
twitter_tweets.head(n=5)

Unnamed: 0,id,created_at,text,retweet_count,reply_count,like_count,quote_count,date,time
0,1384122103977316355,2021-04-19T12:29:59.000Z,@Sportyfreak2005 ISL played an huge role in de...,0.0,0.0,1,0,2021-04-19,12:29:59.000Z
1,1384122103918596098,2021-04-19T12:29:59.000Z,@O_ssai @AyoOyalowo Was having a conversation ...,0.0,1.0,7,0,2021-04-19,12:29:59.000Z
2,1384122103897546760,2021-04-19T12:29:59.000Z,@Aventonio @GoonerJel_LDN To us it will mean w...,0.0,0.0,1,0,2021-04-19,12:29:59.000Z
3,1384122103876648967,2021-04-19T12:29:59.000Z,Money stop nonsense is the main reason why mos...,1.0,0.0,2,0,2021-04-19,12:29:59.000Z
4,1384122103863996422,2021-04-19T12:29:59.000Z,Watching PMs speech right now and honestly it'...,2.0,0.0,2,0,2021-04-19,12:29:59.000Z


In [9]:
# Peek at the last five tweets.
twitter_tweets.tail(n=5)

Unnamed: 0,id,created_at,text,retweet_count,reply_count,like_count,quote_count,date,time
25007,1384121583128571912,2021-04-19T12:27:55.000Z,Beware the fallout of America’s exit from Afgh...,0.0,0.0,0,0,2021-04-19,12:27:55.000Z
25008,1384121583124434944,2021-04-19T12:27:55.000Z,"In MyinGyan, there are 4 death,4 in critical c...",0.0,0.0,0,0,2021-04-19,12:27:55.000Z
25009,1384121583053144066,2021-04-19T12:27:55.000Z,"no motherfucker if YOU want to MAKE a sequel, ...",0.0,0.0,0,0,2021-04-19,12:27:55.000Z
25010,1384121583015317514,2021-04-19T12:27:55.000Z,Way past his 15 minutes of fame. https://t.co/...,0.0,0.0,0,0,2021-04-19,12:27:55.000Z
25011,1384121582994432011,2021-04-19T12:27:55.000Z,Wednesday to the moon 🌝🚀 #Safemoon https://t.c...,9.0,1.0,26,0,2021-04-19,12:27:55.000Z


In [10]:
# (number of rows, number of columns) of the raw tweets frame.
twitter_tweets.shape

(25012, 9)

In [11]:
# Column names, non-null counts and dtypes of the raw tweets frame.
twitter_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25012 entries, 0 to 25011
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             25012 non-null  object 
 1   created_at     25011 non-null  object 
 2   text           25011 non-null  object 
 3   retweet_count  25010 non-null  float64
 4   reply_count    25010 non-null  float64
 5   like_count     25010 non-null  object 
 6   quote_count    25010 non-null  object 
 7   date           25009 non-null  object 
 8   time           25009 non-null  object 
dtypes: float64(2), object(7)
memory usage: 1.7+ MB


In [12]:
# Number of missing entries in each column.
twitter_tweets.isna().sum()

id               0
created_at       1
text             1
retweet_count    2
reply_count      2
like_count       2
quote_count      2
date             3
time             3
dtype: int64

### Looking for the rows with NaN values 🤨

In [13]:
# Boolean frame marking every missing cell.
nan_value = twitter_tweets.isna()
# True for each row that holds at least one missing cell.
row_nan = nan_value.any(axis='columns')
# Keep only those rows and show them.
rows_with_nan = twitter_tweets.loc[row_nan]
rows_with_nan

Unnamed: 0,id,created_at,text,retweet_count,reply_count,like_count,quote_count,date,time
2752,1384122047689728004,2021-04-19T12:29:46.000Z,What state is home to an area known as the Res...,,,,,,
2753,Join me on Travel Trivia!,,,,,,,,
2754,https://t.co/SywEbpPDYk,0,0,0.0,0.0,2021-04-19,12:29:46.000Z,,


#### A row in the dataframe was parsed incorrectly into three rows of gibberish  👁 👄 👁
Checking out where the parse went wrong 😅

In [14]:
# Text stored on row 2752 (the start of the tweet).
twitter_tweets.loc[2752, 'text']

'What state is home to an area known as the Research Triangle? '

In [15]:
# Value that landed in the id column of row 2753.
twitter_tweets.loc[2753, 'id']

'Join me on Travel Trivia!'

In [16]:
# Value that landed in the id column of row 2754.
twitter_tweets.loc[2754, 'id']

' https://t.co/SywEbpPDYk'

In [17]:
# Put the complete tweet text back on the row whose text was cut short (index 2752).
truncated_text = 'What state is home to an area known as the Research Triangle? '
full_text = 'What state is home to an area known as the Research Triangle? Join me on Travel Trivia! https://t.co/SywEbpPDYk'
is_truncated = twitter_tweets['text'] == truncated_text
twitter_tweets.loc[is_truncated, 'text'] = full_text

In [18]:
# Confirm the text on row 2752 is now complete.
twitter_tweets.loc[2752, 'text']

'What state is home to an area known as the Research Triangle? Join me on Travel Trivia! https://t.co/SywEbpPDYk'

In [19]:
# NOTE: at this point in the notebook this assignment changes nothing. The
# missing dates are still Python None objects (see the type check in the next
# cell), so the comparison with the string 'None' selects no rows. The same
# statement is executed again further down, once None has been replaced by the
# string 'None'.
twitter_tweets.loc[(twitter_tweets['date'] == 'None'),'date'] = '2021-04-19'

In [20]:
# Inspect the Python type of the date stored on row 2752.
date_value = twitter_tweets.loc[2752, 'date']
print(type(date_value))

<class 'NoneType'>


👁 👄 👁 .. 何

#### Changing the `None` values to the string `'None'`. Other columns contain `None` values as well.

In [21]:
# Replace missing dates with the placeholder string 'None' so they can be
# selected with == afterwards. fillna treats both None and NaN as missing,
# whereas an `is None` test would let NaN values slip through untouched.
date_list = twitter_tweets['date'].fillna('None').tolist()

In [22]:
# Sanity check: the number of 'None' placeholders should equal the number of
# missing dates reported earlier (three).
sum(1 for entry in date_list if entry == 'None')

3

In [23]:
# Write the cleaned values back into the date column. A single column is
# addressed with a scalar key: the list-key form df[['date']] = <flat list>
# is rejected by newer pandas releases ("Columns must be same length as key").
twitter_tweets['date'] = date_list

In [24]:
# Date now stored on row 2752.
twitter_tweets.loc[2752, 'date']

'None'

#### Let's change all the columns with NoneType values!

In [25]:
# Replace missing like counts with the placeholder string 'None'. fillna
# treats both None and NaN as missing, whereas an `is None` test would let
# NaN values slip through untouched.
like_count_list = twitter_tweets['like_count'].fillna('None').tolist()

In [26]:
# Replace missing quote counts with the placeholder string 'None'. fillna
# treats both None and NaN as missing, whereas an `is None` test would let
# NaN values slip through untouched.
quote_count_list = twitter_tweets['quote_count'].fillna('None').tolist()

In [27]:
# Replace missing times with the placeholder string 'None'. fillna treats
# both None and NaN as missing, whereas an `is None` test would let NaN
# values slip through untouched.
time_list = twitter_tweets['time'].fillna('None').tolist()

In [28]:
# How many like counts were replaced by the 'None' placeholder.
sum(1 for entry in like_count_list if entry == 'None')

2

In [29]:
# How many quote counts were replaced by the 'None' placeholder.
sum(1 for entry in quote_count_list if entry == 'None')

2

In [30]:
# How many times were replaced by the 'None' placeholder.
sum(1 for entry in time_list if entry == 'None')

3

#### Change the columns like_count, quote_count, & time  🔮

In [31]:
# Write the cleaned values back into the like_count column. A scalar key is
# used because the list-key form df[['like_count']] = <flat list> is rejected
# by newer pandas releases ("Columns must be same length as key").
twitter_tweets['like_count'] = like_count_list

In [32]:
# Write the cleaned values back into the quote_count column. A scalar key is
# used because the list-key form df[['quote_count']] = <flat list> is rejected
# by newer pandas releases ("Columns must be same length as key").
twitter_tweets['quote_count'] = quote_count_list

In [33]:
# Write the cleaned values back into the time column. A scalar key is used
# because the list-key form df[['time']] = <flat list> is rejected by newer
# pandas releases ("Columns must be same length as key").
twitter_tweets['time'] = time_list

#### Back to replacing values in row 2752.

In [34]:
# Every 'None' placeholder in the date column receives the date 2021-04-19.
date_is_placeholder = twitter_tweets['date'].eq('None')
twitter_tweets.loc[date_is_placeholder, 'date'] = '2021-04-19'

In [35]:
# Confirm the date on row 2752.
twitter_tweets.loc[2752, 'date']

'2021-04-19'

In [36]:
# Every 'None' placeholder in the time column receives the time 12:29:46.000Z.
time_is_placeholder = twitter_tweets['time'].eq('None')
twitter_tweets.loc[time_is_placeholder, 'time'] = '12:29:46.000Z'

In [37]:
# Confirm the time on row 2752.
twitter_tweets.loc[2752, 'time']

'12:29:46.000Z'

In [38]:
# Every 'None' placeholder in the like_count column becomes 0.
likes_is_placeholder = twitter_tweets['like_count'].eq('None')
twitter_tweets.loc[likes_is_placeholder, 'like_count'] = 0

In [39]:
# Confirm the like count on row 2752.
twitter_tweets.loc[2752, 'like_count']

0

In [40]:
# Every 'None' placeholder in the quote_count column becomes 0.
quotes_is_placeholder = twitter_tweets['quote_count'].eq('None')
twitter_tweets.loc[quotes_is_placeholder, 'quote_count'] = 0

In [41]:
# Confirm the quote count on row 2752.
twitter_tweets.loc[2752, 'quote_count']

0

In [42]:
# Missing retweet and reply counts are set to 0.
float_count_cols = ['retweet_count', 'reply_count']
twitter_tweets[float_count_cols] = twitter_tweets[float_count_cols].fillna(value=0)

In [43]:
# Confirm the retweet count on row 2752.
twitter_tweets.loc[2752, 'retweet_count']

0.0

In [44]:
# Confirm the reply count on row 2752.
twitter_tweets.loc[2752, 'reply_count']

0.0

#### Perfect! The changes worked! Time to drop the unwanted rows ✂️

In [45]:
# Remove the two leftover fragment rows of the mis-parsed tweet (index labels
# 2753 and 2754). Dropping by label with errors='ignore' keeps this cell safe
# to re-run: the positional lookup twitter_tweets.index[[2753, 2754]] would
# select two different, valid tweets on a second execution, because positions
# shift once the fragments are gone.
twitter_tweets = twitter_tweets.drop(index=[2753, 2754], errors='ignore')

In [47]:
# (number of rows, number of columns) after removing the two fragment rows.
twitter_tweets.shape

(25010, 9)

#### Convert datatype for the retweet_count, reply_count, like_count & quote_count columns.

In [48]:
# Target dtype for each engagement counter: all four become plain integers.
convert_types = dict.fromkeys(
    ['retweet_count', 'reply_count', 'like_count', 'quote_count'],
    int,
)

In [49]:
# Cast the counter columns listed in convert_types, then display the frame.
twitter_tweets = twitter_tweets.astype(dtype=convert_types)
twitter_tweets

Unnamed: 0,id,created_at,text,retweet_count,reply_count,like_count,quote_count,date,time
0,1384122103977316355,2021-04-19T12:29:59.000Z,@Sportyfreak2005 ISL played an huge role in de...,0,0,1,0,2021-04-19,12:29:59.000Z
1,1384122103918596098,2021-04-19T12:29:59.000Z,@O_ssai @AyoOyalowo Was having a conversation ...,0,1,7,0,2021-04-19,12:29:59.000Z
2,1384122103897546760,2021-04-19T12:29:59.000Z,@Aventonio @GoonerJel_LDN To us it will mean w...,0,0,1,0,2021-04-19,12:29:59.000Z
3,1384122103876648967,2021-04-19T12:29:59.000Z,Money stop nonsense is the main reason why mos...,1,0,2,0,2021-04-19,12:29:59.000Z
4,1384122103863996422,2021-04-19T12:29:59.000Z,Watching PMs speech right now and honestly it'...,2,0,2,0,2021-04-19,12:29:59.000Z
...,...,...,...,...,...,...,...,...,...
25007,1384121583128571912,2021-04-19T12:27:55.000Z,Beware the fallout of America’s exit from Afgh...,0,0,0,0,2021-04-19,12:27:55.000Z
25008,1384121583124434944,2021-04-19T12:27:55.000Z,"In MyinGyan, there are 4 death,4 in critical c...",0,0,0,0,2021-04-19,12:27:55.000Z
25009,1384121583053144066,2021-04-19T12:27:55.000Z,"no motherfucker if YOU want to MAKE a sequel, ...",0,0,0,0,2021-04-19,12:27:55.000Z
25010,1384121583015317514,2021-04-19T12:27:55.000Z,Way past his 15 minutes of fame. https://t.co/...,0,0,0,0,2021-04-19,12:27:55.000Z


In [50]:
# Column names, non-null counts and dtypes after cleaning and casting.
twitter_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25010 entries, 0 to 25011
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             25010 non-null  object
 1   created_at     25010 non-null  object
 2   text           25010 non-null  object
 3   retweet_count  25010 non-null  int64 
 4   reply_count    25010 non-null  int64 
 5   like_count     25010 non-null  int64 
 6   quote_count    25010 non-null  int64 
 7   date           25010 non-null  object
 8   time           25010 non-null  object
dtypes: int64(4), object(5)
memory usage: 1.9+ MB


**Save the newly cleaned Twitter tweets dataframe to a csv**  💾

In [51]:
# Persist the cleaned tweets; index=False leaves the row index out of the file.
twitter_tweets.to_csv('../data/cleaned_twitter_tweets', index=False)