In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import string

from pyScript import get_tweets, cleanup_columns, cleanup_rows, display_img, top_characteristics

%matplotlib inline

# Web/Twitter scraping

In [2]:
# get_tweets('(from:jimmyfallon) until:2020-03-22 since:2020-03-01', 'data_20')

In [3]:
# get_tweets('(from:jimmyfallon) until:2019-03-22 since:2019-03-01', 'data_19')

In [4]:
data_20 = pd.read_csv('data_20.csv')
data_19 = pd.read_csv('data_19.csv')

In [5]:
list(data_20)

['Unnamed: 0',
 'screen_name',
 'username',
 'user_id',
 'tweet_id',
 'tweet_url',
 'timestamp',
 'timestamp_epochs',
 'text',
 'text_html',
 'links',
 'hashtags',
 'has_media',
 'img_urls',
 'video_url',
 'likes',
 'retweets',
 'replies',
 'is_replied',
 'is_reply_to',
 'parent_tweet_id',
 'reply_to_users']

As I checked the data, I realized that some tweets are not there and I had to manually add them to the csv file. I merge it to our 2020 dataframe.

In [6]:
data_20_missing_rows = pd.read_csv('data_20_missing_rows.csv', sep=';', header=1)

In [7]:
data_20_missing_rows.head()

Unnamed: 0.1,Unnamed: 0,screen_name,username,user_id,tweet_id,tweet_url,timestamp,timestamp_epochs,text,text_html,...,has_media,img_urls,video_url,likes,retweets,replies,is_replied,is_reply_to,parent_tweet_id,reply_to_users
0,0,jimmyfallon,jimmy fallon,15485441,1241176709770117120,/jimmyfallon/status/1241176709770117120,2020-03-21 02:36:00,,Had a great chat with @JBALVIN and we’re raisi...,,...,False,[],,389,71,27,True,False,,[]
1,1,jimmyfallon,jimmy fallon,15485441,1241176598419816448,/jimmyfallon/status/1241176598419816448,2020-03-21 02:35:00,,The Tonight Show: At Home Edition ( @JBALVIN )...,,...,True,[],,3900,288,328,True,False,,[]
2,2,jimmyfallon,jimmy fallon,15485441,1240840137841524737,/jimmyfallon/status/1240840137841524737,2020-03-20 04:18:00,,Tonight we are raising money for @SavetheChild...,,...,False,[],,427,64,20,True,False,,[]
3,3,jimmyfallon,jimmy fallon,15485441,1240835093419241472,/jimmyfallon/status/1240835093419241472,2020-03-20 03:58:00,,The Tonight Show: At Home Edition (Jennifer Ga...,,...,True,[],,3300,360,298,True,False,,[]
4,4,jimmyfallon,jimmy fallon,15485441,1240451755474931712,/jimmyfallon/status/1240451755474931712,2020-03-19 02:35:00,,The Tonight Show: At Home Edition ( @Lin_Manue...,,...,True,[],,5300,623,244,True,False,,[]


In [8]:
data_20 = data_20.append(data_20_missing_rows, ignore_index=True, sort=False)
data_20.head()

Unnamed: 0.1,Unnamed: 0,screen_name,username,user_id,tweet_id,tweet_url,timestamp,timestamp_epochs,text,text_html,...,has_media,img_urls,video_url,likes,retweets,replies,is_replied,is_reply_to,parent_tweet_id,reply_to_users
0,0,jimmyfallon,jimmy fallon,15485441,1241180473344262144,/jimmyfallon/status/1241180473344262144,2020-03-21 01:50:59,1584755000.0,I miss you bud!!! Love you!!,"<p class=""TweetTextSize js-tweet-text tweet-te...",...,False,[],,113,14,14,True,True,1.241179e+18,"[{'screen_name': 'jamespoyser', 'user_id': '15..."
1,1,jimmyfallon,jimmy fallon,15485441,1241079393683943424,/jimmyfallon/status/1241079393683943424,2020-03-20 19:09:20,1584731000.0,That was very cool. We heard you!Thank you to ...,"<p class=""TweetTextSize js-tweet-text tweet-te...",...,False,[],,2716,224,160,True,False,,[]
2,2,jimmyfallon,jimmy fallon,15485441,1240688758069968896,/jimmyfallon/status/1240688758069968896,2020-03-19 17:17:05,1584638000.0,I’m huge in the 3 year old demo. Please tell h...,"<p class=""TweetTextSize js-tweet-text tweet-te...",...,False,[],,2574,118,103,True,False,,[]
3,3,jimmyfallon,jimmy fallon,15485441,1240431406204977152,/jimmyfallon/status/1240431406204977152,2020-03-19 00:14:27,1584577000.0,"Fine, you can paint Daddy’s nails. #MyQuaranti...","<p class=""TweetTextSize js-tweet-text tweet-te...",...,False,[],,8950,663,481,True,False,,[]
4,4,jimmyfallon,jimmy fallon,15485441,1240431255503724547,/jimmyfallon/status/1240431255503724547,2020-03-19 00:13:52,1584577000.0,"Hey guys. It's been a crazy week, so for our f...","<p class=""TweetTextSize js-tweet-text tweet-te...",...,False,[],,6405,473,3461,True,False,,[]


In [9]:
data_20.shape

(39, 22)

In [10]:
data_20.to_csv('data_20_completed.csv')

In [11]:
_header = ['tweetId', 'tweetUrl', 'timestamp', 'text', 'hashtags', 'hasMedia', 'imgUrl', 'likes', 'retweets', 'replies', 'isReplied']
_drop = ['Unnamed: 0', 'screen_name', 'username', 'user_id', 'timestamp_epochs', 'text_html', 'links', 'video_url', 'is_reply_to', 'parent_tweet_id', 'reply_to_users']

In [12]:
data_20 = cleanup_columns(data_20, names=_header, to_drop=_drop)
data_20.head()

Unnamed: 0,tweetId,tweetUrl,timestamp,text,hashtags,hasMedia,imgUrl,likes,retweets,replies,isReplied
0,1241180473344262144,/jimmyfallon/status/1241180473344262144,2020-03-21 01:50:59,I miss you bud!!! Love you!!,[],False,[],113,14,14,True
1,1241079393683943424,/jimmyfallon/status/1241079393683943424,2020-03-20 19:09:20,That was very cool. We heard you!Thank you to ...,['CowbellChallenge'],False,[],2716,224,160,True
2,1240688758069968896,/jimmyfallon/status/1240688758069968896,2020-03-19 17:17:05,I’m huge in the 3 year old demo. Please tell h...,[],False,[],2574,118,103,True
3,1240431406204977152,/jimmyfallon/status/1240431406204977152,2020-03-19 00:14:27,"Fine, you can paint Daddy’s nails. #MyQuaranti...",['MyQuarantineInSixWords'],False,[],8950,663,481,True
4,1240431255503724547,/jimmyfallon/status/1240431255503724547,2020-03-19 00:13:52,"Hey guys. It's been a crazy week, so for our f...",['MyQuarantineInSixWords'],False,[],6405,473,3461,True


In [13]:
data_19 = cleanup_columns(data_19, names=_header, to_drop=_drop)
data_19.head()

Unnamed: 0,tweetId,tweetUrl,timestamp,text,hashtags,hasMedia,imgUrl,likes,retweets,replies,isReplied
0,1108780905903149061,/jimmyfallon/status/1108780905903149061,2019-03-21 17:22:22,What a team you’ve got? Loved them. Crush it o...,[],False,[],1413,112,35,True
1,1108390399364812800,/jimmyfallon/status/1108390399364812800,2019-03-20 15:30:38,Great performance last night from @findingflet...,['FallonTonight'],False,[],1727,188,39,True
2,1108367192419155969,/jimmyfallon/status/1108367192419155969,2019-03-20 13:58:25,"Tonight on the show: @armiehammer is here, @AJ...",['FallonTonight'],False,[],929,107,45,True
3,1108158944248299522,/jimmyfallon/status/1108158944248299522,2019-03-20 00:10:55,"Tonight!! @kobebryant, Aidy Bryant, @nataliemo...",['FallonTonight'],False,[],1240,96,50,True
4,1108010438963482624,/jimmyfallon/status/1108010438963482624,2019-03-19 14:20:49,I tried to throw the\nPing pong ball in the cu...,['SpringBreakHaiku'],False,[],4822,349,150,True


# Data Mining

In [14]:
data_20.isna().sum()

tweetId      0
tweetUrl     0
timestamp    0
text         0
hashtags     0
hasMedia     0
imgUrl       0
likes        0
retweets     0
replies      0
isReplied    0
dtype: int64

In [15]:
# dropping the duplicates and nans
print(f'Shape before: {data_20.shape[0]}')
data_20 = cleanup_rows(df=data_20)
print(f'Shape after: {data_20.shape[0]}')

Shape before: 39
Shape after: 39


In [16]:
# dropping the duplicates and nans
print(f'Shape before: {data_19.shape[0]}')
data_19 = cleanup_rows(df=data_19)
print(f'Shape after: {data_19.shape[0]}')

Shape before: 31
Shape after: 31


In [17]:
# top tweets
_tops = 10

In [18]:
# convert the imgUrl list to a string
data_20.imgUrl = data_20.imgUrl.apply(''.join)
data_19.imgUrl = data_19.imgUrl.apply(''.join)
data_20.head(1)

Unnamed: 0,tweetId,tweetUrl,timestamp,text,hashtags,hasMedia,imgUrl,likes,retweets,replies,isReplied
0,1241180473344262144,/jimmyfallon/status/1241180473344262144,2020-03-21 01:50:59,I miss you bud!!! Love you!!,[],False,[],113,14,14,True


In [19]:
_top_tweets_20 = data_20.sort_values(by='likes', ascending=False).head(_tops)
_lowest_tweets_20 = data_20.sort_values(by='likes', ascending=False).tail(_tops)
_top_tweets_20.head(1)

Unnamed: 0,tweetId,tweetUrl,timestamp,text,hashtags,hasMedia,imgUrl,likes,retweets,replies,isReplied
11,1238913079821455361,/jimmyfallon/status/1238913079821455361,2020-03-14 19:41:10,Are you stuck at home with your family going s...,[],False,[],13960,748,4533,True


In [20]:
_top_tweets_19 = data_19.sort_values(by='likes', ascending=False).head(_tops)
_lowest_tweets_19 = data_19.sort_values(by='likes', ascending=False).tail(_tops)
_top_tweets_19.head(1)

Unnamed: 0,tweetId,tweetUrl,timestamp,text,hashtags,hasMedia,imgUrl,likes,retweets,replies,isReplied
15,1103333591726067712,/jimmyfallon/status/1103333591726067712,2019-03-06 16:36:41,Ha!! I like this! #ChimmyClockhttps://twitter....,['ChimmyClockhttps'],False,[],73476,21203,1121,True


In [21]:
# list(_top_tweets.imgUrl)[0][0]

In [22]:
from IPython.display import HTML
from PIL import Image

In [23]:
HTML(_top_tweets_20[['text', 'likes', 'imgUrl']].to_html(formatters={'imgUrl': display_img}, escape=False))

Unnamed: 0,text,likes,imgUrl
11,Are you stuck at home with your family going stir crazy right now? Tweet me your favorite things to do with your kids in the house and I’ll RT my favs!!,13960,
35,The Tonight Show: At Home Edition (The First One) https://youtu.be/bEQl6Pt-654,12100,
3,"Fine, you can paint Daddy’s nails. #MyQuarantineInSixWords",8950,
36,#WashYourHandsSong,8800,
5,Thank you for doing this. And Dear Theodosia was such a BONUS!!!!! Whaaaaaa??!??https://twitter.com/Lin_Manuel/status/1240408633067876353 …,7666,
4,"Hey guys. It's been a crazy week, so for our first Tonight Show: At Home Edition Hashtags, use six words to describe your time staying home and tag it with #MyQuarantineInSixWords. Could be on the show!",6405,
10,"Right now I'm thinking about what we can do to help our most vulnerable populations - children who are losing the one meal they may rely on per day, our friends and family who are facing job issues, the elderly, and low-income families.pic.twitter.com/OnimRUZKnK",5886,
32,The Tonight Show: At Home Edition ( @Lin_Manuel ) https://youtu.be/SOS7e6UTNPI,5300,
12,I’m available to do Oh Na Na Na at weddings and birthday parties. Please contact my agent: @charlidamelio.https://twitter.com/POPSUGAR/status/1237561892983324673 …,4328,
16,"All new show tonight @Nick_Offerman, @charlidamelio and @HAIMtheband (Summer Girl special request). #FallonTonight #NBC 11:30PM set your DVR.",4091,


In [24]:
HTML(_lowest_tweets_20[['text', 'likes', 'imgUrl']].to_html(formatters={'imgUrl': display_img}, escape=False))

Unnamed: 0,text,likes,imgUrl
34,We’re going to be highlighting a different charity each night - tonight it’s @FeedingAmerica go here to learn more and donate if you can!,1000,
13,New jam with @HAIMtheband “I Liked an Instagram Post”https://youtu.be/0iypSh2TXVY,975,
37,Speaking of tirelessly - my man @ChefJoseAndres ' org @WCKitchen is again showing us all what we can be as we work through these weird times. Sending love and thanks. #ChefsForAmerica,825,
33,Tonight we’re raising money for the @BCEFA Emergency Fund - go here to learn more and donate if you can!,728,
30,Tonight we are raising money for @SavetheChildren - Follow them on Instagram http://instagram.com/savewithstories for more info or click the Donate button on the YouTube video!,427,
28,Had a great chat with @JBALVIN and we’re raising money for @fams2gether which is part of @domesticworkers - click the Donate button on our YouTube video if you’re able to help!,389,
22,@MooreOrange I love tacos.,137,
0,I miss you bud!!! Love you!!,113,
14,Go!! Go!!! Go!!,93,
15,You are welcome whenever. Always fun!,62,


In [25]:
print('Characteristics of top tweets for 2008')
top_characteristics(_top_tweets_20, _tops)

Characteristics of top tweets for 2008


Unnamed: 0,hasMedia,hasHashtag,avarage hashtags,avarage text length,avarage likes,avarage retweet,avarage replies,avarage isReplied
10 Tweets,4,10,10.9,129.1,7748.6,663.4,981.7,1.0


In [26]:
print('Characteristics of lowest tweets for 2008')
top_characteristics(_lowest_tweets_20, _tops)

Characteristics of lowest tweets for 2008


Unnamed: 0,hasMedia,hasHashtag,avarage hashtags,avarage text length,avarage likes,avarage retweet,avarage replies,avarage isReplied
10 Tweets,0,10,3.7,96.4,474.9,64.2,20.4,1.0


In [27]:
print('Characteristics of top tweets for 2018')
top_characteristics(_top_tweets_19, _tops)

Characteristics of top tweets for 2018


Unnamed: 0,hasMedia,hasHashtag,avarage hashtags,avarage text length,avarage likes,avarage retweet,avarage replies,avarage isReplied
10 Tweets,0,10,13.6,136.2,15317.9,2827.3,393.0,1.0


In [28]:
print('Characteristics of lowest tweets for 2018')
top_characteristics(_lowest_tweets_19, _tops)

Characteristics of lowest tweets for 2018


Unnamed: 0,hasMedia,hasHashtag,avarage hashtags,avarage text length,avarage likes,avarage retweet,avarage replies,avarage isReplied
10 Tweets,0,10,11.8,114.9,921.0,64.1,48.0,1.0
