In [797]:
import pandas as pd
from textblob import TextBlob
import re

In [815]:
df = pd.read_csv('tweets.csv', encoding="utf-8")

In [816]:
df.isnull().sum()

username             0
to                1874
text                93
retweets             0
favorites            0
replies              0
id                   0
permalink            0
author_id            0
date                 0
formatted_date       0
hashtags          8192
mentions          7833
geo               8205
urls              7272
dtype: int64

In [817]:
# drop insignificant columns
df.drop(['username', 'id', 'permalink', 'author_id', 'geo', 'hashtags', 'formatted_date', 'mentions'], axis=1, inplace=True)

In [818]:
df['text'].astype(str)

0                          @TheEconomist Interesting name
1       @Benzinga I didn't take anyone's car and never...
2       My best guess for 2016: ~70% landing success r...
3                          @JeffBezos @SpaceX Thanks Jeff
4       Falcon lands on droneship, but the lockout col...
                              ...                        
8200    Yeah, very important to provide C/BiPAP device...
8201    Invasive ventilators are for worst case patien...
8202    Exactly. Moreover, all hospitals were given ex...
8203                         Will call when we reach Mars
8204                                              Exactly
Name: text, Length: 8205, dtype: object

In [819]:
df.describe()

Unnamed: 0,retweets,favorites,replies
count,8205.0,8205.0,8205.0
mean,2252.576843,18078.81,487.715661
std,10393.932164,54552.58,1594.076078
min,0.0,60.0,0.0
25%,41.0,917.0,41.0
50%,122.0,2339.0,98.0
75%,982.0,12074.0,379.0
max,391852.0,1712683.0,51016.0


In [820]:
df.head()

Unnamed: 0,to,text,retweets,favorites,replies,date,urls
0,TheEconomist,@TheEconomist Interesting name,263,1262,71,2016-01-23 19:43:17+00:00,
1,Benzinga,@Benzinga I didn't take anyone's car and never...,47,242,53,2016-01-20 17:56:05+00:00,
2,,My best guess for 2016: ~70% landing success r...,1431,5151,393,2016-01-19 04:11:54+00:00,
3,JeffBezos,@JeffBezos @SpaceX Thanks Jeff,158,1068,50,2016-01-18 03:09:21+00:00,
4,,"Falcon lands on droneship, but the lockout col...",8026,8693,1234,2016-01-18 03:07:21+00:00,https://www.instagram.com/p/BAqirNbwEc0/


In [821]:
# parse date as index and convert timezone to EST
df['date'] = pd.to_datetime( df['date'], format='%Y-%m-%d %H:%M:%S')
df.set_index('date', inplace=True)
df.index = df.index.tz_convert('EST')

In [822]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8205 entries, 2016-01-23 14:43:17-05:00 to 2020-04-01 23:12:18-05:00
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   to         6331 non-null   object
 1   text       8112 non-null   object
 2   retweets   8205 non-null   int64 
 3   favorites  8205 non-null   int64 
 4   replies    8205 non-null   int64 
 5   urls       933 non-null    object
dtypes: int64(3), object(3)
memory usage: 448.7+ KB


In [823]:
# sentiment_analysis
def sentiment_calc(text):
    try:
        return TextBlob(text).sentiment
    except:
        return (0, 0)

df['sentiment'] = df['text'].apply(sentiment_calc)
df['polarity'] = df['sentiment'].apply(lambda x: tuple(x)[0])
df['subjectivity'] = df['sentiment'].apply(lambda x: tuple(x)[1])
df.drop(['sentiment'], axis=1, inplace=True)

In [844]:
# extract info of if url is news
reg = r'https?:\/\/([\w\d.]+)\/[\w\d\/\-\?\=]*'
news_keywords = r'wsj|news|forbes|bloomberg|finance|money|investopedia|marketwatch|cnbc|times|fortune|nasdaq|cnn|huffpost|cnn|usatoday|npr'
def get_domain(url):
    url = str(url)
    result =  re.search(reg, url)
    if result:
        return result.group(1)
    return None

df['url_domain'] = df['urls'].apply(get_domain)
df['url_news'] = df['url_domain'].str.contains(news_keywords)
df['url_news'].fillna(False, inplace=True)

df.drop(['url_domain'], axis=1, inplace=True)

In [845]:
# extract info of if url is video
df['url_video'] = df['urls'].str.contains('video|youtube.com|watch')
df['url_video'].fillna(False, inplace=True)
df.drop(['urls'], axis=1, inplace=True)

In [846]:
df.drop(['urls', , 'url_domain', ''], axis=1, inplace=True)

SyntaxError: invalid syntax (<ipython-input-846-1d4103202d57>, line 1)

In [847]:
# find if tweet is a retweet
df['is_retweet'] = df['to'].notnull()
df.drop(['to'], axis=1, inplace=True)

In [848]:
# extract keyword count
tesla = '[Tt]esla'
closely_related_keywords = r'Model [\w\d]|Cars? | cars? |[Tt]ruck|[Ee]lectric|[Ss]olar|[Rr]oof|Semi|[Aa]uto(nomous|pilot|steer(ing)?)?|[Pp]anel|[Ee]nergy|[Mm]egapack|drive(ing)|[Tt]axi? | acceleration|top speed|brak(e|ing)'
money_related_keywords = r'[Aa]ffordable|[Cc]heap|[Ee]xpensive|[Pp]rice]|[Ss]tock'
other_related_keywords = r'[Ss]pace|[Mm]ars|[Ll]aunch|AI|neural|Boring Company|Hyperloop|[Ii]terview|boringcompany'
def word_match_count(text, pattern):
    if text is None:
        return 0
    text = str(text)
    return len(re.findall(pattern, text))

df['closely_related'] = df['text'].apply(word_match_count, args=(closely_related_keywords,))
df['money_related'] = df['text'].apply(word_match_count, args=(money_related_keywords,))
df['other_related'] = df['text'].apply(word_match_count, args=(other_related_keywords,))
df['tesla'] = df['text'].apply(word_match_count, args=(tesla,))
df.drop(['text'], axis=1, inplace=True)

In [849]:
# convert bool to int
df[['is_retweet', 'url_news', 'url_video']] = df[['is_retweet', 'url_news', 'url_video']].astype(int)

In [850]:
df.describe()

Unnamed: 0,retweets,favorites,replies,polarity,subjectivity,url_news,url_video,is_retweet,closely_related,money_related,other_related,tesla
count,8205.0,8205.0,8205.0,8205.0,8205.0,8205.0,8205.0,8205.0,8205.0,8205.0,8205.0,8205.0
mean,2252.576843,18078.81,487.715661,0.119454,0.334493,0.007313,0.015235,0.771603,0.160024,0.006216,0.090189,0.101523
std,10393.932164,54552.58,1594.076078,0.26683,0.312552,0.085206,0.122492,0.419826,0.509405,0.083122,0.340157,0.343575
min,0.0,60.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,41.0,917.0,41.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,122.0,2339.0,98.0,0.0,0.325,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,982.0,12074.0,379.0,0.25,0.566667,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,391852.0,1712683.0,51016.0,1.0,1.0,1.0,1.0,1.0,7.0,2.0,4.0,3.0
