In [1]:
# https://towardsdatascience.com/sentiment-analysis-for-stock-price-prediction-in-python-bed40c65d178

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import twint
from bs4 import BeautifulSoup
import re
import yfinance as yf
from nltk.tokenize import WordPunctTokenizer
import flair

# Fixes runtime errors and compatibility issues while running Twint in notebook
import nest_asyncio
nest_asyncio.apply()


In [2]:

# Configure twint search settings (https://github.com/twintproject/twint/wiki/Configuration)
c = twint.Config()
c.Username = 'elonmusk'
c.User_full = True
c.Count = True
c.Stats = True
c.Pandas = True
c.Store_pandas = True
c.Hide_output = True
c.Pandas_clean = True
#c.Limit = 10
c.Since = '2019-01-01'
#c.Search = 'TSLA'

In [3]:
# Run twint search with configurations
twint.run.Search(c)

[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[+] Finished: Successfully collected 7159 Tweets from @elonmusk.


In [4]:
# Set columns for dataframe

def available_columns():
    return twint.output.panda.Tweets_df.columns
def twint_to_pandas(columns):
    return twint.output.panda.Tweets_df[columns]

In [5]:
available_columns()

Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
       'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
       'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
       'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
       'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest'],
      dtype='object')

In [6]:
# Turn into pandas DataFrame
df_pd = twint_to_pandas(["date", "day", "hour", "username", "tweet", "hashtags", "nlikes", "nreplies", "nretweets"])

In [7]:
df_pd.head(3)

Unnamed: 0,date,day,hour,username,tweet,hashtags,nlikes,nreplies,nretweets
0,2021-04-26 15:11:05,1,15,elonmusk,@PPathole An advantage I did have is that my f...,[],2160,213,209
1,2021-04-26 14:10:31,1,14,elonmusk,@PPathole True. The opposite in fact – I provi...,[],2680,243,269
2,2021-04-26 02:11:54,1,2,elonmusk,@BLKMDL3 @JohnnaCrider1 We just got approval t...,[],3611,337,213


In [8]:
df_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7156 entries, 0 to 7155
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   date       7156 non-null   object
 1   day        7156 non-null   int64 
 2   hour       7156 non-null   object
 3   username   7156 non-null   object
 4   tweet      7156 non-null   object
 5   hashtags   7156 non-null   object
 6   nlikes     7156 non-null   int64 
 7   nreplies   7156 non-null   int64 
 8   nretweets  7156 non-null   int64 
dtypes: int64(4), object(5)
memory usage: 503.3+ KB


In [9]:
# Convert date column to datetime

df_pd['date'] = pd.to_datetime(df_pd['date'])
df_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7156 entries, 0 to 7155
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       7156 non-null   datetime64[ns]
 1   day        7156 non-null   int64         
 2   hour       7156 non-null   object        
 3   username   7156 non-null   object        
 4   tweet      7156 non-null   object        
 5   hashtags   7156 non-null   object        
 6   nlikes     7156 non-null   int64         
 7   nreplies   7156 non-null   int64         
 8   nretweets  7156 non-null   int64         
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 503.3+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pd['date'] = pd.to_datetime(df_pd['date'])


In [10]:
# tweet_df = df_pd.set_index('date')
tweet_df = df_pd

In [11]:
tweet_df.head(3)

Unnamed: 0,date,day,hour,username,tweet,hashtags,nlikes,nreplies,nretweets
0,2021-04-26 15:11:05,1,15,elonmusk,@PPathole An advantage I did have is that my f...,[],2160,213,209
1,2021-04-26 14:10:31,1,14,elonmusk,@PPathole True. The opposite in fact – I provi...,[],2680,243,269
2,2021-04-26 02:11:54,1,2,elonmusk,@BLKMDL3 @JohnnaCrider1 We just got approval t...,[],3611,337,213


In [12]:
# Look for duplicates

duplicated = tweet_df[tweet_df.duplicated(['tweet'])]

In [13]:
duplicated.count()

date         50
day          50
hour         50
username     50
tweet        50
hashtags     50
nlikes       50
nreplies     50
nretweets    50
dtype: int64

In [14]:
duplicated

Unnamed: 0,date,day,hour,username,tweet,hashtags,nlikes,nreplies,nretweets
847,2021-01-07 05:23:50,4,5,elonmusk,The most entertaining outcome is the most likely,[],156003,3360,10409
980,2020-12-16 23:08:02,3,23,elonmusk,@TheBabylonBee 🤣🤣,[],25366,589,381
1157,2020-11-17 13:48:50,2,13,elonmusk,@westcoastbill Indeed,[],1039,54,30
1807,2020-09-04 14:21:48,5,14,elonmusk,@Tesmanian_com Yes,[],7166,323,210
2147,2020-07-30 03:54:43,4,3,elonmusk,@tobyliiiiiiiiii Sure,[],5638,264,82
2280,2020-07-21 01:19:10,2,1,elonmusk,@flcnhvy 🤣🤣,[],2983,58,29
2431,2020-07-05 16:36:16,7,16,elonmusk,@WholeMarsBlog 🤣🤣,[],17319,204,204
2447,2020-07-04 22:40:55,6,22,elonmusk,@nichegamer 🤣🤣,[],3204,175,55
2622,2020-06-21 23:18:52,7,23,elonmusk,@TeslaOwnersEBay Yes,[],1613,81,41
2637,2020-06-21 04:07:42,7,4,elonmusk,We must pass The Great Filter,[],173133,6330,14868


In [15]:
# Will keep duplicates for now

# Remove unneccesary columns

del tweet_df['username']

In [16]:
# Check length of each tweet

tweet_df['pre_clean_len'] = [len(i) for i in tweet_df['tweet']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweet_df['pre_clean_len'] = [len(i) for i in tweet_df['tweet']]


In [17]:
ordered = tweet_df.sort_values(by='pre_clean_len', ascending=False)

In [18]:
ordered.head()

Unnamed: 0,date,day,hour,tweet,hashtags,nlikes,nreplies,nretweets,pre_clean_len
4415,2019-12-13 21:31:32,5,21,@bluemoondance74 @ThugsAndMiracle @tfspeakcies...,[],6556,191,662,424
3424,2020-04-15 22:10:25,3,22,@fat__tire @Kristennetten @Robotbeat @ghotiing...,[],662,172,71,394
3423,2020-04-15 22:18:55,3,22,@sivanithu @tobyliiiiiiiiii @cabral_psyd @Erda...,[],1513,28,66,394
3430,2020-04-15 21:50:01,3,21,@Robotbeat @ghotiing @CruizVinicius @josh11987...,[],893,232,93,381
3418,2020-04-15 22:38:55,3,22,@Erdayastronaut @sivanithu @tobyliiiiiiiiii @c...,[],2458,130,136,371


In [19]:
# Need to remove retweets with characters with @.

tweet_df[tweet_df['pre_clean_len'] > 140].head()

Unnamed: 0,date,day,hour,tweet,hashtags,nlikes,nreplies,nretweets,pre_clean_len
0,2021-04-26 15:11:05,1,15,@PPathole An advantage I did have is that my f...,[],2160,213,209,175
23,2021-04-22 16:20:50,4,16,@AstroJordy I’m staying at a friend’s place in...,[],3564,315,277,272
26,2021-04-22 02:02:03,4,2,@JohnnaCrider1 @WholeMarsBlog If Tesla owners ...,[],3668,229,407,309
30,2021-04-22 01:38:03,4,1,@annerajb Newest units can probably do as much...,[],1544,55,96,261
31,2021-04-22 01:14:54,4,1,Powerwall 2 peak &amp; steady power capability...,[],26789,1265,1224,285


In [20]:
# Cleaning data

# It seems that some of the HMTL encoding has not been properly converted (https://towardsdatascience.com/another-twitter-sentiment-analysis-bb5b01ebad90)
def clean_html(tweet):
    """NEED TO FILL"""
    return str(tweet).replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"').replace('&#39;', "'")
# Remove mentions
def clean_mention(tweet):
    """NEED TO FILL"""
    return re.sub(r'@[A-Za-z0-9_]+','', tweet)
# Remove URLS
def clean_url(tweet):
    """NEED TO FILL"""
    return re.sub('https?://[A-Za-z0-9./]+','', tweet)


In [21]:
#tweet_df.loc['2021-04-22 01:14:54']['tweet']
#_ = tweet_df.loc['2021-04-22 01:14:54']['tweet'].apply(clean_html)
#_

In [22]:
#tweet_df.loc['2021-04-22 01:38:03']['tweet']

In [23]:
#_ = tweet_df.loc['2021-04-22 01:38:03']['tweet'].apply(clean_mention)

In [24]:
cleaned_tweet = tweet_df['tweet'].apply(clean_html).apply(clean_mention).apply(clean_url).str.lower()
cleaned_df = tweet_df
cleaned_df['tweet'] = cleaned_tweet

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['tweet'] = cleaned_tweet


In [25]:
cleaned_tweet

0        an advantage i did have is that my father is ...
1        true. the opposite in fact – i provide financ...
2         we just got approval to open a new superchar...
3                                                        
4                                             so awesome!
                              ...                        
7151      yes, support of the chinese government is ve...
7152                              govt has been unwelcome
7153                                     congratulations!
7154                            great work by tesla team!
7155                            congratulations  team!!  
Name: tweet, Length: 7156, dtype: object

In [26]:
del cleaned_df['pre_clean_len']
cleaned_df['clean_len'] = [len(i) for i in cleaned_df['tweet']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['clean_len'] = [len(i) for i in cleaned_df['tweet']]


In [27]:
_ = cleaned_df.sort_values(by='clean_len')

In [28]:
_

Unnamed: 0,date,day,hour,tweet,hashtags,nlikes,nreplies,nretweets,clean_len
3750,2020-03-11 11:32:06,3,11,,[],226467,4860,22333,1
3811,2020-03-02 03:59:46,1,03,,[],96835,863,12515,1
641,2021-01-31 16:07:41,7,16,,[],36330,2998,3194,1
4413,2019-12-13 21:55:49,5,21,,[],91012,922,4588,1
839,2021-01-07 13:49:47,4,13,,[],317781,7068,24434,1
...,...,...,...,...,...,...,...,...,...
6631,2019-03-16 16:38:07,6,16,model 3 partial permanent magnet induction ...,[],1354,65,79,283
4304,2019-12-28 18:22:36,6,18,we started zip2 with ~$2k from me plus my...,[],3993,126,327,283
2564,2020-06-30 18:01:31,2,18,extremely big difference between died beca...,[],1276,259,121,284
4011,2020-02-03 12:14:41,1,12,"first, we need to make it super safe & eas...",[],2423,74,138,284


In [29]:
# removing hashtag column for now

del cleaned_df['hashtags']

In [30]:
small = cleaned_df[cleaned_df['clean_len'] == 1]

In [31]:
small

Unnamed: 0,date,day,hour,tweet,nlikes,nreplies,nretweets,clean_len
9,2021-04-24 05:03:34,6,05,,227115,6506,18313,1
59,2021-04-16 13:01:25,5,13,👀,257572,14161,26867,1
139,2021-04-09 03:32:42,5,03,,350815,6523,34967,1
301,2021-03-18 22:40:07,4,22,,343139,8786,23645,1
392,2021-03-05 17:37:43,5,17,🙄,271834,16082,17736,1
...,...,...,...,...,...,...,...,...
6935,2019-02-03 23:03:22,7,23,,78670,2157,9432,1
6936,2019-02-03 22:56:56,7,22,,144404,2478,19780,1
7070,2019-01-10 23:42:49,4,23,,5582,242,400,1
7108,2019-01-06 14:45:28,7,14,,47703,872,5088,1


In [32]:
cleaned_df = cleaned_df[cleaned_df['clean_len'] > 1]

In [33]:
cleaned_df

Unnamed: 0,date,day,hour,tweet,nlikes,nreplies,nretweets,clean_len
0,2021-04-26 15:11:05,1,15,an advantage i did have is that my father is ...,2160,213,209,162
1,2021-04-26 14:10:31,1,14,true. the opposite in fact – i provide financ...,2680,243,269,72
2,2021-04-26 02:11:54,1,02,we just got approval to open a new superchar...,3611,337,213,68
3,2021-04-26 02:08:53,1,02,,10005,408,279,2
4,2021-04-25 15:41:31,7,15,so awesome!,11697,484,386,12
...,...,...,...,...,...,...,...,...
7151,2019-01-03 12:51:34,4,12,"yes, support of the chinese government is ve...",743,31,50,65
7152,2019-01-03 09:14:25,4,09,govt has been unwelcome,1001,69,54,24
7153,2019-01-02 23:47:23,3,23,congratulations!,14212,186,441,17
7154,2019-01-02 20:25:02,3,20,great work by tesla team!,32546,687,2346,25


In [34]:
_ = cleaned_df.sort_values(by='clean_len')

In [35]:
_

Unnamed: 0,date,day,hour,tweet,nlikes,nreplies,nretweets,clean_len
605,2021-02-05 06:49:29,5,06,🖤,40428,796,621,2
5152,2019-09-06 18:01:54,5,18,😲,1501,118,62,2
1567,2020-10-05 18:42:24,1,18,,436,32,16,2
1566,2020-10-05 18:58:07,1,18,💯,457,16,13,2
1436,2020-10-14 19:17:27,3,19,😢,29360,291,623,2
...,...,...,...,...,...,...,...,...
5907,2019-05-27 02:32:24,1,02,"there are already 4900 satellites in orbit,...",1493,202,195,283
3025,2020-05-09 12:49:40,6,12,"san joaquin county, right next door to alam...",2087,126,221,283
2564,2020-06-30 18:01:31,2,18,extremely big difference between died beca...,1276,259,121,284
4011,2020-02-03 12:14:41,1,12,"first, we need to make it super safe & eas...",2423,74,138,284


In [36]:
# Tokenization 
# https://towardsdatascience.com/another-twitter-sentiment-analysis-bb5b01ebad90
# https://towardsdatascience.com/sentiment-analysis-for-stock-price-prediction-in-python-bed40c65d178

