In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import twint
from bs4 import BeautifulSoup
import re
import yfinance as yf
from nltk.tokenize import WordPunctTokenizer
from datetime import datetime as dt
from datetime import timedelta

# Fixes runtime errors and compatibility issues while running Twint in notebook
import nest_asyncio
nest_asyncio.apply()

In [2]:
tsla = yf.Ticker('TSLA')
tsla_df = tsla.history(period = '5y')

In [3]:
tsla_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-05-02,48.299999,48.638,46.964001,48.360001,19219500,0,0.0
2016-05-03,47.472,47.782001,46.324001,46.464001,21511000,0,0.0
2016-05-04,46.057999,46.891998,44.080002,44.512001,43502500,0,0.0
2016-05-05,45.692001,45.728001,41.958,42.306,56274000,0,0.0
2016-05-06,42.174,43.273998,41.622002,42.986,28426000,0,0.0


In [4]:
tsla_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1259 entries, 2016-05-02 to 2021-04-30
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Open          1259 non-null   float64
 1   High          1259 non-null   float64
 2   Low           1259 non-null   float64
 3   Close         1259 non-null   float64
 4   Volume        1259 non-null   int64  
 5   Dividends     1259 non-null   int64  
 6   Stock Splits  1259 non-null   float64
dtypes: float64(5), int64(2)
memory usage: 78.7 KB


In [5]:
tsla_df.reset_index(inplace=True)

In [6]:
# get starting value for twint search
start_date = dt.strftime(tsla_df.Date.min(), '%Y-%m-%d')
start_date

'2016-05-02'

In [7]:
# apply twint configurations

c = twint.Config()
c.Username = 'elonmusk'
c.User_full = True
c.Count = True
c.Stats = True
c.Pandas = True
c.Store_pandas = True
c.Hide_output = True
c.Pandas_clean = True
#c.Limit = 10
c.Since = start_date
#c.Search = 'TSLA'

In [8]:
twint.run.Search(c)

[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[+] Finished: Successfully collected 11173 Tweets from @elonmusk.


In [9]:
twint.output.panda.Tweets_df.columns;

In [10]:
columns = ['date', 'hour', 'tweet']

In [11]:
musk_tweets = twint.output.panda.Tweets_df[columns]

In [12]:
# Convert to datetime and remove time from datetime series

musk_tweets['date'] = pd.to_datetime(musk_tweets['date']).dt.date

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  musk_tweets['date'] = pd.to_datetime(musk_tweets['date']).dt.date


In [13]:
musk_tweets.head()

Unnamed: 0,date,hour,tweet
0,2021-05-02,18,@heydave7 @Tesla A remarkable junction in history
1,2021-05-02,17,@jaentwistle One of many reasons why we need l...
2,2021-05-02,16,@jpr007 @EPCalderhead Haha
3,2021-05-01,23,I love Art Deco
4,2021-05-01,21,@RSTYCG 👍


In [14]:
musk_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11173 entries, 0 to 11172
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    11173 non-null  object
 1   hour    11173 non-null  object
 2   tweet   11173 non-null  object
dtypes: object(3)
memory usage: 262.0+ KB


In [15]:
musk_tweets = musk_tweets.sort_values(by='date', ascending = True)

In [16]:
musk_tweets.reset_index(inplace=True)

In [17]:
del musk_tweets['index']

In [18]:
# Clean Data 
# (https://towardsdatascience.com/another-twitter-sentiment-analysis-bb5b01ebad90)

tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))

def tweet_cleaner(text):
    """Cleans tweets by removing @ mentions, html encoding, urls, and utf-8"""
    
    # Clean tweet encodings and replace with pat1/pat2
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    
    # If Utf-8 is in tweet
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
        
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

In [19]:
musk_tweets['tweet'] = musk_tweets['tweet'].apply(tweet_cleaner)

In [20]:
def tweet_df_maker(tweet_df):
    """takes a dataframe with date, tweet, and hours and returns dataframe with day and tweet.
       Tweets from same date are grouped together
       Tweets after market close to next day."""
    
    tweets_collected = {}
    
    
    # If tweet hour is < 16, apply to current day, if >= 16 apply to next day
    
    current_tweet = ""
    tweets_collected[tweet_df['date'].iloc[0]] = current_tweet
   
    for i in range(len(tweet_df.index)):
        
        current_date = tweet_df['date'].iloc[i]
        if current_date in tweets_collected:
            if int(tweet_df['hour'].iloc[i]) < 16:
                tweets_collected[current_date] += " " + str(tweet_df['tweet'].iloc[i])
            else: 
                next_date = current_date + timedelta(days=1)
                tweets_collected[next_date] = ""
                tweets_collected[next_date] += " " + str(tweet_df['tweet'].iloc[i])
        else:
            tweets_collected[current_date] = str(tweet_df['tweet'].iloc[i])
            
    tweet_df = pd.DataFrame.from_dict(tweets_collected, orient='index', columns = ['Tweet'])
    tweet_df.reset_index(inplace=True)
    tweet_df = tweet_df.rename(columns={'index':'Date'})
    
    return tweet_df

In [21]:
tweets = tweet_df_maker(musk_tweets)

In [36]:
# Set up tweet_stocks

tweet_stock = tweets.set_index('Date').join(tsla_df.set_index('Date'), how='left')
tweet_stock = tweet_stock[['Tweet', 'Close']]
tweet_stock['TSLA Price'] = tweet_stock['Close'].fillna(method='ffill')
del tweet_stock['Close']
tweet_stock['Percent Change'] = tweet_stock['TSLA Price'].pct_change()

In [37]:
tweet_stock

Unnamed: 0_level_0,Tweet,TSLA Price,Percent Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-05-02,bioweapon defense mode is real this is what h...,48.360001,
2016-05-05,tesla is increasing the production ramp as fas...,42.306000,-0.125186
2016-05-06,woohoo may need to increase size of rocket sto...,42.986000,0.016073
2016-05-07,i think so certainly agree that it is first a...,42.986000,0.000000
2016-05-09,coming soon,41.784000,-0.027963
...,...,...,...
2021-04-29,best selling by revenue in possibly by unit v...,677.000000,-0.025058
2021-04-30,fsd display v will show actual probability di...,709.440002,0.047917
2021-05-01,woke james bond snl may,709.440002,0.000000
2021-05-02,i love art deco,709.440002,0.000000
