In [379]:
# See utils.py for the imports

from utils import *
import os
import warnings
warnings.filterwarnings('ignore')

import timeit
import nest_asyncio
nest_asyncio.apply()
#%load_ext line_profiler

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# Where to save the figures (adapted this from https://github.com/ageron/handson-ml2)
PROJECT_ROOT_DIR = "."
PROJECT_ID = "stock_movement_tweet_data_wrangling"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", PROJECT_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=72):
    """ 
    resolution quality
    300 high 
    150 medium
    72 low
    """
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
    
plt.style.use('fivethirtyeight')

In [380]:
def combine_tweets_user(df):
    # convert dates to datetime and split
    datetimes = pd.to_datetime(df['created_at'])
    df['time'], df['date'] = datetimes.dt.time, datetimes.dt.date
    
    # get open and closing times
    open_t = dt.strptime('09:30:00', '%H:%M:%S').time()
    close_t = dt.strptime('16:00:00', '%H:%M:%S').time()
    
    # filter dataframes based off when tweet occurs
    before_df = df[df['time'] < open_t]
    during_df = df[((df['time'] >= open_t) & (df['time'] < close_t))]
    after_df = df[df['time'] >= close_t]
    
    # reset tweet times to next open or close time
    before_df['time'] = open_t
    during_df['time'] = close_t
    
    # if tweet occurs after close, set it to next day and set time to open
    after_df['date'] += timedelta(days=1)
    after_df['time'] = open_t

    # reassemble dataframes with updated dates/times
    df = before_df.append(during_df).append(after_df)
    
    # sort by date and time
    df['date'] = pd.to_datetime(df['date'].astype(str) + ' ' + df['time'].astype(str))
    df = df.sort_values(by='date', ascending=True)
    
    # collect tweets in a dictionary (key:value ->  Date : [tweets])
    collected_tweets = {}
    tweet = []
    combined_df = pd.DataFrame(index=sorted(list(set(df['date']))))
    combined_df['tweet'] = [[] for _ in range(len(combined_df))]
    
    # iterate over dataframe and add tweets to matching dates
    for i in range(len(combined_df)):
        time = combined_df.index[i]
        matching_dates = df[df['date'] == time]
        for tweet in matching_dates['tweet']:
            combined_df['tweet'].iloc[i].append(tweet)
     
    # add number of tweets and username and return
    combined_df['num_tweets'] = combined_df['tweet'].apply(lambda x: len(x))
    combined_df['username'] = df['username'].unique()[0]
    return combined_df

def combine_tweets(full_df, verbose=False):
    """
    group tweets together by next open/close date
    
    Arguments:
    full_df -- A dataframe of users tweets and created_at dates. 
    
    Returns:
    merged_df -- A dataframe of grouped user tweets as lists before the next open/close datetime
  
    A tweet is grouped via the following criteria:
    market_open = 09:30:00 EST
    market_close = 16:00:00 EST
    
    - If tweet created_at occurs before market_open on same date, it is grouped as before_market_tweets. 
    - It tweet is on or after market_open but before market_close, (i.e. during market hours) it is grouped as during_market_tweets. 
    - If tweet is on or after market_close, it is grouped as after_market_tweets.
    """
    
    users = full_df.username.unique()
    merged_df = pd.DataFrame()
    if verbose:
        max_length = 11963
        start = timeit.default_timer()
        print('')
        print('digesting dataframes....')
        print('---{:.2f} % complete -------'.format(0.0))
    for user in users:
        merged_df = merged_df.append(combine_tweets_user(full_df[full_df['username'] == user]))
        if verbose:
            print('')
            print('--{:.2f} % complete -------'.format(100*len(merged_df) / max_length))
    if verbose:
        stop = timeit.default_timer()
        print('digestion completed ---- runtime {:.2f} seconds'.format(stop - start))
    return merged_df.reset_index(drop=False).rename(columns={'index':'date'})
    
def combine_tweets_stocks(ceos_merged, stocks_full):
    new_df = pd.DataFrame()
    for user in ceos_merged['username'].unique():
        ticker = handles_tickers[user]
        tweet_df = ceos_merged[ceos_merged['username'] == user]
        stock_df = stocks_full[stocks_full['ticker'] == ticker]
        new_df = new_df.append(tweet_df.merge(stock_df, how='left', on='date')).dropna(subset=['ticker'])  
    return new_df

def fix_closed_market_tweets(test):
    test = test.dropna()
    test.reset_index(inplace=True, drop=True)
    test = test.set_index('date')
    for i in range(len(test)):
        test['tweet'].iloc[i] += " "

    i = 0
    combined=pd.DataFrame()

    while i < len(test):
        if test['price'].iloc[i] == 0:
            combined = test.iloc[i]
            j = i + 1
            while test['price'].iloc[j] == 0:
                combined += test.iloc[j]
                j += 1
            username = test.iloc[j]['username']
            price = test.iloc[j]['price']
            percent_change = test.iloc[j]['percent change']
            test.iloc[j] += combined
            test['username'].iloc[j] = username
            test['price'].iloc[j] = price
            test['percent change'].iloc[j] = percent_change
            i = j
        i += 1
    test = test[test['price'] !=0]
    return test

def organize_stocks(df): 
    """
    Combines the open/close dates and prices into a single dataframe
    """
    stock_open = pd.DataFrame()
    stock_close = pd.DataFrame()

    stock_open['date'] = pd.to_datetime(df['date'].dt.strftime('%Y-%m-%d 09:30:00'))
    stock_close['date'] = pd.to_datetime(df['date'].dt.strftime('%Y-%m-%d 16:00:00'))

    stock_open['price'] = df['open']
    stock_close['price'] = df['close']
    
    start_date_open = dt.strftime(stock_open.date.min(), '%Y-%m-%d %H:%M:%S')
    start_date_close = dt.strftime(stock_close.date.min(), '%Y-%m-%d %H:%M:%S')

    end_date_open = dt.strftime(stock_open.date.max(), '%Y-%m-%d %H:%M:%S')
    end_date_close = dt.strftime(stock_close.date.max(), '%Y-%m-%d %H:%M:%S')
    
    date_indx_open = pd.Series(pd.date_range(start_date_open, end_date_open), name='date')
    date_indx_close = pd.Series(pd.date_range(start_date_close, end_date_close), name='date')
    
    stock_open = pd.merge(date_indx_open, stock_open, how='left')
    stock_close = pd.merge(date_indx_close, stock_close, how='left')
    
    stock = pd.concat([stock_open, stock_close])
    return stock.sort_values(by='date', ascending=True)

# Collecting Data

In [381]:
# Collect the stock data from yahoo Finance and organize a dictionary of dataframes with the open/close prices combined. 
usernames = ['elonmusk', 'levie', 'jack', 'Benioff','richardbranson', 'JohnLegere']
stock_names = ['TSLA','BOX', 'TWTR','CRM','SPCE', 'TMUS']
user_stock_mapping = dict(zip(usernames, stock_names))
stocks = {stock:pd.read_pickle(f'data/{stock}.pkl') for stock in stock_names}

In [382]:
# Get needed columns and change the date to lowercase
for stock in stocks:
    stocks[stock].reset_index(inplace=True)
    stocks[stock] =  stocks[stock][['Date','Open', 'Close']]
    stocks[stock].columns = stocks[stock].columns.str.lower()

In [383]:
for stock in stocks:
    stocks[stock] = organize_stocks(stocks[stock])

### De-trending the time series stock data 

We will use the percent change in the stock prices as a target that we will later bin into categories.  

In [384]:
# create a percent change column from the price feature
for stock in stocks:
    stocks[stock]['percent change'] = stocks[stock]['price'].pct_change()
    stocks[stock].fillna(0, inplace=True)
    
# convert to dataframe
stocks_full = pd.DataFrame()
for stock in stocks:
    # get stock ticker for grouping
    stocks[stock]['ticker'] = stock
    stocks_full = stocks_full.append(stocks[stock])

# pickle and save
#pd.to_pickle(stocks_full, './data/stocks_full_df.pkl')

## 1.2.5 Collecting the tweets <a id='1.2.5_Collecting_Tweet'></a>
We will collect the CEOs tweets over the same time-span as the collected stocks

In [385]:
# Get the tweets from the CEOs of the companies
        
handles_tickers = {'elonmusk':'TSLA', 'levie':'BOX','jack':'TWTR', 'Benioff':'CRM', 
            'richardbranson':'SPCE', 'JohnLegere':'TMUS'}

start_date = {}
for user in handles_tickers:
    start_date[user] = dt.strftime(stocks[handles_tickers[user]].reset_index().date.min(), '%Y-%m-%d %H:%M:%S')
    
ceos = pd.read_pickle('./data/ceos.pkl')

In [386]:
ceos_merged = combine_tweets(ceos, verbose=True)
wrangled_df = combine_tweets_stocks(ceos_merged, stocks_full)


digesting dataframes....
---0.00 % complete -------

--18.38 % complete -------

--26.44 % complete -------

--38.93 % complete -------

--54.63 % complete -------

--71.44 % complete -------

--100.00 % complete -------
digestion completed ---- runtime 73.59 seconds


In [387]:
wrangled_df.loc[22]['tweet']

22    [Journalist Q&amp;A for 30 mins and embargo en...
22    [7,000 people registered for BoxWorks this wee...
22    [When was the last time you tried something fo...
Name: tweet, dtype: object

In [388]:
def combine_tweets_stocks(ceos_merged, stocks_full):
    new_df = pd.DataFrame()
    for user in ceos_merged['username'].unique():
        ticker = handles_tickers[user]
        tweet_df = ceos_merged[ceos_merged['username'] == user]
        stock_df = stocks_full[stocks_full['ticker'] == ticker]
        
        # drop na values where 
        new_df = new_df.append(tweet_df.merge(stock_df, how='left', on='date')).dropna(subset=['ticker'])  
    return new_df

In [389]:
####################
######!!WORK!!######
####################

In [390]:
####################
######!!WORK!!######
####################

In [391]:
####################
######!!WORK!!######
####################

In [392]:
####################
######!!WORK!!######
####################

In [393]:
####################
######!!WORK!!######
####################

In [394]:
####################
######!!WORK!!######
####################

In [395]:
df = wrangled_df[wrangled_df['username'] == 'elonmusk'].copy(deep=True)

In [396]:
df.set_index('date')['2021-07-16 16:00:00': '2021-07-19 09:30:00']

Unnamed: 0_level_0,tweet,num_tweets,username,price,percent change,ticker
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-07-16 16:00:00,"[@TeslaNY Do you even press?, @Teslarati Impro...",4,elonmusk,644.219971,-0.015977,TSLA
2021-07-17 09:30:00,"[@fael097 Pure coincidence!, @ValaAfshar Even ...",9,elonmusk,0.0,0.0,TSLA
2021-07-17 16:00:00,"[@SamTwits Nice, https://t.co/d4ZOSKZESP, Tap...",12,elonmusk,0.0,0.0,TSLA
2021-07-18 09:30:00,"[@ArtifactsHub And all-time hodl champion, @Ar...",6,elonmusk,0.0,0.0,TSLA
2021-07-18 16:00:00,"[@thePiggsBoson Problem 1st, theory 2nd is for...",1,elonmusk,0.0,0.0,TSLA
2021-07-19 09:30:00,"[@DragTimes @Tesla Nice, @grimnut @Tesla @Whol...",3,elonmusk,629.890015,-0.022244,TSLA


In [397]:
df = df.set_index('date')
combined = pd.DataFrame()
i = 0
while i < len(df):
    
    # find where price is zero (saturday, sunday, or holiday)
    if df['price'].iloc[i] == 0:
        combined = df[['tweet', 'num_tweets']].iloc[i]
        
        # iterate through dataframe until next non-zero price entry
        j = i + 1
        while df['price'].iloc[j] == 0:
            combined += df[['tweet', 'num_tweets']].iloc[j]
            j += 1
        username = df.iloc[j]['username']
        price = df.iloc[j]['price']
        percent_change = df.iloc[j]['percent change']
        df.iloc[j] += combined
        df['username'].iloc[j] = username
        df['price'].iloc[j] = price
        df['percent change'].iloc[j] = percent_change
        i = j
    i += 1
df = df[df['price'] !=0]

In [398]:
df

Unnamed: 0_level_0,tweet,num_tweets,username,price,percent change,ticker
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-08-23 16:00:00,[Journalist Q&amp;A for 30 mins and embargo en...,2,elonmusk,44.967999,0.002318,TSLA
2016-08-30 16:00:00,[Thanks for the longstanding faith in SpaceX. ...,3,elonmusk,42.268002,-0.022072,
2016-08-31 16:00:00,"[@Lockyep Not allowed, according to HK regulat...",5,elonmusk,42.402000,0.007508,TSLA
2016-09-01 16:00:00,[Loss of Falcon vehicle today during propellan...,1,elonmusk,40.153999,-0.039424,TSLA
2016-09-02 09:30:00,[Finishing Autopilot blog postponed to end of...,2,elonmusk,40.466000,0.007770,TSLA
...,...,...,...,...,...,...
2021-07-16 09:30:00,"[@AaronS5_ @FrenchieEAP @karpathy Yes, @Austin...",4,elonmusk,654.679993,0.006271,TSLA
2021-07-16 16:00:00,"[@TeslaNY Do you even press?, @Teslarati Impro...",4,elonmusk,644.219971,-0.015977,TSLA
2021-07-19 09:30:00,"[@DragTimes @Tesla Nice, @grimnut @Tesla @Whol...",31,elonmusk,629.890015,-0.022244,
2021-07-19 16:00:00,"[@jack @BitcoinMagazine @CathieDWood Sure, I h...",2,elonmusk,646.219971,0.025925,TSLA


In [399]:
df['2021-07-16 16:00:00': '2021-07-19 09:30:00']

Unnamed: 0_level_0,tweet,num_tweets,username,price,percent change,ticker
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-07-16 16:00:00,"[@TeslaNY Do you even press?, @Teslarati Impro...",4,elonmusk,644.219971,-0.015977,TSLA
2021-07-19 09:30:00,"[@DragTimes @Tesla Nice, @grimnut @Tesla @Whol...",31,elonmusk,629.890015,-0.022244,


In [14]:
####################
######!!SNIP!!######
####################
def snip1(df):
    df = df.set_index('date')
    combined = pd.DataFrame()
    i = 0
    while i < len(df):
        if df['price'].iloc[i] == 0:
            combined = df.iloc[i]
            j = i + 1
            while df['price'].iloc[j] == 0:
                combined += df.iloc[j]
                j += 1
            username = df.iloc[j]['username']
            price = df.iloc[j]['price']
            percent_change = df.iloc[j]['percent change']
            df.iloc[j] += combined
            df['username'].iloc[j] = username
            df['price'].iloc[j] = price
            df['percent change'].iloc[j] = percent_change
            i = j
        i += 1
    df = df[df['price'] !=0]
    return df

In [18]:
df = wrangled_df[wrangled_df['username'] == 'elonmusk']
df

Unnamed: 0,date,tweet,num_tweets,username,price,percent change,ticker
22,2016-08-23 16:00:00,[Journalist Q&amp;A for 30 mins and embargo en...,2,elonmusk,44.967999,0.002318,TSLA
23,2016-08-28 09:30:00,"[@Kotaku one of my favorite games as a kid, @B...",2,elonmusk,44.162666,0.011081,TSLA
24,2016-08-30 16:00:00,[Thanks for the longstanding faith in SpaceX. ...,1,elonmusk,42.268002,-0.022072,TSLA
25,2016-08-31 16:00:00,"[@Lockyep Not allowed, according to HK regulat...",5,elonmusk,42.402000,0.007508,TSLA
26,2016-09-01 16:00:00,[Loss of Falcon vehicle today during propellan...,1,elonmusk,40.153999,-0.039424,TSLA
...,...,...,...,...,...,...,...
2194,2021-07-18 09:30:00,"[@ArtifactsHub And all-time hodl champion, @Ar...",6,elonmusk,638.153341,-0.010441,TSLA
2195,2021-07-18 16:00:00,"[@thePiggsBoson Problem 1st, theory 2nd is for...",1,elonmusk,645.553304,0.011596,TSLA
2196,2021-07-19 09:30:00,"[@DragTimes @Tesla Nice, @grimnut @Tesla @Whol...",3,elonmusk,629.890015,-0.024263,TSLA
2197,2021-07-19 16:00:00,"[@jack @BitcoinMagazine @CathieDWood Sure, I h...",2,elonmusk,646.219971,0.025925,TSLA


In [19]:
test = snip1(df)

In [25]:
test['num_tweets'] 

date
2016-08-23 16:00:00    2
2016-08-28 09:30:00    2
2016-08-30 16:00:00    1
2016-08-31 16:00:00    5
2016-09-01 16:00:00    1
                      ..
2021-07-18 09:30:00    6
2021-07-18 16:00:00    1
2021-07-19 09:30:00    3
2021-07-19 16:00:00    2
2021-07-20 09:30:00    5
Name: num_tweets, Length: 2177, dtype: int64

In [None]:
####################
######!!REST!!######
####################

In [None]:
####################
######!!FULL!!######
####################

def fix_closed_market_tweets(test):
    test = test.dropna()
    test.reset_index(inplace=True, drop=True)
    test = test.set_index('date')
    for i in range(len(test)):
        test['tweet'].iloc[i] += " "

    i = 0
    combined=pd.DataFrame()

    while i < len(test):
        if test['price'].iloc[i] == 0:
            combined = test.iloc[i]
            j = i + 1
            while test['price'].iloc[j] == 0:
                combined += test.iloc[j]
                j += 1
            username = test.iloc[j]['username']
            price = test.iloc[j]['price']
            percent_change = test.iloc[j]['percent change']
            test.iloc[j] += combined
            test['username'].iloc[j] = username
            test['price'].iloc[j] = price
            test['percent change'].iloc[j] = percent_change
            i = j
        i += 1
    test = test[test['price'] !=0]
    return test

In [None]:
####################
######!!WORK!!######
####################

In [None]:
####################
######!!WORK!!######
####################

In [None]:
####################
######!!WORK!!######
####################

In [None]:
####################
######!!WORK!!######
####################

In [None]:
####################
######!!WORK!!######
####################

In [None]:
####################
######!!WORK!!######
####################

## 1.5 Saving Dataframes <a id='1.5_Exporting_DataFrames'></a>