# Import packages

In [1]:
import os
import re
import time
import requests
import numpy as np
import pandas as pd
from scipy.stats import t
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from statsmodels.api import OLS, add_constant

# NOTE(review): yahoo_crawler is presumably a project-local module; it must be
# importable from the notebook's working directory -- confirm.
import yahoo_crawler
# Shared crawler instance; get_stock_data below uses it to obtain the crumb
# and to download price data.
yc = yahoo_crawler.yahoo_crawler()

yahooo_crawler


In [None]:
# Notebook-wide matplotlib defaults, applied in a single call.
plt.rcParams.update({
    'font.family'     : 'serif',   # serif fonts for all text
    'figure.facecolor': '1.',      # grey level 1.0, i.e. a white figure background
    'axes.axisbelow'  : False,     # keep ticks and grid lines from being drawn below the data
})

# Update local stock list

In [None]:
def update_stocklist_data():
    ''' Download the company lists of NASDAQ, NYSE and AMEX.
    Source: www.nasdaq.com

    One file per exchange is written to the ``stock_list`` folder below the
    current working directory; the folder is created when it is missing and
    existing files are overwritten.

    The historical file name ``<exchange>.xlsx`` is kept so that files from
    earlier runs are replaced rather than duplicated. The reading cell of
    this notebook parses these files with ``pd.read_csv``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.RequestException
        On connection problems or when a request exceeds the timeout.
    '''
    # create stock_list data folder; os.path.join keeps the path portable
    # instead of hard-coding the Windows separator
    folder = os.path.join(os.getcwd(), 'stock_list')
    os.makedirs(folder, exist_ok = True)

    # source url, %s is replaced by the exchange name
    url = 'https://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=%s&render=download'

    # available exchanges
    exchange = ['nasdaq', 'nyse', 'amex']

    for exchg in exchange:
        # a timeout prevents the notebook from hanging on a stalled connection
        resp = requests.get(url%exchg, timeout = 30)
        # fail loudly instead of saving an error page as if it were a stock list
        resp.raise_for_status()
        with open(os.path.join(folder, '%s.xlsx'%exchg), 'wb') as output:
            output.write(resp.content)

# update stock_list: runs the download defined above every time this cell is
# executed (one HTTP request per exchange, local files are overwritten)
update_stocklist_data()

# Read local stock list

In [None]:
# data folder (os.path.join instead of a hard-coded Windows separator)
folder = os.path.join(os.getcwd(), 'stock_list')

# file names; sorted because os.listdir returns them in arbitrary order and
# the order decides which row survives drop_duplicates below
files = sorted(os.listdir(folder))

# one DataFrame per exchange file, with a short preview printed for each
stolis_df_list = []
for f in files:
    df = pd.read_csv(os.path.join(folder, f))
    stolis_df_list.append(df)
    print(f.upper(),df.shape, '\n==================================================\n',
          df[['Name']].head() )
    print('==================================================\n')

# concatenate companies from the three exchanges
stolis_df_ = pd.concat(stolis_df_list, axis = 0)

# drop out funds: keep only rows whose 'industry' value is not missing
stolis_df_ = stolis_df_[stolis_df_['industry'].notna()]

# drop duplicated company names (the first occurrence is kept)
stolis_df = stolis_df_.drop_duplicates(['Name']).reset_index(drop = True)

print('Total %s companies, unique %s companies.' % (stolis_df_.shape[0], stolis_df.shape[0]))

# Fetch yahoo finance stock data

In [None]:
def get_stock_data(symbol, start, end,):
    ''' Download the daily price history of one symbol.

    The request goes through the module-level crawler ``yc``. The elapsed
    wall-clock time is printed before returning.

    Parameters
    ----------
    symbol : stock id passed to the crawler, e.g. '^GSPC'
    start  : first date of the requested range (sent as 'period1')
    end    : last date of the requested range (sent as 'period2')

    Returns
    -------
    DataFrame holding only the 'Adj Close' column, indexed by 'Date' and
    sorted by that index.
    '''
    started = time.time()

    # only the crumb goes into the request parameters; the cookie is not used here
    _, crumb = yc.get_yahoo_crumb_cookie()

    query = dict(period1  = start,
                 period2  = end,
                 interval = '1d',
                 events   = 'history',
                 crumb    = crumb)
    prices = yc.GetStockPrice(symbol = symbol, params = query, request_type = 'post')

    # date becomes the index, in ascending order
    prices = prices.set_index('Date').sort_index()

    elapsed = time.time() - started
    print('%s seconds elapsed...' % round(elapsed, 4))
    return prices[['Adj Close']]

In [None]:
# The S&P 500 index serves two purposes: it is the market series and its
# date index defines the trading calendar used to align every stock.
SP500          = get_stock_data('^GSPC', '2014-12-01', '2019-04-19')
standard_index = SP500.index

In [None]:
def reindex_data(symbol, tweets_date, standard_index = standard_index, how = 'test'):
    ''' Fetch the prices of *symbol* around a tweet, aligned to the trading calendar.

    Parameters
    ----------
    symbol         : stock id handed to get_stock_data
    tweets_date    : timestamp of the tweet; rows with index >= tweets_date
                     count as "on or after" the tweet
    standard_index : trading dates used for alignment. The default is the
                     module-level ``standard_index`` as it was when this
                     function was defined.
    how            : 'test'  -> 10 trading days before the tweet,
                     'train' -> 251 trading days before the tweet;
                     both keep 11 trading days from the tweet onwards.

    Returns
    -------
    DataFrame with 21 ('test') or 262 ('train') rows; dates missing from the
    download are present as NaN because of the reindex.

    Raises
    ------
    ValueError : *how* is neither 'test' nor 'train' (raised explicitly
                 rather than via assert, which is skipped under ``python -O``).
    IndexError : the calendar does not hold enough trading days around
                 tweets_date.
    '''
    # number of trading days kept before the tweet for each mode
    lookback = {'test': 10, 'train': 251}
    if how not in lookback:
        raise ValueError('Parameter *how* must be either \'test\' or \'train\'!')

    n_before = lookback[how]
    n_after  = 11

    before = standard_index[ standard_index <  tweets_date ]
    after  = standard_index[ standard_index >= tweets_date ]

    # the download range reaches one trading day beyond the rows kept on each side
    if len(before) <= n_before or len(after) <= n_after:
        raise IndexError('Not enough trading days around %s for %s (how = %r)!'
                         % (tweets_date, symbol, how))
    start = before[-(n_before + 1)]
    end   = after[n_after]

    df = get_stock_data(symbol = symbol, start = start, end = end)

    # align with the trading calendar; missing dates become NaN rows
    df = df.reindex(standard_index)

    return pd.concat([ df[ df.index <  tweets_date ].tail(n_before),
                       df[ df.index >= tweets_date ].head(n_after) ])

# Fetch marked tweets data

In [None]:
# Load the marked tweets; later cells read the columns 'ticker', 'date',
# 'company' and 'type' from this table.
tweets = pd.read_excel('mentioned_stock.xlsx')
# remove blanks inside the ticker symbols
tweets['ticker'] = tweets['ticker'].map(lambda ticker: ticker.replace(' ', ''))
tweets.head()

In [None]:
# earliest tweet date (notebook display only; nothing is stored)
tweets['date'].min()

In [None]:
# pick the third row (position 2) of the tweets table as a worked example
tweets_i = tweets.iloc[2,:]
tweets_i

In [None]:
# ticker of the example tweet, used by the single-stock demo call further down
symbol = tweets_i['ticker']
symbol

In [None]:
def my_normalize(df):
    ''' Return a standardized version of *df* without modifying the input.

    The mean is subtracted first, then the result is divided by its own
    standard deviation (pandas default, ddof = 1).
    '''
    centered = df - df.mean()
    return centered / centered.std()

In [None]:
def _style_axis(ax, ylabel, percent = False):
    ''' Apply the look shared by the three diagnostic panels to *ax*. '''
    ax.set_ylabel(ylabel)
    for side in ('top', 'left', 'right'):
        ax.spines[side].set_color('none')
    ax.legend(loc = 'best')
    ax.grid(linestyle = ':', axis = 'y')
    if percent:
        ax.yaxis.set_major_formatter(mtick.PercentFormatter(1))
    # make *ax* the current axes so plt.xticks rotates the labels of this panel
    plt.sca(ax)
    plt.xticks(rotation = 15)


def _plot_data_set(symbol, price, ret, AR):
    ''' Draw normalized prices, log returns and abnormal returns of *symbol*. '''
    fig = plt.figure(figsize = [10, 6])

    # top left: standardized price levels of the stock and the index
    ax1 = fig.add_subplot(221)
    ax1.plot(my_normalize(price['S_i']), label = symbol)
    ax1.plot(my_normalize(price['S_m']), label = 'S&P500' )
    _style_axis(ax1, 'Normalize price serires')

    # top right: log returns
    ax2 = fig.add_subplot(222)
    ax2.plot(ret['S_i'], '-*', label = '%s log return'%symbol)
    ax2.plot(ret['S_m'], '-*', label = 'S&P500 log return' )
    _style_axis(ax2, 'Log Return', percent = True)

    # bottom, full width: abnormal returns
    ax3 = fig.add_subplot(212)
    ax3.plot(AR, '-*', label = '%s AR'%symbol)
    _style_axis(ax3, 'AR', percent = True)

    plt.suptitle('Price, Return, and AR of %s vs. S&P500'%symbol, fontsize = 20)
    plt.show()


def fetch_data_set(symbol, tweets_date, plot = False):
    ''' Prepare the event-study dataset of one stock around one tweet.

    Steps
    -----
    1. Fetch the 'train' price window of *symbol* (see reindex_data), put the
       S&P500 series next to it and fill gaps forward, then backward.
    2. Compute log returns and regress the stock return on the market return
       (with intercept) over the first 240 return observations.
    3. Abnormal return (AR) = actual return minus the return predicted by
       that regression, for every date of the window.

    Parameters
    ----------
    symbol      : stock id
    tweets_date : timestamp of the tweet
    plot        : when True, show prices, returns and AR in one figure

    Returns
    -------
    dict with the keys 'price', 'return', 'params', 't_values', 'p_values'
    and 'AR'.

    Raises
    ------
    ValueError : 10 or more stock prices are missing, or gaps remain after
                 filling. (Raised explicitly rather than via assert so the
                 check also runs under ``python -O``.)
    '''
    # number of return observations used to estimate the market model
    n_estimation = 240

    ###################################################################################################
    #1, Prepare stock price series
    S_i = reindex_data(symbol = symbol, tweets_date = tweets_date, how = 'train')

    S_m = SP500.reindex(S_i.index)

    df = pd.concat([S_i, S_m ], axis = 1)
    df.columns = ['S_i', 'S_m']

    if df['S_i'].isna().sum() >= 10:
        raise ValueError('Missing values of stock %s are more than 10!'%symbol)

    # fillna for stocks: forward fill, then backward fill for leading gaps
    # (ffill/bfill replace the deprecated fillna(method = ...))
    df = df.ffill().bfill()

    if df['S_i'].isna().any():
        raise ValueError('Failed to fillna for tock %s!'%symbol)

    ###################################################################################################
    #2, Prepare stock return series

    # calculate log return of stock_i and S&P500
    ret = np.log(df).diff().dropna()

    # regression R_i against R_m on the estimation window
    estimation = ret.iloc[: n_estimation]
    model = OLS( estimation['S_i'], add_constant( estimation['S_m'] ) ).fit()

    # predict R_i over the whole window, estimation and event part alike
    R_i_pred = model.predict( add_constant( ret['S_m'] ) )

    AR = ret['S_i'] - R_i_pred

    # store price, return, parameters, t values, and p values, AR
    dict_ = {
        'price'   : df.copy(),
        'return'  : ret.copy(),
        'params'  : model.params ,
        't_values': model.tvalues,
        'p_values': model.pvalues,
        'AR'      : AR.copy()
            }

    if plot:
        _plot_data_set(symbol, df, ret, AR)

    return dict_

In [None]:
# demo: build (and plot) the dataset for the example tweet selected above,
# then list the keys of the returned dictionary
dict_ = fetch_data_set(symbol, tweets_i['date'], plot = True)
dict_.keys()

In [None]:
# one entry per tweet, keyed by '<ticker> <tweet timestamp>'
mega_dict = {}

# Wall-clock timer for the whole loop. It is deliberately NOT named `t`:
# at module level that would rebind `t`, which this notebook imports from
# scipy.stats, and break any later use of the distribution (e.g. t.cdf).
loop_start = time.time()

for _, tweets_i in tweets.iterrows():
    symbol      = tweets_i['ticker'].replace(' ', '')
    tweets_date = tweets_i['date']

    try:
        data_set = fetch_data_set(symbol, tweets_date, plot = False)

    except Exception as e:
        # best effort: report the failure and continue with the next tweet
        print('Failed to fetch data for %s!'%symbol)
        print(e)
        print('==========================================\n')

    else:
        mega_dict[ '%s %s'%(symbol,tweets_date) ] = data_set
        print('Successfully fetched data for %s...'%symbol)
        print('==========================================\n')


print(time.time() - loop_start,'sec(s) elapsed...')

In [None]:
# list the '<ticker> <timestamp>' keys of all tweets whose data was fetched
mega_dict.keys()

In [None]:
# inspect one entry; this hard-coded key raises KeyError unless the fetch for
# that ticker/timestamp succeeded in the loop above
mega_dict['F 2019-03-20 20:51:41'].keys()

# Summarize OLS result

In [None]:
def label_sig(p_value):
    ''' Label the significance level according to p_value.

    Returns '***' for p <= 0.01, '**' for p <= 0.05, '*' for p <= 0.1 and
    '' otherwise. A NaN p_value fails every comparison and therefore also
    yields '', so the result is always a string that can be concatenated.
    '''
    if p_value <= 0.01:
        return '***'
    if p_value <= 0.05:
        return '**'
    if p_value <= 0.1:
        return '*'
    return ''

In [None]:
def summary_OLS(mega_dict):
    ''' Summarize the market-model OLS result of every entry of mega_dict.

    Parameters
    ----------
    mega_dict : dict keyed by '<ticker> <tweet timestamp>' whose values hold
                'params', 't_values' and 'p_values' (as returned by
                fetch_data_set). The ticker must not contain blanks.

    Returns
    -------
    DataFrame with the columns 'Stock', 'Intercept' and 'Slope'. The two
    coefficient columns are strings of the form 'estimate(t value)stars'.
    An empty mega_dict gives an empty frame with these columns.

    The company name is looked up in the module-level ``tweets`` table; an
    IndexError is raised when no row matches ticker and date.
    '''
    columns = ['Stock', 'Intercept', 'Slope']
    rows = []
    for k, v in mega_dict.items():
        # split at the first blank instead of using a regex: this also works
        # for tickers that contain '.', '-' or '^'
        symbol, date_str = k.split(' ', 1)
        date = pd.to_datetime(date_str)

        company = tweets[ (tweets['ticker'] == symbol) & (tweets['date'] == date) ]['company'].values[0]

        intercept = '%.5f(%.2f)%s'%(
                                v['params'  ]['const'],
                                v['t_values']['const'],
                                label_sig(v['p_values']['const'])
                                    )

        slope = '%.2f(%.2f)%s'%(
                                v['params'  ]['S_m'],
                                v['t_values']['S_m'],
                                label_sig(v['p_values']['S_m'])
                                    )

        rows.append( {
                                'Stock'    : company,
                                'Intercept': intercept,
                                'Slope'    : slope
                                    } )

    # build the frame once; DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call
    return pd.DataFrame(rows, columns = columns)

# summarize the regressions of all fetched tweets, save the table as CSV in
# the working directory and show its first rows
regression_res = summary_OLS(mega_dict)
regression_res.to_csv('coefficients_from_market_model_regression.csv')
regression_res.head()

# AAR test

In [None]:
def collect_same_emotion(mega_dict = mega_dict, emotion = 1.):
    ''' Collect the abnormal-return series of all tweets with the same emotion.

    Parameters
    ----------
    mega_dict : dict keyed by '<ticker> <tweet timestamp>' whose values hold
                an 'AR' series. The default is the module-level ``mega_dict``
                object as it was when this function was defined. The ticker
                must not contain blanks.
    emotion   : value compared with the 'type' column of the module-level
                ``tweets`` table (the notebook uses 1 and -1).

    Returns
    -------
    list of AR series with their index reset to 0..n-1, so that series from
    different dates can be lined up by position.
    '''
    data_lst = []
    for k, v in mega_dict.items():
        # split at the first blank instead of using a regex: this also works
        # for tickers that contain '.', '-' or '^'
        symbol, date_str = k.split(' ', 1)
        date = pd.to_datetime(date_str)

        emotion_ = tweets[ (tweets['ticker'] == symbol ) \
                         & (tweets['date'  ] == date   ) ]['type'].values[0]

        if emotion_ == emotion:
            data_lst.append( v['AR'].reset_index(drop = True) )
    return data_lst

In [None]:
# AR series of the tweets whose 'type' equals 1
data_pos = collect_same_emotion(emotion =  1)

# AR series of the tweets whose 'type' equals -1
data_neg = collect_same_emotion(emotion = -1)

print('Positive sample size is %s, Negative sample size is %s.'%( len(data_pos), len(data_neg) ))

In [None]:
def summary_AAR(emotion = 1):
    ''' Summarize the average abnormal return (AAR) for a specific emotion.

    The AR series of all tweets with the given emotion are averaged across
    tweets. The standard deviation of that average over the first 240 rows
    scales the AAR of the last 21 rows (event days -10..10) into t
    statistics; p values are two-sided with 240 - 2 degrees of freedom.

    Parameters
    ----------
    emotion : value of the tweets' 'type' column to select

    Returns
    -------
    DataFrame indexed by event day (-10..10) with the columns
    'AAR' (percentage string plus significance stars), 't_statistic' and
    'p_values' (both rounded to 4 decimals).

    Raises
    ------
    ValueError : no series exists for the requested emotion.
    '''
    # Imported locally under a private alias so that the test does not depend
    # on the global name `t`, which is easily overwritten by a timer variable
    # in a notebook.
    from scipy.stats import t as student_t

    n_estimation = 240   # rows used to estimate the dispersion of the AAR
    n_event      = 21    # event window: 10 days before .. 10 days after

    data = collect_same_emotion(emotion = emotion)
    if not data:
        raise ValueError('No abnormal return series found for emotion %s!' % emotion)

    # one column per tweet, rows aligned by position
    df = pd.concat( data, axis = 1 )

    sigma = df.head(n_estimation).mean(axis = 1).std()

    AAR = df.tail(n_event).mean(axis = 1)

    AAR.index = np.arange(-10, 11, 1)

    t_stats = AAR / sigma

    # two-sided p value; the survival function stays accurate in the far tails
    p_values = pd.Series( 2 * student_t.sf(np.abs(t_stats), df = n_estimation - 2),
                          index = t_stats.index )

    report = pd.concat([AAR, t_stats, p_values], axis = 1)

    report.columns = ['AAR', 't_statistic', 'p_values']

    # `or ''` guards against a label helper that returns None for NaN p values
    report['AAR'        ] = report.apply(lambda x: "{:.2%}".format(x['AAR']) + (label_sig(x['p_values']) or ''), axis = 1)
    report['t_statistic'] = report['t_statistic'].round(4)
    report['p_values'   ] = report['p_values'   ].round(4)
    return report

In [None]:
# AAR table for tweets of type 1, saved as CSV in the working directory
pos_AAR_summary = summary_AAR(emotion =  1)
pos_AAR_summary.to_csv('aar_positive_tweets.csv')

# AAR table for tweets of type -1, saved as CSV in the working directory
neg_AAR_summary = summary_AAR(emotion = -1)
neg_AAR_summary.to_csv('aar_negative_tweets.csv')