In [1]:
#Imports
import pandas as pd
from pandas import DataFrame, Series
from pandas.tseries.offsets import DateOffset
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline 

from pandas.tools.plotting import lag_plot
from pandas.plotting import autocorrelation_plot

from statsmodels.graphics.tsaplots import plot_acf


from scipy.misc import imread
import arch.unitroot 




  from pandas.core import datetools


## Loading Datasets

### Loading the stock price data.

In [2]:
unitedQuotes = pd.read_csv('/home/theresa/Desktop/DataSets/UnitedTweets/UALQuotesEpoch.txt')

In [3]:
unitedQuotes.head()

Unnamed: 0,EpochTime,EpochTime.1,CLOSE,HIGH,LOW,OPEN,VOLUME,first differences
0,1495719000,1495719000,79.16,79.18,79.16,79.18,6997,
1,1495719060,1495719060,79.24,79.24,79.03,79.05,991,0.08
2,1495719120,1495719120,79.31,79.31,79.095,79.095,2500,0.07
3,1495719180,1495719180,79.4,79.432,79.16,79.16,2500,0.09
4,1495719240,1495719240,79.35,79.405,79.33,79.405,1200,-0.05


### Loading the tweets, which were previously analyzed for their sentiment polarity.

In [4]:
unitedTweets = pd.read_csv('/home/theresa/Desktop/DataSets/UnitedTweets/UnitedTweet_sentiment_polarity.csv')

In [5]:
unitedTweets.head()

Unnamed: 0,created_at,CleanText,Sentiment,Sentiment Category
0,2017-05-22 09:32:08-04:00,nathanoafc kfcbarstool rumour sea lion works u...,0.0,Neutral
1,2017-05-22 09:32:31-04:00,iamjohnk aminespn united airlines flight full ...,0.35,Positive
2,2017-05-22 09:33:28-04:00,united hold customer representative minutes wa...,0.0,Neutral
3,2017-05-22 09:33:32-04:00,kevwodonnell would appear snp taking pr advice...,-0.175,Neutral
4,2017-05-22 09:33:39-04:00,oldpicsarchive united airlines stewardess wi m...,0.0,Neutral


In [6]:
#unitedTimes = pd.to_datetime(unitedTweets['created_at'])
#unitedTimes = unitedTimes.tz_localize('UTC').tz_convert("US/Eastern")
unitedTweets.index = unitedTweets['created_at']
#type(unitedTweets.index)

Below, I load the 1 minute sentiment averages I computed earlier via the following code:

    sentiment1m = unitedTweets['Sentiment'].resample('1t', how='mean')
    

For the sake of the following analysis, I also filled in 0 for the sentiment of minutes not containing 
tweets about United Airlines.  (I can see arguments against doing this, but let's assume for the moment that if someone has an emotion, they tweet about it!)

I also have already added a column containing the first differences.

In [7]:
sentiment1m = pd.read_csv('/home/theresa/Desktop/DataSets/UnitedTweets/sentiment1mWithDifference')

In [8]:
sentiment1m.head()

Unnamed: 0.1,Unnamed: 0,Sentiment,first differences
0,0,0.175,
1,2017-05-22 09:33:00-04:00,-0.058333,-0.233333
2,2017-05-22 09:34:00-04:00,0.357143,0.415476
3,2017-05-22 09:35:00-04:00,0.0,-0.357143
4,2017-05-22 09:36:00-04:00,0.088889,0.088889


In [9]:
sentiment1m.columns

Index(['Unnamed: 0', 'Sentiment', 'first differences'], dtype='object')

In [10]:
sentiment1m.index = sentiment1m['Unnamed: 0']
sentiment1m = sentiment1m.drop(['Unnamed: 0'], axis=1)
sentiment1m.head()

Unnamed: 0_level_0,Sentiment,first differences
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.175,
2017-05-22 09:33:00-04:00,-0.058333,-0.233333
2017-05-22 09:34:00-04:00,0.357143,0.415476
2017-05-22 09:35:00-04:00,0.0,-0.357143
2017-05-22 09:36:00-04:00,0.088889,0.088889


In [11]:
indices = [0]

In [12]:
for index in sentiment.index:
    indices.append(index)
    
    

NameError: name 'sentiment' is not defined

In [13]:
sentiment1m.index = indices

ValueError: Length mismatch: Expected axis has 29509 elements, new values have 1 elements

In [None]:
sentiment1m.head()

## Testing for auto-correlations

The function numpy.corrcoef calculates the Pearson coefficient of the two inputted lists.  We see below that the 1-minute change in UAL stock price can't be used to predict the next minute's change.

In [None]:
np.corrcoef(unitedQuotes['first differences'][1:-1], unitedQuotes['first differences'][2:])

Below is the graph of the first differences plotted against a lagged version of itself (here, the lag is 1).

In [None]:

lag_plot(unitedQuotes['first differences'][1:], lag=1)
plt.show()


Just to make sure, I wrote the following function to see if one minute's changes could reasonably predict the change further on into the future.

In [None]:
def PearsonCoeff(list1,list2, n) :
#input two lists of equal length.  The output will be a list of the Pearson Coefficients
#when the lags are in range(1,n+1).

#the length of each list should be more than n
    correlations = []
    for i in range(1,n+1):
        corr = np.corrcoef(list1[1:-i],list2[i+1:])[0,1]
        correlations.append((i, corr))
    return correlations

In [None]:
correlations = PearsonCoeff(unitedQuotes['first differences'],unitedQuotes['first differences'],4000)

In [None]:
max(abs(y) for (x,y) in correlations)

In [None]:
np.argmax([y for (x,y) in correlations])

In [None]:
len([y for (x,y) in correlations if abs(y)>=.1])

There are a couple lags which lie just outside of the interval indicating 99% confidence that the first differences aren't autocorrelated.  This confidence interval is indicated on the graph by the dotted lines.

In [None]:
data = unitedQuotes['first differences'][1:]

autocorrelation_plot(data)

## Checking for stationarity vs. unit roots

#### Tests on the stock prices time-series

Below, I do two complementary tests to check for unit roots in the first differences of the UAL stock prices.  The ADF test has a null hypothesis saying that there is a unit root in the time-series, and the KPSS test has a null hypothesis that the time-series is trend-stationary.  

A miniscule p-value for the ADF and a large p-value for the KPSS together mean that it's unlikely that the time-series in question has a unit root.  A unit root would destroy the validity of standard analyses on time-series, so these results mean we can proceed without further differencing.

In [None]:
from arch.unitroot import ADF
adf = ADF(unitedQuotes['first differences'][1:], method='AIC')
print(adf.summary().as_text())
#this shows that the differences do not have a unit root

In [None]:
arch.unitroot.KPSS(unitedQuotes["first differences"][1:])

#### Tests on the differenced tweet sentiment time-series
Below, we see some evidence that there is not a unit root in the 1 minute tweeted sentiment averages after taking the first differece. Since having a unit root throws off a lot of time-series analyses, this is a good thing.

In [None]:
adf = ADF(sentiment1m['first differences'][1:], method='AIC')
print(adf.summary().as_text())

In [None]:
arch.unitroot.KPSS(sentiment1m['first differences'][1:]) 


At this point, neither series of first differences has unit roots, and the first differences in the stock quootes is not autocorrelated.  So now it's time to prepare the two series for Granger causality analysis.
