In [1]:
%matplotlib inline

import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False



In [2]:
pd. set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 0)

In [3]:
ticker_list = ["aapl", "amzn", "msft"]

ticker = ticker_list[0]

ticker_upper = ticker.upper()

print(ticker_upper)

AAPL


In [4]:
alphavantage_csv_path = f"/app/StockPricePredictions/data/alphavantage/time_series_daily_adjusted/{ticker_upper}/{ticker_upper}.csv"

In [5]:
df = pd.read_csv(alphavantage_csv_path, low_memory=False)
df.sort_values(by=["date"], ascending=True, inplace=True)

df = df[df["date"]>='2010-01-01']

df["date_time"] = pd.to_datetime(df['date'])

df["day_of_week"] = df["date_time"].dt.dayofweek

df["adjusted_close_shift"] = df["5. adjusted close"].shift(-1)

df["percentage_change"] = (df["adjusted_close_shift"] - df["5. adjusted close"]) / df["5. adjusted close"] * 100.0

df["increase"] = df["percentage_change"].apply(lambda x: 1 if x > 0 else 0)

df.set_index("date", inplace=True)

In [6]:
df.shape[0]

3064

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3064 entries, 2010-01-04 to 2022-03-04
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   1. open               3064 non-null   float64       
 1   2. high               3064 non-null   float64       
 2   3. low                3064 non-null   float64       
 3   4. close              3064 non-null   float64       
 4   5. adjusted close     3064 non-null   float64       
 5   6. volume             3064 non-null   float64       
 6   7. dividend amount    3064 non-null   float64       
 7   8. split coefficient  3064 non-null   float64       
 8   date_time             3064 non-null   datetime64[ns]
 9   day_of_week           3064 non-null   int64         
 10  adjusted_close_shift  3063 non-null   float64       
 11  percentage_change     3063 non-null   float64       
 12  increase              3064 non-null   int64         
dtypes: datet

In [8]:
df.tail()
# df[df.percentage_change == 0]

Unnamed: 0_level_0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,date_time,day_of_week,adjusted_close_shift,percentage_change,increase
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-02-28,163.06,165.42,162.43,165.12,165.12,95056629.0,0.0,1.0,2022-02-28,0,163.2,-1.162791,0
2022-03-01,164.695,166.6,161.97,163.2,163.2,83474425.0,0.0,1.0,2022-03-01,1,166.56,2.058824,1
2022-03-02,164.39,167.36,162.95,166.56,166.56,79724750.0,0.0,1.0,2022-03-02,2,166.23,-0.198127,0
2022-03-03,168.47,168.91,165.55,166.23,166.23,76678441.0,0.0,1.0,2022-03-03,3,163.17,-1.840823,0
2022-03-04,164.49,165.55,162.1,163.17,163.17,83819592.0,0.0,1.0,2022-03-04,4,,,0


In [9]:
df.increase.value_counts()

1    1621
0    1443
Name: increase, dtype: int64

In [10]:
# The day of the week with Monday=0, Sunday=6.

# 5 = Saturday
# 6 = Sunday
df.day_of_week.value_counts()

1    629
2    628
3    619
4    614
0    574
Name: day_of_week, dtype: int64

In [11]:
loc = df.index.get_loc('2010-01-05')
loc

print(loc)

df.iloc[loc]["date_time"].strftime("%Y-%m-%d")

1


'2010-01-05'

In [12]:
benzinga_csv_path = f"/app/StockPricePredictions/data/alphavantage/time_series_daily_adjusted/{ticker_upper}/{ticker}_finbert_20100101_20220304.csv"

In [13]:
df_benzinga = pd.read_csv(benzinga_csv_path, low_memory=False)

In [14]:
df_benzinga['date'] = pd.to_datetime(df_benzinga['Stock'])

In [15]:
df_benzinga["day_of_week"] = df_benzinga["date"].dt.dayofweek

In [16]:
df_benzinga.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29077 entries, 0 to 29076
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Headline     29077 non-null  object        
 1   Stock        29077 non-null  object        
 2   Positive     29077 non-null  float64       
 3   Negative     29077 non-null  float64       
 4   Neutral      29077 non-null  float64       
 5   date         29077 non-null  datetime64[ns]
 6   day_of_week  29077 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(1), object(2)
memory usage: 1.6+ MB


In [17]:
df_benzinga.head(1)

Unnamed: 0,Headline,Stock,Positive,Negative,Neutral,date,day_of_week
0,"Time to Sell Apple Puts In case you missed the meteoric run in Apple over the last several months, there may still be a way to profit without having to chase the stock at these lofty levels: selling puts. When you sell puts you in a sense become an insurance salesman. You agree to buy the stock at some point in the future, should the stock fall to that level or lower before a given date. For this agreement, you receive a premium up front. With Apple currently trading at $210/share, just off its 52-week high, I propose selling out-of-the-money puts for April or July. The April $190 contract bids at $8.15 and the July $180 contract bids at $10.25. I chose such long-dated contracts for their healty premiums. Just a reminder that the first rule of thumb with selling puts is to only sell puts on a stock you would be willing to own. Since I firmly believe in Apple's fundamentals and future business prospects, I would willingly buy the stock at $190 in April. However, nothing is certain, and Apple shares currently have a lot of positive news already ""baked in"". If the upcoming tablet is a flop or never arrives at all, or if the company somehow misses earnings forecasts, the stock price may fall sharply. Since I agree with most analysts that Apple will earn roughly $11-12 in non-GAAP EPS in FY2010 and that a $240-260 price target is reasonable, selling out-of-the-money puts appears to be a great way to generate income and set the price you would be willing to pay for Apple shares. Therefore, it is my bet that Apple will stay above $190 by April and $180 by July and the put options will expire worthless, giving the seller a nice premium for either contract.",2010-01-02,0.085465,0.15849,0.756044,2010-01-02,5


In [18]:
df_benzinga.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29077 entries, 0 to 29076
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Headline     29077 non-null  object        
 1   Stock        29077 non-null  object        
 2   Positive     29077 non-null  float64       
 3   Negative     29077 non-null  float64       
 4   Neutral      29077 non-null  float64       
 5   date         29077 non-null  datetime64[ns]
 6   day_of_week  29077 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(1), object(2)
memory usage: 1.6+ MB


In [19]:
# The day of the week with Monday=0, Sunday=6.

# 5 = Saturday
# 6 = Sunday

df_benzinga.day_of_week.value_counts()

1    6598
2    5977
0    5639
3    5184
4    4465
6    667 
5    547 
Name: day_of_week, dtype: int64

In [20]:
df_benzinga.head(1)

Unnamed: 0,Headline,Stock,Positive,Negative,Neutral,date,day_of_week
0,"Time to Sell Apple Puts In case you missed the meteoric run in Apple over the last several months, there may still be a way to profit without having to chase the stock at these lofty levels: selling puts. When you sell puts you in a sense become an insurance salesman. You agree to buy the stock at some point in the future, should the stock fall to that level or lower before a given date. For this agreement, you receive a premium up front. With Apple currently trading at $210/share, just off its 52-week high, I propose selling out-of-the-money puts for April or July. The April $190 contract bids at $8.15 and the July $180 contract bids at $10.25. I chose such long-dated contracts for their healty premiums. Just a reminder that the first rule of thumb with selling puts is to only sell puts on a stock you would be willing to own. Since I firmly believe in Apple's fundamentals and future business prospects, I would willingly buy the stock at $190 in April. However, nothing is certain, and Apple shares currently have a lot of positive news already ""baked in"". If the upcoming tablet is a flop or never arrives at all, or if the company somehow misses earnings forecasts, the stock price may fall sharply. Since I agree with most analysts that Apple will earn roughly $11-12 in non-GAAP EPS in FY2010 and that a $240-260 price target is reasonable, selling out-of-the-money puts appears to be a great way to generate income and set the price you would be willing to pay for Apple shares. Therefore, it is my bet that Apple will stay above $190 by April and $180 by July and the put options will expire worthless, giving the seller a nice premium for either contract.",2010-01-02,0.085465,0.15849,0.756044,2010-01-02,5


In [21]:
df_benzinga["in_index"] = df_benzinga["Stock"].apply(lambda x: True if x in df.index else False)

In [22]:
df_benzinga[df_benzinga.in_index==False]["day_of_week"].value_counts()

6    667
5    547
0    109
4    22 
3    13 
1    10 
2    4  
Name: day_of_week, dtype: int64

In [23]:
df.index.get_loc('2010-01-04')


0

In [24]:
from datetime import date, datetime, timedelta

In [25]:
test2 =  '2010-01-09'

dtobj1 = datetime.strptime(test2, "%Y-%m-%d")

print(dtobj1)

days = timedelta(2)

print(days)



dtobj2 = (dtobj1 - days).strftime("%Y-%m-%d")
dtobj2

2010-01-09 00:00:00
2 days, 0:00:00


'2010-01-07'

In [26]:
def update_date(x):

    global ERROR_COUNTER

    if x.in_index == True:
        return x.Stock
    else:
        # try:

            dt_time = datetime.strptime(x.Stock, "%Y-%m-%d")

            # loc = df.index.get_loc(date_part)
            if x.day_of_week == 6: # Sunday to Thursday
                days = timedelta(3)
                idx_lookup = (dt_time - days).strftime("%Y-%m-%d")
                print(idx_lookup)
                try:
                    loc = df.index.get_loc(idx_lookup)
                    return df.iloc[loc]["date_time"].strftime("%Y-%m-%d")
                except Exception as e1:
                    ERROR_COUNTER += 1
                    print(str(e1))
                    return x.Stock
            if x.day_of_week == 5: # Saturday to Thursday
                days = timedelta(2)
                idx_lookup = (dt_time - days).strftime("%Y-%m-%d")
                print(idx_lookup)
                try:
                    loc = df.index.get_loc(idx_lookup)
                    return df.iloc[loc]["date_time"].strftime("%Y-%m-%d")
                except Exception as e2:
                    ERROR_COUNTER += 1
                    print(str(e2))
                    return x.Stock
            if x.day_of_week == 4: # Friday to Thursday
                days = timedelta(1)
                idx_lookup = (dt_time - days).strftime("%Y-%m-%d")
                print(idx_lookup)
                try:
                    loc = df.index.get_loc(idx_lookup)
                    return df.iloc[loc]["date_time"].strftime("%Y-%m-%d")
                except Exception as e3:
                    ERROR_COUNTER += 1
                    print(str(3))
                    return x.Stock
            if x.day_of_week == 0: # Monday to Thursday
                days = timedelta(4)
                idx_lookup = (dt_time - days).strftime("%Y-%m-%d")
                print(idx_lookup)
                try:
                    loc = df.index.get_loc(idx_lookup)
                    return df.iloc[loc]["date_time"].strftime("%Y-%m-%d")
                except Exception as e3:
                    ERROR_COUNTER += 1
                    print(str(3))
                    return x.Stock

            else:
                return x.Stock

        # except Exception as e:
        #     ERROR_COUNTER += 1
        #     print(str(e))
        #     print(x.day_of_week, x.Stock, x.in_index)
        #     return x.Stock
    # loc = df.index.get_loc('2010-01-01')
    # loc



In [27]:
# The day of the week with Monday=0, Sunday=6.

# 5 = Saturday
# 6 = Sunday

ERROR_COUNTER = 0 

df_benzinga["backfill_date"] = df_benzinga["Stock"] # df_benzinga.apply(update_date, axis=1)


In [28]:
ERROR_COUNTER

0

In [29]:
df_benzinga["in_index"] = df_benzinga["backfill_date"].apply(lambda x: True if x in df.index else False)

In [30]:
df_benzinga[df_benzinga.in_index==False]["day_of_week"].value_counts()

6    667
5    547
0    109
4    22 
3    13 
1    10 
2    4  
Name: day_of_week, dtype: int64

In [31]:
df_benzinga[df_benzinga.in_index==False].shape

(1372, 9)

In [32]:
# df_benzinga["text"] = df_benzinga["title"] + " " + df_benzinga["body"]

In [33]:
# df_benzinga['text'] = np.where(df_benzinga["text"], df_benzinga["title"], df_benzinga["text"])

In [34]:
df_benzinga = df_benzinga[["backfill_date", "Positive", "Negative", "Neutral"]]

In [35]:
df_benzinga.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29077 entries, 0 to 29076
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   backfill_date  29077 non-null  object 
 1   Positive       29077 non-null  float64
 2   Negative       29077 non-null  float64
 3   Neutral        29077 non-null  float64
dtypes: float64(3), object(1)
memory usage: 908.8+ KB


In [36]:
df_benzinga.head()

Unnamed: 0,backfill_date,Positive,Negative,Neutral
0,2010-01-02,0.085465,0.15849,0.756044
1,2010-01-04,0.044536,0.688256,0.267208
2,2010-01-04,0.943631,0.02205,0.034319
3,2010-01-04,0.016127,0.955271,0.028602
4,2010-01-04,0.818431,0.036988,0.144581


In [37]:
df_benzinga.rename(columns={"backfill_date": "date"}, inplace=True)

In [38]:
def clean_tweet(df):
    # replace URLs with a whitespace
    df['text'] = df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')
    
    return df

In [39]:
df_benzinga.head()

Unnamed: 0,date,Positive,Negative,Neutral
0,2010-01-02,0.085465,0.15849,0.756044
1,2010-01-04,0.044536,0.688256,0.267208
2,2010-01-04,0.943631,0.02205,0.034319
3,2010-01-04,0.016127,0.955271,0.028602
4,2010-01-04,0.818431,0.036988,0.144581


In [40]:
# df_benzinga = clean_tweet(df_benzinga)

In [41]:
df_benzinga.set_index("date", inplace=True)

In [42]:
df_benzinga.head()

Unnamed: 0_level_0,Positive,Negative,Neutral
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-02,0.085465,0.15849,0.756044
2010-01-04,0.044536,0.688256,0.267208
2010-01-04,0.943631,0.02205,0.034319
2010-01-04,0.016127,0.955271,0.028602
2010-01-04,0.818431,0.036988,0.144581


In [43]:
df_benzinga_duplicated_index = df_benzinga[df_benzinga.index.duplicated(keep=False)]

In [44]:
df_benzinga_duplicated_index.shape[0]

28562

In [45]:
df_benzinga_nonduplicated_index = df_benzinga[~df_benzinga.index.duplicated(keep=False)]

In [46]:
df_benzinga_nonduplicated_index.shape[0]

515

In [47]:
# df_twint.groupby('date')['negative'].mean()

df_benzinga_groupby = df_benzinga.groupby("date").agg(
     negative = ("Negative", "mean"),
     nuetral = ("Neutral", "mean"),
     positive = ("Positive", "mean"),
     # compound = ("compound", "mean"),
     # nlikes = ("nlikes", "sum"),
     # nretweets = ("nretweets", "sum"),
     )


In [48]:
df_benzinga_groupby

Unnamed: 0_level_0,negative,nuetral,positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-02,0.158490,0.756044,0.085465
2010-01-04,0.215179,0.370223,0.414598
2010-01-05,0.243738,0.529363,0.226900
2010-01-06,0.077748,0.422044,0.500207
2010-01-07,0.251646,0.076096,0.672258
...,...,...,...
2022-02-28,0.637912,0.302927,0.059162
2022-03-01,0.468003,0.407704,0.124294
2022-03-02,0.442909,0.424314,0.132777
2022-03-03,0.317731,0.523530,0.158739


In [49]:
df_merge = pd.merge(df, df_benzinga_groupby, how="inner", left_index=True, right_index=True)

In [50]:
df_merge.shape

(3044, 16)

In [51]:
df_merge.day_of_week.value_counts()

1    628
2    625
3    616
4    603
0    572
Name: day_of_week, dtype: int64

In [52]:
# df.index
# df_text.index
df_merge[df_merge.negative.isnull()]
# df_merge.shape

Unnamed: 0_level_0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,date_time,day_of_week,adjusted_close_shift,percentage_change,increase,negative,nuetral,positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1


In [53]:
df.shape

(3064, 13)

In [54]:
df_benzinga_groupby.shape

(3846, 3)

In [55]:
# df_merge.fillna(0.0, inplace=True)
# df_merge.fillna(method="ffill", inplace=True)

In [56]:
df_merge[df_merge.negative.isnull()].shape

(0, 16)

In [57]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3044 entries, 2010-01-04 to 2022-03-04
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   1. open               3044 non-null   float64       
 1   2. high               3044 non-null   float64       
 2   3. low                3044 non-null   float64       
 3   4. close              3044 non-null   float64       
 4   5. adjusted close     3044 non-null   float64       
 5   6. volume             3044 non-null   float64       
 6   7. dividend amount    3044 non-null   float64       
 7   8. split coefficient  3044 non-null   float64       
 8   date_time             3044 non-null   datetime64[ns]
 9   day_of_week           3044 non-null   int64         
 10  adjusted_close_shift  3043 non-null   float64       
 11  percentage_change     3043 non-null   float64       
 12  increase              3044 non-null   int64         
 13  negative

In [58]:
# df_merge.fillna(0.0, inplace=True)

In [59]:
##############df_merge.to_csv(f"/app/StockPricePredictions/data/alphavantage/time_series_daily_adjusted/{ticker_upper}/{ticker_upper}_WITH_BENZINGA_FINBERT_SA.csv")
df_merge.head()

Unnamed: 0_level_0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,date_time,day_of_week,adjusted_close_shift,percentage_change,increase,negative,nuetral,positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2010-01-04,213.43,214.5,212.38,214.01,6.543876,17633200.0,0.0,1.0,2010-01-04,0,6.55519,0.172889,1,0.215179,0.370223,0.414598
2010-01-05,214.6,215.59,213.25,214.38,6.55519,21496600.0,0.0,1.0,2010-01-05,1,6.450921,-1.590633,0,0.243738,0.529363,0.2269
2010-01-06,214.38,215.23,210.75,210.97,6.450921,19720000.0,0.0,1.0,2010-01-06,2,6.438996,-0.18486,0,0.077748,0.422044,0.500207
2010-01-07,211.75,212.0,209.05,210.58,6.438996,17040400.0,0.0,1.0,2010-01-07,3,6.481804,0.66483,1,0.251646,0.076096,0.672258
2010-01-08,210.3,212.0,209.06,211.98,6.481804,15986100.0,0.0,1.0,2010-01-08,4,6.424624,-0.882159,0,0.227638,0.496481,0.275881


In [60]:
# df_merge = df_merge.sample(frac=1).reset_index(drop=True)

In [61]:
df_merge.shape

(3044, 16)

In [62]:
# df_merge[(df_merge["negative"]==0.0) & (df_merge["nuetral"]==0.0) & (df_merge["positive"]==0.0)].shape

In [63]:
# df_merge = df_merge[~(df_merge["negative"]==0.0) & ~(df_merge["nuetral"]==0.0) & ~(df_merge["positive"]==0.0)].copy()

In [64]:
X = df_merge[["negative", "nuetral", "positive"]]
y = df_merge["increase"]

In [65]:
y.head()

date
2010-01-04    1
2010-01-05    0
2010-01-06    0
2010-01-07    1
2010-01-08    0
Name: increase, dtype: int64

In [66]:
SPLIT = int(0.8 * len(df_merge))

X_train = X[:SPLIT]
X_test = X[SPLIT:]

y_train = y[:SPLIT]
y_test = y[SPLIT:]



In [67]:
X_train

Unnamed: 0_level_0,negative,nuetral,positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-04,0.215179,0.370223,0.414598
2010-01-05,0.243738,0.529363,0.226900
2010-01-06,0.077748,0.422044,0.500207
2010-01-07,0.251646,0.076096,0.672258
2010-01-08,0.227638,0.496481,0.275881
...,...,...,...
2019-09-23,0.107465,0.571686,0.320848
2019-09-24,0.138127,0.659392,0.202481
2019-09-25,0.016325,0.843971,0.139704
2019-09-26,0.400860,0.417549,0.181590


In [68]:
from sklearn.ensemble import RandomForestClassifier

In [69]:
rfc = RandomForestClassifier(n_estimators=200, criterion="entropy")
rfc.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=200)

In [70]:
preds = rfc.predict(X_test)

In [71]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [72]:
matrix = confusion_matrix(y_test, preds)

In [73]:
matrix

array([[148, 136],
       [141, 184]])

In [74]:
score = accuracy_score(y_test, preds)

In [75]:
score

0.5451559934318555

In [76]:
print(classification_report(y_test, preds))


              precision    recall  f1-score   support

           0       0.51      0.52      0.52       284
           1       0.57      0.57      0.57       325

    accuracy                           0.55       609
   macro avg       0.54      0.54      0.54       609
weighted avg       0.55      0.55      0.55       609



In [77]:
corr_matrix = df_merge[["negative", "nuetral", "positive", "increase"]].corr()
print (corr_matrix)

          negative   nuetral  positive  increase
negative  1.000000 -0.653451 -0.496630 -0.015334
nuetral  -0.653451  1.000000 -0.332498  0.013191
positive -0.496630 -0.332498  1.000000  0.003979
increase -0.015334  0.013191  0.003979  1.000000


In [78]:
from statsmodels.tsa.stattools import grangercausalitytests

In [79]:
#perform Granger-Causality test
grangercausalitytests(df_merge[["positive", "increase"]], maxlag=[1])


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=129.3183, p=0.0000  , df_denom=3040, df_num=1
ssr based chi2 test:   chi2=129.4459, p=0.0000  , df=1
likelihood ratio test: chi2=126.7684, p=0.0000  , df=1
parameter F test:         F=129.3183, p=0.0000  , df_denom=3040, df_num=1


{1: ({'ssr_ftest': (129.3183332091189, 2.246789020777627e-29, 3040.0, 1),
   'ssr_chi2test': (129.4459499853121, 5.4171687511900634e-30, 1),
   'lrtest': (126.76837505202457, 2.0877134958697827e-29, 1),
   'params_ftest': (129.31833320911898, 2.246789020777627e-29, 3040.0, 1.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f6f42bc64f0>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f6f42d139d0>,
   array([[0., 1., 0.]])])}