# Import Dependencies

In [162]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor

from transformers import pipeline

import seaborn as sns
import matplotlib.pyplot as plt 

# Data Prep

## Import data

In [93]:
djia_df = pd.read_csv('../data/DJIA_table.csv', index_col='Date')
reddit_df = pd.read_csv('../data/RedditNews.csv')

In [94]:
djia_df.head(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-07-01,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141
2016-06-30,17712.759766,17930.609375,17711.800781,17929.990234,133030000,17929.990234
2016-06-29,17456.019531,17704.509766,17456.019531,17694.679688,106380000,17694.679688
2016-06-28,17190.509766,17409.720703,17190.509766,17409.720703,112190000,17409.720703
2016-06-27,17355.210938,17355.210938,17063.080078,17140.240234,138740000,17140.240234


In [95]:
reddit_df

Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host
2,2016-07-01,"The president of France says if Brexit won, so..."
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...
...,...,...
73603,2008-06-08,b'Man goes berzerk in Akihabara and stabs ever...
73604,2008-06-08,b'Threat of world AIDS pandemic among heterose...
73605,2008-06-08,b'Angst in Ankara: Turkey Steers into a Danger...
73606,2008-06-08,"b""UK: Identity cards 'could be used to spy on ..."


## Feature Engineering

### DJIA Data

In [97]:
(djia_df['Close'] - djia_df['Adj Close']).unique()

array([0.])

In [98]:
djia_df.drop(['Adj Close'], axis=1, inplace=True)

In [99]:
for i in range(1, 6):
    djia_df[f'Open Minus {i}'] = djia_df['Open'].shift(i)
    djia_df[f'High Minus {i}'] = djia_df['High'].shift(i)
    djia_df[f'Low Minus {i}'] = djia_df['Low'].shift(i)
    djia_df[f'Close Minus {i}'] = djia_df['Close'].shift(i)

In [101]:
djia_df.dropna(inplace=True)

### Reddit Data

In [57]:
reddit_joined_df = reddit_df.groupby('Date').agg(join_news=('News', '<split>'.join))

reddit_joined_df['join_news'] = reddit_joined_df['join_news'].str.split('<split>')
reddit_joined_df = reddit_joined_df[reddit_joined_df['join_news'].str.len() == 25]

reddit_joined_df[[f'News {i}' for i in range(1, 26)]] = reddit_joined_df['join_news'].apply(pd.Series)

In [109]:
reddit_joined_df.head(2)

Unnamed: 0_level_0,join_news,News 1,News 2,News 3,News 4,News 5,News 6,News 7,News 8,News 9,...,News 16,News 17,News 18,News 19,News 20,News 21,News 22,News 23,News 24,News 25
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-06-08,[b'Nim Chimpsky: The tragedy of the chimp who ...,b'Nim Chimpsky: The tragedy of the chimp who t...,"b""Canada: Beware slippery slope' to censorship...",b'EU Vice-President Luisa Morgantini and the I...,"b""Israeli minister: Israel will attack Iran if...",b'Albino Killings in Tanzania. At least 19 alb...,b'Chiapas: army occupies Zapatista communities...,"b'Polar bear swims 200 miles, is shot dead upo...","b'News is a contraband item in Pakistan now, a...","b'Albinos, Long Shunned, Face Threat in Tanzan...",...,"b""S. Korean protesters, police clash in beef r...","b""Oil reserves 'will last decades' - a BBC Sco...",b'Cameras designed to detect terrorist facial ...,b'Israeli peace activists protest 41 years of ...,"b""A 5.1 earthquake hits China's Southern Qingh...",b'Man goes berzerk in Akihabara and stabs ever...,b'Threat of world AIDS pandemic among heterose...,b'Angst in Ankara: Turkey Steers into a Danger...,"b""UK: Identity cards 'could be used to spy on ...","b'Marriage, they said, was reduced to the stat..."
2008-06-09,"[b'United States quits Human Rights Council', ...",b'United States quits Human Rights Council',"b""Pentagon blocked Cheney's attack on Iran""","b""'J Street,' a new liberal Jewish organizatio...","b'Former Ambassador Joseph Wilson: ""[The U.S. ...",b'EU leaders anxiously await Irish verdict on ...,"b"" 17 Hit or Stabbed, 7 Confirmed Dead in Toky...",b'Treaty tensions mount as Iraq tells the US i...,"b""Council paints over street artist Banksy's g...","b'""Finders keepers"" can get complicated when h...",...,b'35 Ukraine miners trapped underground - thre...,b'Bilderberg attendee Geithner calls for globa...,"b'Jos Manuel Barroso bullies the Irish, saying...","b""Don't worry everyone, AIDS is officially over!""",b'What if Bush Attacks Iran on His Way Out of ...,b'Future of the United States of Europe in the...,"b""'Military coup' in Zimbabwe as Mugabe is for...",b'Rising Oil Prices Spark Strikes in Spain and...,"b'Chvez to FARC: Asks to end armed strugle, ""T...",b'Fliers in for pain as airlines pack it in'


### Join datasets

In [106]:
joined_df = djia_df.join(reddit_joined_df.drop(columns=['join_news']), how='inner')

In [114]:
joined_df.head(2)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Open Minus 1,High Minus 1,Low Minus 1,Close Minus 1,Open Minus 2,...,News 16,News 17,News 18,News 19,News 20,News 21,News 22,News 23,News 24,News 25
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-06-24,17946.630859,17946.630859,17356.339844,17400.75,239000000,17355.210938,17355.210938,17063.080078,17140.240234,17190.509766,...,Sinn Fein calls for a referendum on Irish reun...,$70 billion wiped off the Australian sharemark...,Nigel Farage disowns Vote Leave '350m for the ...,Top EU leader: we want Britain out as soon as ...,Nigel Farage: 350 million pledge to fund the N...,Thousands of London banking job cuts to start ...,Google says there was a large spike in searche...,EU referendum; Gibraltar backs Remain with 94%...,"After Brexit, U.K. Residents Google 'What Is T...",A Turkish man has been found guilty of insulti...
2016-06-23,17844.109375,18011.070312,17844.109375,18011.070312,98070000,17946.630859,17946.630859,17356.339844,17400.75,17355.210938,...,"Colombia, FARC announce full ceasefire, 'last ...",Gunmen kill Sufi devotional singer Amjad Sabri...,India launches 20 satellites in single mission,F-16s to be manufactured soon in an assembly l...,Australia's gun laws stopped mass shootings an...,French cement company in Syria buys oil from I...,Pope to visit Armenia after irking Turkey with...,Merkel says NATO must be strengthened,"China cracks down on online comments, click-ba...",The prime minister of India is set to get a br...


### Feature extraction (News Sentiment)

#### Didn't run this because it takes a long time to run

In [143]:
# sentiment_analyzer = pipeline('sentiment-analysis', framework='pt')

In [144]:
# joined_df['News Sentiment'] = joined_df['News 1'].apply(lambda x: sentiment_analyzer(x)[0]['label'])

### Final DataFrame

#### Dropping News information due to processing limitations

In [145]:
final_df = joined_df.drop(columns=[f'News {i}' for i in range(1, 26)])

#### I will exclusively be utilizing open data at the moment as I have opted to forecast the opening price of the next day based on historical data

In [147]:
final_df = final_df[['Open'] + [f'Open Minus {i}' for i in range(1, 6)]]

In [148]:
final_df.head(2)

Unnamed: 0_level_0,Open,Open Minus 1,Open Minus 2,Open Minus 3,Open Minus 4,Open Minus 5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-06-24,17946.630859,17355.210938,17190.509766,17456.019531,17712.759766,17924.240234
2016-06-23,17844.109375,17946.630859,17355.210938,17190.509766,17456.019531,17712.759766


# Training and Testing Model

## Split data

In [149]:
X_train, X_test, y_train, y_test = train_test_split(final_df.drop(columns=['Open']), final_df['Open'], test_size=0.2, random_state=42)

## Initialize Grid Search Fine Tuning

In [156]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [154]:
params = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3]
}

In [159]:
grid = GridSearchCV(XGBRegressor(), params, cv=cv, n_jobs=20, verbose=1)

## Train Model

In [160]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 175 candidates, totalling 875 fits


In [164]:
print(grid.best_params_)

{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}


## Test Model

In [161]:
y_pred = grid.predict(X_test)

In [163]:
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

MSE: 27196.295095825277
MAE: 112.1565230609257
R2: 0.9973087202448274
