# MODEL DEVELOPMENT

In [4]:
# Load the needed packages:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

## 1.0 Load data and create a Mastertable

In [2]:
# Load CSV files:
poloniex = pd.read_csv("Data/poloniex_data.csv")
ggtrends = pd.read_csv("Data/google_trends.csv")
twitter = pd.read_csv("Data/twitter_agg_ddb.csv")
reddit = pd.read_csv("Data/allreddit_nlp.csv")
forum = pd.read_csv("Data/merit_compound.csv")

In [33]:
# A bit of preprocessing:
# GOOGLE TRENDS:
ggtrends = ggtrends.loc[:, ~ggtrends.columns.str.contains('^Unnamed')]
# REDDIT:
reddit = reddit.loc[:, ~reddit.columns.str.contains('^Unnamed')]
reddit = reddit.rename(columns={'date_notime': 'date', 'count_comments': 'Reddit Comments (#)', 'mean_sa': 'Reddit Average SA'})
# TWITTER:
twitter = twitter.rename(columns={'created_at': 'date','Average SA': 'Twitter Average SA'})
twitter['date'] =  pd.to_datetime(twitter['date'], format='%Y%m%d %H:%M:%S')
twitter['date'] = twitter['date'].dt.date
twitter['date'] = twitter['date'].apply(str)

# FORUM:
forum = forum.loc[:, ~forum.columns.str.contains('^Unnamed')]
forum = forum.rename(columns={'newdate': 'date', 'compound': 'Forum SA Merit', 'merit_compound': 'Forum SA Merit (weighted)'})

In [34]:
# Merge the datasets into one Master Table:
master = pd.merge(poloniex, ggtrends, how = 'inner', on = 'date')
master = pd.merge(master, twitter, how = 'outer', on = 'date')
master = pd.merge(master, forum, how = 'inner', on = 'date')
master = pd.merge(master, reddit, how = 'inner', on = 'date')

In [35]:
# Create the UP/DOWN class:
master['invest'] = master['return_day+1']>0

In [36]:
master

Unnamed: 0,date,return_day+1,close,volume,googletrends_buy_sell,Tweets (#),Active Influencers (#),Twitter Average SA,Forum SA Merit,Forum SA Merit (weighted),Reddit Comments (#),Reddit Average SA,invest
0,2017-05-01,0.020854,1530.000000,2.003840e+07,0.857143,,,,0.363063,170.581733,1262,0.100726,True
1,2017-05-02,0.034389,1561.907000,1.157105e+07,0.857143,105.0,25.0,0.156365,0.324900,163.314061,1600,0.108506,True
2,2017-05-03,-0.007255,1615.620000,1.506086e+07,0.904762,86.0,30.0,0.207976,0.359885,177.341913,1353,0.110652,False
3,2017-05-04,-0.037720,1603.898572,2.632924e+07,0.851852,111.0,27.0,0.152614,0.380796,190.054485,1766,0.102435,False
4,2017-05-05,0.035510,1543.400000,3.239718e+07,0.833333,75.0,28.0,0.117646,0.381558,158.822512,1689,0.099547,True
5,2017-05-06,0.013731,1598.205817,2.139785e+07,0.880000,54.0,18.0,0.088421,0.301068,139.721539,1471,0.091877,True
6,2017-05-07,0.052788,1620.150000,3.042350e+07,0.900000,38.0,14.0,0.085202,0.389856,170.660980,1338,0.098016,True
7,2017-05-08,0.030979,1705.674354,3.568746e+07,0.851852,110.0,24.0,0.128361,0.441199,189.538708,1525,0.101982,True
8,2017-05-09,0.023592,1758.513940,3.103138e+07,0.892857,121.0,28.0,0.105434,0.350617,148.563778,1923,0.100268,True
9,2017-05-10,0.033391,1800.000000,1.959019e+07,0.925926,123.0,29.0,0.105670,0.475941,183.982314,1873,0.096796,True


# CatBoost Predictions

In [35]:
import catboost as cb

In [36]:
# load everything
train = pd.read_csv("Data/20180919_mastertable_train.csv")
validation = pd.read_csv("Data/20180919_mastertable_validation.csv")
test = pd.read_csv("Data/20180919_mastertable_test.csv")

# drop the index column
train,validation,test = train.drop(["Unnamed: 0","date","return_day+1"],axis=1),validation.drop(["Unnamed: 0","date","return_day+1"],axis=1),test.drop(["Unnamed: 0","date","return_day+1"],axis=1)
train

Unnamed: 0,close,volume,googletrends_buy_sell,Tweets (#),Active Influencers (#),Twitter Average SA,Forum SA Merit,Forum SA Merit (weighted),Reddit Comments (#),Reddit Average SA,invest
0,1530.000000,2.003840e+07,0.857143,,,,0.363063,170.581733,1262,0.100726,True
1,1561.907000,1.157105e+07,0.857143,105.0,25.0,0.156365,0.324900,163.314061,1600,0.108506,True
2,1615.620000,1.506086e+07,0.904762,86.0,30.0,0.207976,0.359885,177.341913,1353,0.110652,False
3,1603.898572,2.632924e+07,0.851852,111.0,27.0,0.152614,0.380796,190.054485,1766,0.102435,False
4,1543.400000,3.239718e+07,0.833333,75.0,28.0,0.117646,0.381558,158.822512,1689,0.099547,True
5,1598.205817,2.139785e+07,0.880000,54.0,18.0,0.088421,0.301068,139.721539,1471,0.091877,True
6,1620.150000,3.042350e+07,0.900000,38.0,14.0,0.085202,0.389856,170.660980,1338,0.098016,True
7,1705.674354,3.568746e+07,0.851852,110.0,24.0,0.128361,0.441199,189.538708,1525,0.101982,True
8,1758.513940,3.103138e+07,0.892857,121.0,28.0,0.105434,0.350617,148.563778,1923,0.100268,True
9,1800.000000,1.959019e+07,0.925926,123.0,29.0,0.105670,0.475941,183.982314,1873,0.096796,True


In [37]:
y_train = train["invest"]
x_train = train.drop(["invest"],axis=1)

y_validation = validation["invest"]
x_validation = validation.drop(["invest"],axis=1)

y_test = test["invest"]
x_test = test.drop(["invest"],axis=1)

In [58]:
model = cb.CatBoostClassifier(eval_metric='AUC')
model.fit(x_train, y_train, eval_set = (x_validation,y_validation))

Learning rate set to 0.068163


0:	test: 0.3794872	best: 0.3794872 (0)	total: 44.5ms	remaining: 44.5s
1:	test: 0.4282051	best: 0.4282051 (1)	total: 109ms	remaining: 54.2s
2:	test: 0.4461538	best: 0.4461538 (2)	total: 146ms	remaining: 48.5s
3:	test: 0.4435897	best: 0.4461538 (2)	total: 160ms	remaining: 39.9s
4:	test: 0.3769231	best: 0.4461538 (2)	total: 176ms	remaining: 34.9s
5:	test: 0.3794872	best: 0.4461538 (2)	total: 190ms	remaining: 31.5s
6:	test: 0.3487179	best: 0.4461538 (2)	total: 228ms	remaining: 32.4s
7:	test: 0.3589744	best: 0.4461538 (2)	total: 281ms	remaining: 34.9s
8:	test: 0.3743590	best: 0.4461538 (2)	total: 322ms	remaining: 35.4s
9:	test: 0.3487179	best: 0.4461538 (2)	total: 342ms	remaining: 33.9s
10:	test: 0.3846154	best: 0.4461538 (2)	total: 363ms	remaining: 32.7s
11:	test: 0.3564103	best: 0.4461538 (2)	total: 376ms	remaining: 31s
12:	test: 0.3564103	best: 0.4461538 (2)	total: 389ms	remaining: 29.5s
13:	test: 0.3615385	best: 0.4461538 (2)	total: 404ms	remaining: 28.5s
14:	test: 0.3615385	best: 0.446

122:	test: 0.3948718	best: 0.4461538 (2)	total: 3.09s	remaining: 22s
123:	test: 0.3948718	best: 0.4461538 (2)	total: 3.13s	remaining: 22.2s
124:	test: 0.3948718	best: 0.4461538 (2)	total: 3.16s	remaining: 22.1s
125:	test: 0.3948718	best: 0.4461538 (2)	total: 3.17s	remaining: 22s
126:	test: 0.3897436	best: 0.4461538 (2)	total: 3.19s	remaining: 22s
127:	test: 0.3794872	best: 0.4461538 (2)	total: 3.23s	remaining: 22s
128:	test: 0.3794872	best: 0.4461538 (2)	total: 3.3s	remaining: 22.3s
129:	test: 0.3743590	best: 0.4461538 (2)	total: 3.34s	remaining: 22.4s
130:	test: 0.3794872	best: 0.4461538 (2)	total: 3.38s	remaining: 22.4s
131:	test: 0.3846154	best: 0.4461538 (2)	total: 3.42s	remaining: 22.5s
132:	test: 0.3743590	best: 0.4461538 (2)	total: 3.45s	remaining: 22.5s
133:	test: 0.3846154	best: 0.4461538 (2)	total: 3.46s	remaining: 22.4s
134:	test: 0.3948718	best: 0.4461538 (2)	total: 3.48s	remaining: 22.3s
135:	test: 0.3897436	best: 0.4461538 (2)	total: 3.49s	remaining: 22.2s
136:	test: 0.38

242:	test: 0.4102564	best: 0.4461538 (2)	total: 6.51s	remaining: 20.3s
243:	test: 0.4102564	best: 0.4461538 (2)	total: 6.52s	remaining: 20.2s
244:	test: 0.4102564	best: 0.4461538 (2)	total: 6.58s	remaining: 20.3s
245:	test: 0.4102564	best: 0.4461538 (2)	total: 6.62s	remaining: 20.3s
246:	test: 0.4102564	best: 0.4461538 (2)	total: 6.65s	remaining: 20.3s
247:	test: 0.4102564	best: 0.4461538 (2)	total: 6.67s	remaining: 20.2s
248:	test: 0.4102564	best: 0.4461538 (2)	total: 6.69s	remaining: 20.2s
249:	test: 0.4102564	best: 0.4461538 (2)	total: 6.73s	remaining: 20.2s
250:	test: 0.4102564	best: 0.4461538 (2)	total: 6.78s	remaining: 20.2s
251:	test: 0.4102564	best: 0.4461538 (2)	total: 6.8s	remaining: 20.2s
252:	test: 0.4102564	best: 0.4461538 (2)	total: 6.82s	remaining: 20.1s
253:	test: 0.4102564	best: 0.4461538 (2)	total: 6.83s	remaining: 20.1s
254:	test: 0.4051282	best: 0.4461538 (2)	total: 6.84s	remaining: 20s
255:	test: 0.4205128	best: 0.4461538 (2)	total: 6.85s	remaining: 19.9s
256:	test

365:	test: 0.3948718	best: 0.4461538 (2)	total: 9.47s	remaining: 16.4s
366:	test: 0.4051282	best: 0.4461538 (2)	total: 9.52s	remaining: 16.4s
367:	test: 0.4051282	best: 0.4461538 (2)	total: 9.63s	remaining: 16.5s
368:	test: 0.4051282	best: 0.4461538 (2)	total: 9.84s	remaining: 16.8s
369:	test: 0.4051282	best: 0.4461538 (2)	total: 9.89s	remaining: 16.8s
370:	test: 0.4102564	best: 0.4461538 (2)	total: 9.91s	remaining: 16.8s
371:	test: 0.4205128	best: 0.4461538 (2)	total: 9.92s	remaining: 16.8s
372:	test: 0.4205128	best: 0.4461538 (2)	total: 9.94s	remaining: 16.7s
373:	test: 0.4205128	best: 0.4461538 (2)	total: 9.95s	remaining: 16.6s
374:	test: 0.4205128	best: 0.4461538 (2)	total: 9.96s	remaining: 16.6s
375:	test: 0.4205128	best: 0.4461538 (2)	total: 9.97s	remaining: 16.6s
376:	test: 0.4205128	best: 0.4461538 (2)	total: 9.99s	remaining: 16.5s
377:	test: 0.4205128	best: 0.4461538 (2)	total: 10s	remaining: 16.5s
378:	test: 0.4205128	best: 0.4461538 (2)	total: 10s	remaining: 16.4s
379:	test:

484:	test: 0.3948718	best: 0.4461538 (2)	total: 12.9s	remaining: 13.7s
485:	test: 0.3846154	best: 0.4461538 (2)	total: 12.9s	remaining: 13.6s
486:	test: 0.3794872	best: 0.4461538 (2)	total: 12.9s	remaining: 13.6s
487:	test: 0.3846154	best: 0.4461538 (2)	total: 12.9s	remaining: 13.6s
488:	test: 0.3846154	best: 0.4461538 (2)	total: 12.9s	remaining: 13.5s
489:	test: 0.3846154	best: 0.4461538 (2)	total: 12.9s	remaining: 13.5s
490:	test: 0.3846154	best: 0.4461538 (2)	total: 13s	remaining: 13.4s
491:	test: 0.3846154	best: 0.4461538 (2)	total: 13s	remaining: 13.4s
492:	test: 0.3897436	best: 0.4461538 (2)	total: 13s	remaining: 13.3s
493:	test: 0.3897436	best: 0.4461538 (2)	total: 13s	remaining: 13.3s
494:	test: 0.3897436	best: 0.4461538 (2)	total: 13s	remaining: 13.3s
495:	test: 0.3846154	best: 0.4461538 (2)	total: 13s	remaining: 13.2s
496:	test: 0.3846154	best: 0.4461538 (2)	total: 13s	remaining: 13.2s
497:	test: 0.3897436	best: 0.4461538 (2)	total: 13s	remaining: 13.1s
498:	test: 0.3897436	b

602:	test: 0.3794872	best: 0.4461538 (2)	total: 14.6s	remaining: 9.61s
603:	test: 0.3794872	best: 0.4461538 (2)	total: 14.6s	remaining: 9.58s
604:	test: 0.3794872	best: 0.4461538 (2)	total: 14.6s	remaining: 9.56s
605:	test: 0.3794872	best: 0.4461538 (2)	total: 14.7s	remaining: 9.53s
606:	test: 0.3794872	best: 0.4461538 (2)	total: 14.7s	remaining: 9.5s
607:	test: 0.3794872	best: 0.4461538 (2)	total: 14.7s	remaining: 9.47s
608:	test: 0.3846154	best: 0.4461538 (2)	total: 14.7s	remaining: 9.44s
609:	test: 0.3846154	best: 0.4461538 (2)	total: 14.7s	remaining: 9.41s
610:	test: 0.3846154	best: 0.4461538 (2)	total: 14.7s	remaining: 9.37s
611:	test: 0.3846154	best: 0.4461538 (2)	total: 14.7s	remaining: 9.34s
612:	test: 0.3794872	best: 0.4461538 (2)	total: 14.7s	remaining: 9.31s
613:	test: 0.3794872	best: 0.4461538 (2)	total: 14.8s	remaining: 9.28s
614:	test: 0.3794872	best: 0.4461538 (2)	total: 14.8s	remaining: 9.25s
615:	test: 0.3794872	best: 0.4461538 (2)	total: 14.8s	remaining: 9.22s
616:	te

725:	test: 0.3897436	best: 0.4461538 (2)	total: 16.5s	remaining: 6.23s
726:	test: 0.3897436	best: 0.4461538 (2)	total: 16.7s	remaining: 6.27s
727:	test: 0.3897436	best: 0.4461538 (2)	total: 16.8s	remaining: 6.26s
728:	test: 0.3897436	best: 0.4461538 (2)	total: 16.8s	remaining: 6.25s
729:	test: 0.3897436	best: 0.4461538 (2)	total: 16.8s	remaining: 6.22s
730:	test: 0.3897436	best: 0.4461538 (2)	total: 16.8s	remaining: 6.19s
731:	test: 0.3897436	best: 0.4461538 (2)	total: 16.8s	remaining: 6.17s
732:	test: 0.3897436	best: 0.4461538 (2)	total: 16.9s	remaining: 6.14s
733:	test: 0.3897436	best: 0.4461538 (2)	total: 16.9s	remaining: 6.11s
734:	test: 0.3897436	best: 0.4461538 (2)	total: 16.9s	remaining: 6.09s
735:	test: 0.3897436	best: 0.4461538 (2)	total: 16.9s	remaining: 6.06s
736:	test: 0.3897436	best: 0.4461538 (2)	total: 16.9s	remaining: 6.03s
737:	test: 0.3948718	best: 0.4461538 (2)	total: 16.9s	remaining: 6.01s
738:	test: 0.3897436	best: 0.4461538 (2)	total: 16.9s	remaining: 5.98s
739:	t

853:	test: 0.3794872	best: 0.4461538 (2)	total: 18.7s	remaining: 3.2s
854:	test: 0.3794872	best: 0.4461538 (2)	total: 18.7s	remaining: 3.17s
855:	test: 0.3794872	best: 0.4461538 (2)	total: 18.7s	remaining: 3.15s
856:	test: 0.3794872	best: 0.4461538 (2)	total: 18.8s	remaining: 3.13s
857:	test: 0.3794872	best: 0.4461538 (2)	total: 18.8s	remaining: 3.11s
858:	test: 0.3794872	best: 0.4461538 (2)	total: 18.8s	remaining: 3.08s
859:	test: 0.3794872	best: 0.4461538 (2)	total: 18.8s	remaining: 3.06s
860:	test: 0.3794872	best: 0.4461538 (2)	total: 18.8s	remaining: 3.04s
861:	test: 0.3794872	best: 0.4461538 (2)	total: 18.8s	remaining: 3.01s
862:	test: 0.3794872	best: 0.4461538 (2)	total: 18.8s	remaining: 2.99s
863:	test: 0.3794872	best: 0.4461538 (2)	total: 18.8s	remaining: 2.97s
864:	test: 0.3794872	best: 0.4461538 (2)	total: 18.9s	remaining: 2.94s
865:	test: 0.3794872	best: 0.4461538 (2)	total: 18.9s	remaining: 2.92s
866:	test: 0.3794872	best: 0.4461538 (2)	total: 18.9s	remaining: 2.9s
867:	tes

973:	test: 0.3846154	best: 0.4461538 (2)	total: 20.9s	remaining: 559ms
974:	test: 0.3846154	best: 0.4461538 (2)	total: 21.1s	remaining: 541ms
975:	test: 0.3846154	best: 0.4461538 (2)	total: 21.2s	remaining: 521ms
976:	test: 0.3846154	best: 0.4461538 (2)	total: 21.3s	remaining: 501ms
977:	test: 0.3846154	best: 0.4461538 (2)	total: 21.3s	remaining: 480ms
978:	test: 0.3846154	best: 0.4461538 (2)	total: 21.4s	remaining: 459ms
979:	test: 0.3846154	best: 0.4461538 (2)	total: 21.5s	remaining: 440ms
980:	test: 0.3846154	best: 0.4461538 (2)	total: 21.7s	remaining: 421ms
981:	test: 0.3846154	best: 0.4461538 (2)	total: 21.8s	remaining: 399ms
982:	test: 0.3846154	best: 0.4461538 (2)	total: 21.8s	remaining: 377ms
983:	test: 0.3846154	best: 0.4461538 (2)	total: 22s	remaining: 358ms
984:	test: 0.3846154	best: 0.4461538 (2)	total: 22.1s	remaining: 337ms
985:	test: 0.3846154	best: 0.4461538 (2)	total: 22.2s	remaining: 315ms
986:	test: 0.3846154	best: 0.4461538 (2)	total: 22.2s	remaining: 292ms
987:	tes

<catboost.core.CatBoostClassifier at 0x113825748>