# MODEL DEVELOPMENT

In [4]:
# Load the needed packages:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

## 1.0 Load data and create a Mastertable

In [2]:
# Load CSV files:
poloniex = pd.read_csv("Data/poloniex_data.csv")
ggtrends = pd.read_csv("Data/google_trends.csv")
twitter = pd.read_csv("Data/twitter_agg_ddb.csv")
reddit = pd.read_csv("Data/allreddit_nlp.csv")
forum = pd.read_csv("Data/merit_compound.csv")

In [33]:
# A bit of preprocessing:
# GOOGLE TRENDS:
ggtrends = ggtrends.loc[:, ~ggtrends.columns.str.contains('^Unnamed')]
# REDDIT:
reddit = reddit.loc[:, ~reddit.columns.str.contains('^Unnamed')]
reddit = reddit.rename(columns={'date_notime': 'date', 'count_comments': 'Reddit Comments (#)', 'mean_sa': 'Reddit Average SA'})
# TWITTER:
twitter = twitter.rename(columns={'created_at': 'date','Average SA': 'Twitter Average SA'})
twitter['date'] =  pd.to_datetime(twitter['date'], format='%Y%m%d %H:%M:%S')
twitter['date'] = twitter['date'].dt.date
twitter['date'] = twitter['date'].apply(str)

# FORUM:
forum = forum.loc[:, ~forum.columns.str.contains('^Unnamed')]
forum = forum.rename(columns={'newdate': 'date', 'compound': 'Forum SA Merit', 'merit_compound': 'Forum SA Merit (weighted)'})

In [34]:
# Merge the datasets into one Master Table:
master = pd.merge(poloniex, ggtrends, how = 'inner', on = 'date')
master = pd.merge(master, twitter, how = 'outer', on = 'date')
master = pd.merge(master, forum, how = 'inner', on = 'date')
master = pd.merge(master, reddit, how = 'inner', on = 'date')

In [35]:
# Create the UP/DOWN class:
master['invest'] = master['return_day+1']>0

In [36]:
master

Unnamed: 0,date,return_day+1,close,volume,googletrends_buy_sell,Tweets (#),Active Influencers (#),Twitter Average SA,Forum SA Merit,Forum SA Merit (weighted),Reddit Comments (#),Reddit Average SA,invest
0,2017-05-01,0.020854,1530.000000,2.003840e+07,0.857143,,,,0.363063,170.581733,1262,0.100726,True
1,2017-05-02,0.034389,1561.907000,1.157105e+07,0.857143,105.0,25.0,0.156365,0.324900,163.314061,1600,0.108506,True
2,2017-05-03,-0.007255,1615.620000,1.506086e+07,0.904762,86.0,30.0,0.207976,0.359885,177.341913,1353,0.110652,False
3,2017-05-04,-0.037720,1603.898572,2.632924e+07,0.851852,111.0,27.0,0.152614,0.380796,190.054485,1766,0.102435,False
4,2017-05-05,0.035510,1543.400000,3.239718e+07,0.833333,75.0,28.0,0.117646,0.381558,158.822512,1689,0.099547,True
5,2017-05-06,0.013731,1598.205817,2.139785e+07,0.880000,54.0,18.0,0.088421,0.301068,139.721539,1471,0.091877,True
6,2017-05-07,0.052788,1620.150000,3.042350e+07,0.900000,38.0,14.0,0.085202,0.389856,170.660980,1338,0.098016,True
7,2017-05-08,0.030979,1705.674354,3.568746e+07,0.851852,110.0,24.0,0.128361,0.441199,189.538708,1525,0.101982,True
8,2017-05-09,0.023592,1758.513940,3.103138e+07,0.892857,121.0,28.0,0.105434,0.350617,148.563778,1923,0.100268,True
9,2017-05-10,0.033391,1800.000000,1.959019e+07,0.925926,123.0,29.0,0.105670,0.475941,183.982314,1873,0.096796,True


# CatBoost Predictions

In [35]:
import catboost as cb

In [100]:
# load everything
train = pd.read_csv("Data/20180919_mastertable_train.csv")
validation = pd.read_csv("Data/20180919_mastertable_validation.csv")
test = pd.read_csv("Data/20180919_mastertable_test.csv")

# drop the index column
train,validation,test = train.drop(["Unnamed: 0","date","return_day+1"],axis=1),validation.drop(["Unnamed: 0","date","return_day+1"],axis=1),test.drop(["Unnamed: 0","date","return_day+1"],axis=1)
train

Unnamed: 0,close,volume,googletrends_buy_sell,Tweets (#),Active Influencers (#),Twitter Average SA,Forum SA Merit,Forum SA Merit (weighted),Reddit Comments (#),Reddit Average SA,invest
0,1530.000000,2.003840e+07,0.857143,,,,0.363063,170.581733,1262,0.100726,True
1,1561.907000,1.157105e+07,0.857143,105.0,25.0,0.156365,0.324900,163.314061,1600,0.108506,True
2,1615.620000,1.506086e+07,0.904762,86.0,30.0,0.207976,0.359885,177.341913,1353,0.110652,False
3,1603.898572,2.632924e+07,0.851852,111.0,27.0,0.152614,0.380796,190.054485,1766,0.102435,False
4,1543.400000,3.239718e+07,0.833333,75.0,28.0,0.117646,0.381558,158.822512,1689,0.099547,True
5,1598.205817,2.139785e+07,0.880000,54.0,18.0,0.088421,0.301068,139.721539,1471,0.091877,True
6,1620.150000,3.042350e+07,0.900000,38.0,14.0,0.085202,0.389856,170.660980,1338,0.098016,True
7,1705.674354,3.568746e+07,0.851852,110.0,24.0,0.128361,0.441199,189.538708,1525,0.101982,True
8,1758.513940,3.103138e+07,0.892857,121.0,28.0,0.105434,0.350617,148.563778,1923,0.100268,True
9,1800.000000,1.959019e+07,0.925926,123.0,29.0,0.105670,0.475941,183.982314,1873,0.096796,True


In [123]:
y_train = train["invest"]
x_train = train.drop(["invest"],axis=1)

y_validation = validation["invest"]
x_validation = validation.drop(["invest"],axis=1)

y_test = test["invest"]
x_test = test.drop(["invest"],axis=1)

In [128]:
model = cb.CatBoostClassifier(eval_metric='AUC',random_seed = 2)
model.fit(x_train, y_train, eval_set = (x_validation,y_validation))

Learning rate set to 0.068163


0:	test: 0.3461538	best: 0.3461538 (0)	total: 45.4ms	remaining: 45.4s
1:	test: 0.2974359	best: 0.3461538 (0)	total: 101ms	remaining: 50.4s
2:	test: 0.3205128	best: 0.3461538 (0)	total: 144ms	remaining: 47.8s
3:	test: 0.3487179	best: 0.3487179 (3)	total: 174ms	remaining: 43.3s
4:	test: 0.3179487	best: 0.3487179 (3)	total: 186ms	remaining: 37.1s
5:	test: 0.3923077	best: 0.3923077 (5)	total: 212ms	remaining: 35.2s
6:	test: 0.3871795	best: 0.3923077 (5)	total: 261ms	remaining: 37.1s
7:	test: 0.4435897	best: 0.4435897 (7)	total: 313ms	remaining: 38.8s
8:	test: 0.4205128	best: 0.4435897 (7)	total: 333ms	remaining: 36.6s
9:	test: 0.4000000	best: 0.4435897 (7)	total: 357ms	remaining: 35.4s
10:	test: 0.4256410	best: 0.4435897 (7)	total: 374ms	remaining: 33.6s
11:	test: 0.4410256	best: 0.4435897 (7)	total: 387ms	remaining: 31.8s
12:	test: 0.4512821	best: 0.4512821 (12)	total: 402ms	remaining: 30.5s
13:	test: 0.4717949	best: 0.4717949 (13)	total: 416ms	remaining: 29.3s
14:	test: 0.4666667	best: 0

118:	test: 0.5128205	best: 0.5282051 (106)	total: 2.66s	remaining: 19.7s
119:	test: 0.5128205	best: 0.5282051 (106)	total: 2.71s	remaining: 19.9s
120:	test: 0.5076923	best: 0.5282051 (106)	total: 2.73s	remaining: 19.9s
121:	test: 0.5076923	best: 0.5282051 (106)	total: 2.75s	remaining: 19.8s
122:	test: 0.5076923	best: 0.5282051 (106)	total: 2.76s	remaining: 19.7s
123:	test: 0.4974359	best: 0.5282051 (106)	total: 2.77s	remaining: 19.6s
124:	test: 0.4974359	best: 0.5282051 (106)	total: 2.79s	remaining: 19.5s
125:	test: 0.4974359	best: 0.5282051 (106)	total: 2.8s	remaining: 19.4s
126:	test: 0.4974359	best: 0.5282051 (106)	total: 2.81s	remaining: 19.3s
127:	test: 0.5025641	best: 0.5282051 (106)	total: 2.82s	remaining: 19.2s
128:	test: 0.5025641	best: 0.5282051 (106)	total: 2.84s	remaining: 19.2s
129:	test: 0.5076923	best: 0.5282051 (106)	total: 2.85s	remaining: 19.1s
130:	test: 0.5076923	best: 0.5282051 (106)	total: 2.86s	remaining: 19s
131:	test: 0.4974359	best: 0.5282051 (106)	total: 2.91

232:	test: 0.4769231	best: 0.5282051 (106)	total: 4.49s	remaining: 14.8s
233:	test: 0.4666667	best: 0.5282051 (106)	total: 4.53s	remaining: 14.8s
234:	test: 0.4717949	best: 0.5282051 (106)	total: 4.54s	remaining: 14.8s
235:	test: 0.4717949	best: 0.5282051 (106)	total: 4.55s	remaining: 14.7s
236:	test: 0.4666667	best: 0.5282051 (106)	total: 4.56s	remaining: 14.7s
237:	test: 0.4615385	best: 0.5282051 (106)	total: 4.58s	remaining: 14.6s
238:	test: 0.4666667	best: 0.5282051 (106)	total: 4.59s	remaining: 14.6s
239:	test: 0.4666667	best: 0.5282051 (106)	total: 4.6s	remaining: 14.6s
240:	test: 0.4666667	best: 0.5282051 (106)	total: 4.61s	remaining: 14.5s
241:	test: 0.4820513	best: 0.5282051 (106)	total: 4.63s	remaining: 14.5s
242:	test: 0.4820513	best: 0.5282051 (106)	total: 4.64s	remaining: 14.5s
243:	test: 0.4769231	best: 0.5282051 (106)	total: 4.66s	remaining: 14.4s
244:	test: 0.4769231	best: 0.5282051 (106)	total: 4.67s	remaining: 14.4s
245:	test: 0.4820513	best: 0.5282051 (106)	total: 4.

353:	test: 0.5076923	best: 0.5282051 (106)	total: 6.63s	remaining: 12.1s
354:	test: 0.5076923	best: 0.5282051 (106)	total: 6.68s	remaining: 12.1s
355:	test: 0.5076923	best: 0.5282051 (106)	total: 6.71s	remaining: 12.1s
356:	test: 0.5128205	best: 0.5282051 (106)	total: 6.76s	remaining: 12.2s
357:	test: 0.5230769	best: 0.5282051 (106)	total: 6.8s	remaining: 12.2s
358:	test: 0.4974359	best: 0.5282051 (106)	total: 6.81s	remaining: 12.2s
359:	test: 0.5025641	best: 0.5282051 (106)	total: 6.84s	remaining: 12.2s
360:	test: 0.5076923	best: 0.5282051 (106)	total: 6.88s	remaining: 12.2s
361:	test: 0.5076923	best: 0.5282051 (106)	total: 6.9s	remaining: 12.2s
362:	test: 0.5076923	best: 0.5282051 (106)	total: 6.92s	remaining: 12.1s
363:	test: 0.5076923	best: 0.5282051 (106)	total: 6.93s	remaining: 12.1s
364:	test: 0.4974359	best: 0.5282051 (106)	total: 6.94s	remaining: 12.1s
365:	test: 0.4974359	best: 0.5282051 (106)	total: 6.95s	remaining: 12s
366:	test: 0.4923077	best: 0.5282051 (106)	total: 6.99s

468:	test: 0.4717949	best: 0.5282051 (106)	total: 9.23s	remaining: 10.4s
469:	test: 0.4666667	best: 0.5282051 (106)	total: 9.25s	remaining: 10.4s
470:	test: 0.4666667	best: 0.5282051 (106)	total: 9.28s	remaining: 10.4s
471:	test: 0.4717949	best: 0.5282051 (106)	total: 9.3s	remaining: 10.4s
472:	test: 0.4666667	best: 0.5282051 (106)	total: 9.33s	remaining: 10.4s
473:	test: 0.4666667	best: 0.5282051 (106)	total: 9.34s	remaining: 10.4s
474:	test: 0.4666667	best: 0.5282051 (106)	total: 9.35s	remaining: 10.3s
475:	test: 0.4717949	best: 0.5282051 (106)	total: 9.37s	remaining: 10.3s
476:	test: 0.4666667	best: 0.5282051 (106)	total: 9.38s	remaining: 10.3s
477:	test: 0.4666667	best: 0.5282051 (106)	total: 9.39s	remaining: 10.3s
478:	test: 0.4666667	best: 0.5282051 (106)	total: 9.4s	remaining: 10.2s
479:	test: 0.4666667	best: 0.5282051 (106)	total: 9.42s	remaining: 10.2s
480:	test: 0.4666667	best: 0.5282051 (106)	total: 9.45s	remaining: 10.2s
481:	test: 0.4666667	best: 0.5282051 (106)	total: 9.4

586:	test: 0.4820513	best: 0.5282051 (106)	total: 11.6s	remaining: 8.15s
587:	test: 0.4769231	best: 0.5282051 (106)	total: 11.6s	remaining: 8.12s
588:	test: 0.4717949	best: 0.5282051 (106)	total: 11.6s	remaining: 8.1s
589:	test: 0.4820513	best: 0.5282051 (106)	total: 11.6s	remaining: 8.09s
590:	test: 0.4769231	best: 0.5282051 (106)	total: 11.7s	remaining: 8.09s
591:	test: 0.4820513	best: 0.5282051 (106)	total: 11.7s	remaining: 8.08s
592:	test: 0.4820513	best: 0.5282051 (106)	total: 11.8s	remaining: 8.07s
593:	test: 0.4820513	best: 0.5282051 (106)	total: 11.8s	remaining: 8.06s
594:	test: 0.4871795	best: 0.5282051 (106)	total: 11.9s	remaining: 8.07s
595:	test: 0.4820513	best: 0.5282051 (106)	total: 11.9s	remaining: 8.05s
596:	test: 0.4769231	best: 0.5282051 (106)	total: 11.9s	remaining: 8.05s
597:	test: 0.4820513	best: 0.5282051 (106)	total: 12s	remaining: 8.03s
598:	test: 0.4717949	best: 0.5282051 (106)	total: 12s	remaining: 8.02s
599:	test: 0.4871795	best: 0.5282051 (106)	total: 12s	re

701:	test: 0.4717949	best: 0.5282051 (106)	total: 14.2s	remaining: 6.04s
702:	test: 0.4717949	best: 0.5282051 (106)	total: 14.3s	remaining: 6.03s
703:	test: 0.4769231	best: 0.5282051 (106)	total: 14.3s	remaining: 6.01s
704:	test: 0.4769231	best: 0.5282051 (106)	total: 14.3s	remaining: 6s
705:	test: 0.4666667	best: 0.5282051 (106)	total: 14.3s	remaining: 5.97s
706:	test: 0.4666667	best: 0.5282051 (106)	total: 14.4s	remaining: 5.96s
707:	test: 0.4717949	best: 0.5282051 (106)	total: 14.4s	remaining: 5.94s
708:	test: 0.4717949	best: 0.5282051 (106)	total: 14.4s	remaining: 5.92s
709:	test: 0.4769231	best: 0.5282051 (106)	total: 14.5s	remaining: 5.91s
710:	test: 0.4769231	best: 0.5282051 (106)	total: 14.5s	remaining: 5.9s
711:	test: 0.4769231	best: 0.5282051 (106)	total: 14.6s	remaining: 5.89s
712:	test: 0.4769231	best: 0.5282051 (106)	total: 14.6s	remaining: 5.88s
713:	test: 0.4769231	best: 0.5282051 (106)	total: 14.6s	remaining: 5.86s
714:	test: 0.4769231	best: 0.5282051 (106)	total: 14.7s

815:	test: 0.4769231	best: 0.5282051 (106)	total: 17s	remaining: 3.84s
816:	test: 0.4769231	best: 0.5282051 (106)	total: 17.1s	remaining: 3.83s
817:	test: 0.4769231	best: 0.5282051 (106)	total: 17.1s	remaining: 3.81s
818:	test: 0.4717949	best: 0.5282051 (106)	total: 17.1s	remaining: 3.79s
819:	test: 0.4769231	best: 0.5282051 (106)	total: 17.2s	remaining: 3.77s
820:	test: 0.4717949	best: 0.5282051 (106)	total: 17.2s	remaining: 3.75s
821:	test: 0.4717949	best: 0.5282051 (106)	total: 17.3s	remaining: 3.74s
822:	test: 0.4769231	best: 0.5282051 (106)	total: 17.3s	remaining: 3.72s
823:	test: 0.4769231	best: 0.5282051 (106)	total: 17.3s	remaining: 3.7s
824:	test: 0.4769231	best: 0.5282051 (106)	total: 17.3s	remaining: 3.68s
825:	test: 0.4769231	best: 0.5282051 (106)	total: 17.4s	remaining: 3.66s
826:	test: 0.4769231	best: 0.5282051 (106)	total: 17.4s	remaining: 3.63s
827:	test: 0.4769231	best: 0.5282051 (106)	total: 17.4s	remaining: 3.61s
828:	test: 0.4769231	best: 0.5282051 (106)	total: 17.4

928:	test: 0.4717949	best: 0.5282051 (106)	total: 19.9s	remaining: 1.52s
929:	test: 0.4717949	best: 0.5282051 (106)	total: 19.9s	remaining: 1.5s
930:	test: 0.4717949	best: 0.5282051 (106)	total: 19.9s	remaining: 1.48s
931:	test: 0.4717949	best: 0.5282051 (106)	total: 19.9s	remaining: 1.45s
932:	test: 0.4717949	best: 0.5282051 (106)	total: 19.9s	remaining: 1.43s
933:	test: 0.4717949	best: 0.5282051 (106)	total: 20s	remaining: 1.41s
934:	test: 0.4717949	best: 0.5282051 (106)	total: 20s	remaining: 1.39s
935:	test: 0.4717949	best: 0.5282051 (106)	total: 20s	remaining: 1.37s
936:	test: 0.4717949	best: 0.5282051 (106)	total: 20s	remaining: 1.34s
937:	test: 0.4717949	best: 0.5282051 (106)	total: 20s	remaining: 1.32s
938:	test: 0.4717949	best: 0.5282051 (106)	total: 20s	remaining: 1.3s
939:	test: 0.4717949	best: 0.5282051 (106)	total: 20s	remaining: 1.28s
940:	test: 0.4717949	best: 0.5282051 (106)	total: 20.1s	remaining: 1.26s
941:	test: 0.4666667	best: 0.5282051 (106)	total: 20.1s	remaining: 

<catboost.core.CatBoostClassifier at 0x1a192dd940>

In [129]:
preds_proba = model.predict(x_test, prediction_type='Probability')

In [130]:
preds_proba

array([[0.6838835 , 0.3161165 ],
       [0.667363  , 0.332637  ],
       [0.53765668, 0.46234332],
       [0.62487125, 0.37512875],
       [0.64999491, 0.35000509],
       [0.6080733 , 0.3919267 ],
       [0.50747763, 0.49252237],
       [0.70091441, 0.29908559],
       [0.41710815, 0.58289185],
       [0.77642331, 0.22357669],
       [0.80425975, 0.19574025],
       [0.52876998, 0.47123002],
       [0.7030624 , 0.2969376 ],
       [0.59808015, 0.40191985],
       [0.64611049, 0.35388951],
       [0.54050519, 0.45949481],
       [0.73869421, 0.26130579],
       [0.74861896, 0.25138104],
       [0.63969354, 0.36030646],
       [0.67403663, 0.32596337],
       [0.63469014, 0.36530986],
       [0.63707993, 0.36292007],
       [0.65814328, 0.34185672],
       [0.54432085, 0.45567915],
       [0.63740661, 0.36259339],
       [0.68442622, 0.31557378],
       [0.54494798, 0.45505202],
       [0.59454372, 0.40545628],
       [0.58151975, 0.41848025],
       [0.68235048, 0.31764952],
       [0.

In [131]:
import sklearn
from sklearn import metrics
sklearn.metrics.roc_auc_score(y_test, preds_proba[:,0])

0.4579741379310345