In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import GridSearchCV

In [14]:
train_test_data = pd.read_csv('../../../../data/train_test_data/train_test_data_new_rolling_avg.csv')

## Neural Network Modelling

### Preprocessing

In [15]:
train_test_data.dropna(inplace=True)
train_test_data

Unnamed: 0,timestamp,rank,ticker,name,mentions,mentioning_users,upvotes,sentiment,rank_24h_ago,mentions_24h_ago,...,revenue_growth_ttm_yoy,dividend_exists,rank_percentage_change_24h,mentions_percentage_change_24h,opening_price,closing_price,label,sentiment_rolling_avg,upvotes_rolling_avg,mentioning_users_rolling_avg
0,2023-05-25,9,AAPL,Apple,90,71.0,441,58.0,8,45.0,...,-0.24000,1,0.125000,1.000000,172.41,172.99,1,58.000000,441.000000,71.000000
1,2023-05-29,10,AAPL,Apple,23,23.0,94,77.0,9,36.0,...,-0.24000,1,0.111111,-0.361111,173.32,175.43,1,77.000000,94.000000,23.000000
2,2023-05-30,9,AAPL,Apple,40,37.0,649,38.0,11,22.0,...,-0.24000,1,-0.181818,0.818182,176.96,177.30,1,57.500000,371.500000,30.000000
3,2023-06-01,8,AAPL,Apple,115,88.0,306,49.0,12,50.0,...,-0.24000,1,-0.333333,1.300000,177.70,180.09,1,43.500000,477.500000,62.500000
4,2023-06-02,5,AAPL,Apple,176,126.0,604,57.0,8,122.0,...,-0.24000,1,-0.375000,0.442623,181.03,180.95,0,53.000000,455.000000,107.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,2023-07-13,7,TSLA,Tesla,110,92.0,1301,51.0,4,117.0,...,38.34000,0,0.750000,-0.059829,274.59,277.90,1,59.333333,907.333333,110.666667
386,2023-07-14,6,TSLA,Tesla,148,109.0,906,63.0,5,110.0,...,38.34000,0,0.200000,0.345455,277.01,281.38,1,62.666667,967.000000,98.666667
387,2023-05-29,12,TSM,TSMC,20,19.0,81,100.0,24,13.0,...,32.95000,1,-0.500000,0.538462,100.85,103.21,1,100.000000,81.000000,19.000000
388,2023-05-26,10,ULTA,ULTA Beauty,60,42.0,181,66.0,183,6.0,...,18.28000,0,-0.945355,9.000000,436.11,420.27,0,66.000000,181.000000,42.000000


In [16]:
# Remove columns that are not needed for modelling

train_test_data = train_test_data.drop(
    ['name', 
     'ticker', 
     'timestamp',
    'opening_price',
   'closing_price',
   'rank_24h_ago',
   'mentions_24h_ago',
   'rank', 'dividend_exists'
    ], axis=1).sort_values(
    by=['mentions'], 
    ascending=False)

In [17]:
train_test_data

Unnamed: 0,mentions,mentioning_users,upvotes,sentiment,beta,earnings_per_share_ttm,price_to_equity_ttm,return_on_equity_ttm,dividend_yield_annual,total_debt_to_equity_quarterly,revenue_growth_ttm_yoy,rank_percentage_change_24h,mentions_percentage_change_24h,label,sentiment_rolling_avg,upvotes_rolling_avg,mentioning_users_rolling_avg
248,3216,1370.0,30860,57.0,1.770654,1.7415,173.5333,18.66000,0.052394,0.4956,0.2200,-0.666667,10.694545,0,57.000000,30860.0,1370.000000
45,1684,1076.0,10793,68.0,2.890432,-2.4261,0.0000,-22.98162,0.000000,0.0000,14.6000,0.000000,1.284939,0,71.333333,6439.0,656.666667
46,1401,856.0,8556,59.0,2.890432,-2.4261,0.0000,-22.98162,0.000000,0.0000,14.6000,0.000000,-0.170515,1,65.666667,8046.0,834.333333
41,1363,889.0,8328,68.0,2.707838,-2.4261,0.0000,-22.98162,0.000000,0.0000,14.6000,1.000000,3.168196,0,68.000000,8328.0,889.000000
249,1268,738.0,24699,61.0,1.770654,1.7415,214.7679,18.66000,0.042127,0.4956,0.2200,0.000000,-0.584262,1,59.000000,27779.5,1054.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,16,15.0,161,83.0,0.995169,2.0614,18.2810,18.24000,0.000000,1.6717,27.2400,-0.517241,0.600000,0,83.000000,161.0,15.000000
343,16,14.0,76,100.0,0.886050,8.1536,9.7048,13.41000,4.833334,3.5598,107.4333,-0.562500,1.666667,1,100.000000,76.0,14.000000
20,15,14.0,56,70.0,1.188162,5.8857,31.1836,165.72000,0.513369,1.7635,-0.2400,0.666667,-0.758065,1,68.500000,194.5,39.000000
122,12,13.0,110,0.0,0.683394,1.0993,3.2375,7.49000,0.000000,0.9927,62.1500,-0.714286,0.200000,1,0.000000,110.0,13.000000


In [18]:
# scale all data except ticker, name, timestamp
from sklearn.preprocessing import StandardScaler

columns_not_to_scale = ['ticker','timestamp', 'label']
columns_to_scale = [column for column in train_test_data.columns if column not in columns_not_to_scale]
print(columns_to_scale)

scaler = StandardScaler()
train_test_data.loc[:, columns_to_scale] = scaler.fit_transform(train_test_data.loc[:, columns_to_scale])

train_test_data

['mentions', 'mentioning_users', 'upvotes', 'sentiment', 'beta', 'earnings_per_share_ttm', 'price_to_equity_ttm', 'return_on_equity_ttm', 'dividend_yield_annual', 'total_debt_to_equity_quarterly', 'revenue_growth_ttm_yoy', 'rank_percentage_change_24h', 'mentions_percentage_change_24h', 'sentiment_rolling_avg', 'upvotes_rolling_avg', 'mentioning_users_rolling_avg']


Unnamed: 0,mentions,mentioning_users,upvotes,sentiment,beta,earnings_per_share_ttm,price_to_equity_ttm,return_on_equity_ttm,dividend_yield_annual,total_debt_to_equity_quarterly,revenue_growth_ttm_yoy,rank_percentage_change_24h,mentions_percentage_change_24h,label,sentiment_rolling_avg,upvotes_rolling_avg,mentioning_users_rolling_avg
248,12.450056,9.453167,10.960321,-0.694330,0.163635,-0.137332,0.938755,-0.136897,-0.430074,-0.252517,-0.247701,-0.743954,0.770740,0,-0.767180,11.522512,9.610597
45,6.250697,7.280629,3.576897,0.170479,1.572830,-0.821154,-0.570895,-0.973736,-0.464973,-0.364175,-0.210830,0.171092,-0.208750,0,0.448464,2.088181,4.266910
46,5.105515,5.654920,2.753818,-0.537092,1.572830,-0.821154,-0.570895,-0.973736,-0.464973,-0.364175,-0.210830,0.171092,-0.360255,1,-0.032139,2.708998,5.597837
41,4.951745,5.898776,2.669928,0.170479,1.343042,-0.821154,-0.570895,-0.973736,-0.464973,-0.364175,-0.210830,1.543661,-0.012713,0,0.165756,2.817940,6.007353
249,4.567320,4.782949,8.693451,-0.379854,0.163635,-0.137332,1.297474,-0.136897,-0.436913,-0.252517,-0.247701,0.171092,-0.403324,1,-0.597555,10.332452,7.243393
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,-0.498996,-0.559722,-0.335027,1.349764,-0.812280,-0.084843,-0.411860,-0.145338,-0.464973,0.012458,-0.178420,-0.538857,-0.280049,0,1.437942,-0.337139,-0.539911
343,-0.498996,-0.567112,-0.366301,2.686288,-0.949602,0.914769,-0.486468,-0.242402,2.754459,0.437846,0.027200,-0.600978,-0.169014,1,2.879753,-0.369976,-0.547403
20,-0.503043,-0.567112,-0.373660,0.327717,-0.569407,0.542651,-0.299614,2.818452,-0.123024,0.033141,-0.248880,1.086138,-0.421416,1,0.208162,-0.324197,-0.360124
122,-0.515183,-0.574501,-0.353791,-5.175614,-1.204636,-0.242704,-0.542730,-0.361372,-0.464973,-0.140520,-0.088909,-0.809314,-0.321687,1,-5.601486,-0.356841,-0.554894


### Initial Model

In [19]:
train_test_data.describe()

Unnamed: 0,mentions,mentioning_users,upvotes,sentiment,beta,earnings_per_share_ttm,price_to_equity_ttm,return_on_equity_ttm,dividend_yield_annual,total_debt_to_equity_quarterly,revenue_growth_ttm_yoy,rank_percentage_change_24h,mentions_percentage_change_24h,label,sentiment_rolling_avg,upvotes_rolling_avg,mentioning_users_rolling_avg
count,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0
mean,7.479397e-17,0.0,0.0,-4.955101e-16,5.048593e-16,3.739699e-17,-8.414322000000001e-17,0.0,-3.739699e-17,-3.739699e-17,-1.1686560000000002e-17,4.2071610000000005e-17,7.245666000000001e-17,0.552632,-3.552714e-16,1.869849e-17,3.739699e-17
std,1.001318,1.001318,1.001318,1.001318,1.001318,1.001318,1.001318,1.001318,1.001318,1.001318,1.001318,1.001318,1.001318,0.497878,1.001318,1.001318,1.001318
min,-0.5151827,-0.655787,-0.38617,-5.175614,-1.723019,-3.252385,-0.5708947,-1.494896,-0.4649734,-0.3641749,-0.3861345,-1.187368,-0.4224054,0.0,-5.601486,-0.3908373,-0.6372964
25%,-0.3907504,-0.434099,-0.336314,-0.6943301,-0.5901559,-0.4435226,-0.5135106,-0.463662,-0.4649734,-0.3516258,-0.2649186,-0.7127155,-0.350226,0.0,-0.5551492,-0.3317303,-0.4125619
50%,-0.272388,-0.271528,-0.271741,0.0132411,0.1010661,-0.1073544,-0.3010499,-0.208239,-0.4649734,-0.2635337,-0.2282398,-0.08236525,-0.2800488,1.0,-0.01800403,-0.2559149,-0.2664845
75%,-0.02048847,0.009276,-0.114447,0.583229,0.2933057,0.5415349,0.07787135,0.048792,-0.1074624,0.03314066,-0.1638947,0.3671734,-0.08715814,1.0,0.6463597,-0.1126705,-0.006791319
max,12.45006,9.453167,10.960321,2.686288,5.275618,6.036428,14.94485,2.990877,5.277178,5.899691,7.655599,9.779076,11.42019,1.0,2.879753,11.52251,9.610597


In [20]:
train_test_data.value_counts('label')

label
1    210
0    170
Name: count, dtype: int64

In [21]:
x_train, x_test, y_train, y_test = train_test_split(train_test_data.drop('label', axis=1), train_test_data['label'], test_size=0.35, random_state=42)

In [32]:
# scitkit learn neural network

from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100,100,100,100), max_iter=int(1e5))

mlp.fit(x_train, y_train)

predictions = mlp.predict(x_test)

print(confusion_matrix(y_test,predictions))

print(classification_report(y_test,predictions))

[[25 36]
 [28 44]]
              precision    recall  f1-score   support

           0       0.47      0.41      0.44        61
           1       0.55      0.61      0.58        72

    accuracy                           0.52       133
   macro avg       0.51      0.51      0.51       133
weighted avg       0.51      0.52      0.51       133



In [23]:
x_train

Unnamed: 0,mentions,mentioning_users,upvotes,sentiment,beta,earnings_per_share_ttm,price_to_equity_ttm,return_on_equity_ttm,dividend_yield_annual,total_debt_to_equity_quarterly,revenue_growth_ttm_yoy,rank_percentage_change_24h,mentions_percentage_change_24h,sentiment_rolling_avg,upvotes_rolling_avg,mentioning_users_rolling_avg
155,-0.470670,-0.507995,-0.340546,-0.772949,-1.237296,0.505224,-0.390784,-0.275762,1.762927,0.048979,-0.288059,0.857377,-0.392187,-0.851993,-0.342934,-0.487473
331,-0.260248,-0.264139,-0.322149,0.249098,0.422499,-0.466806,-0.570895,-0.540210,-0.464973,-0.115084,0.168657,0.399854,-0.357883,0.646360,-0.138441,-0.000549
33,-0.260248,-0.227191,-0.324724,1.742860,-0.585087,0.542651,-0.297278,2.818452,-0.125943,0.033141,-0.248880,1.269147,-0.347776,0.674630,-0.324648,-0.260242
202,0.225341,0.194016,-0.023015,-0.222616,-0.275035,0.672619,-0.123510,0.074716,-0.464973,-0.364175,-0.172292,-1.044612,0.681092,-0.258306,-0.009539,0.224186
293,-0.070059,0.038834,0.034016,-0.143997,0.111080,-0.443523,-0.570895,-0.655713,-0.464973,-0.364175,-0.195702,-0.613233,-0.151262,0.929068,-0.213774,-0.277721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,0.099897,0.038834,-0.201465,-1.166044,0.977232,-0.502821,-0.570895,-0.582417,-0.464973,-0.100327,-0.198522,-1.083152,0.691502,-1.276055,-0.196904,0.066872
287,-0.053873,-0.160685,-0.264014,0.249098,-0.834372,0.076170,-0.262794,2.990877,0.505207,3.259406,-0.210676,-1.029906,1.192891,0.250569,-0.262579,-0.135389
195,-0.373552,-0.441489,-0.351584,1.035288,-0.493710,-0.536097,-0.506763,-0.086054,0.450739,-0.248664,-0.318520,-0.779148,-0.258429,1.437942,-0.358966,-0.446272
225,-0.466624,-0.493216,-0.259231,0.091860,-0.532712,1.090696,-0.244659,0.261008,0.064271,-0.289330,-0.228240,-0.515192,-0.238411,0.250569,-0.267215,-0.483728


### Hyper-parameter tuning

In [24]:
# hyper parameter tuning through random search for the neural network

from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest

hidden_layer_sizes = [(10,10,10), (20,20,20), (30,30,30), (40,40,40), (50,50,50), 
                      (60,60,60), (70,70,70), (80,80,80), (90,90,90), 
                      (100,100,100)]

# Create the random grid

random_grid = {'hidden_layer_sizes': hidden_layer_sizes,
               'learning_rate': ['constant', 'invscaling', 'adaptive'],
                'solver': ['lbfgs', 'sgd', 'adam'],
                'max_iter': [int(1e5)],
                'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
                'momentum': [0.1, 0.2, 0.3, 0.4, 0.5],
                'nesterovs_momentum': [True, False],
                'early_stopping': [True, False],
                'validation_fraction': [0.1, 0.2, 0.3, 0.4, 0.5],
                'beta_1': [0.1, 0.2, 0.3, 0.4, 0.5],
                'beta_2': [0.1, 0.2, 0.3, 0.4, 0.5],
                'epsilon': [1e-08, 1e-07, 1e-06, 1e-05, 1e-04],
                'n_iter_no_change': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],               
               }

# Use the random grid to search for best hyperparameters

# First create the base model to tune

mlp = MLPClassifier()

# Random search of parameters, using 3 fold cross validation,

mlp_random = RandomizedSearchCV(estimator = mlp, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model

mlp_random.fit(x_train, y_train)

mlp_random.best_params_

best_random = mlp_random.best_estimator_

predictions = best_random.predict(x_test)

print(confusion_matrix(y_test,predictions))

print(classification_report(y_test,predictions))

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[[ 0 61]
 [ 0 72]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        61
           1       0.54      1.00      0.70        72

    accuracy                           0.54       133
   macro avg       0.27      0.50      0.35       133
weighted avg       0.29      0.54      0.38       133



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
mlp_random.best_params_

{'validation_fraction': 0.3,
 'solver': 'adam',
 'nesterovs_momentum': False,
 'n_iter_no_change': 90,
 'momentum': 0.3,
 'max_iter': 100000,
 'learning_rate': 'constant',
 'hidden_layer_sizes': (70, 70, 70),
 'epsilon': 0.0001,
 'early_stopping': False,
 'beta_2': 0.1,
 'beta_1': 0.5,
 'alpha': 10}

### Visuallizing feature importance

In [26]:
x_train.columns

Index(['mentions', 'mentioning_users', 'upvotes', 'sentiment', 'beta',
       'earnings_per_share_ttm', 'price_to_equity_ttm', 'return_on_equity_ttm',
       'dividend_yield_annual', 'total_debt_to_equity_quarterly',
       'revenue_growth_ttm_yoy', 'rank_percentage_change_24h',
       'mentions_percentage_change_24h', 'sentiment_rolling_avg',
       'upvotes_rolling_avg', 'mentioning_users_rolling_avg'],
      dtype='object')