In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import GridSearchCV

## Neural Network Modelling

### Initial Model

In [2]:
train_test_data = pd.read_csv(r'E:\my-projs\changing-history-wall-street-sentiments\wall-street-sentiments\experimentation\data\raw_data\data.csv')

In [3]:
train_test_data.dropna(inplace=True)
train_test_data

Unnamed: 0,timestamp,rank,ticker,name,mentions,mentioning_users,upvotes,sentiment,rank_24h_ago,mentions_24h_ago,...,return_on_equity_ttm,dividend_yield_annual,total_debt_to_equity_quarterly,revenue_growth_ttm_yoy,dividend_exists,rank_percentage_change_24h,mentions_percentage_change_24h,opening_price,closing_price,label
0,06-18-2023,2,AMD,AMD,56,42.0,112,67.0,4,54.0,...,2.4000,0.000000,0.0451,22.20,0,-0.500000,0.037037,125.82,120.08,0
1,06-18-2023,3,TSLA,Tesla,48,45.0,462,51.0,3,61.0,...,27.9000,0.000000,0.0557,38.34,0,0.000000,-0.213115,258.92,260.54,1
2,06-18-2023,4,NVDA,NVIDIA,44,41.0,262,57.0,2,88.0,...,20.8800,0.037512,0.4467,-12.40,1,1.000000,-0.500000,434.50,426.92,0
4,06-18-2023,7,AAPL,Apple,32,30.0,226,70.0,10,20.0,...,165.7200,0.516101,1.7635,-0.24,1,-0.300000,0.600000,186.73,184.92,0
5,06-18-2023,9,DTE,DTE Energy,23,22.0,146,56.0,6,44.0,...,11.7500,3.344746,1.8338,-15.52,1,0.500000,-0.477273,114.56,114.12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,06-16-2023,7,PLTR,Palantir,184,84.0,588,65.0,12,53.0,...,-7.1566,0.000000,0.0000,20.50,0,-0.416667,2.471698,16.87,16.30,0
706,06-16-2023,8,MSFT,Microsoft,126,82.0,613,73.0,24,35.0,...,38.4600,0.781385,0.3322,7.81,1,-0.666667,2.600000,351.32,342.33,0
707,06-16-2023,9,SPCE,Virgin Galactic,125,99.0,739,68.0,164,3.0,...,1.8000,0.000000,1.1349,-33.95,0,-0.945122,40.666667,5.72,4.73,0
708,06-16-2023,10,NKLA,Nikola,117,79.0,285,75.0,10,56.0,...,1.1500,0.000000,0.5830,3082.56,0,0.000000,1.089286,1.70,1.19,0


In [4]:
# Remove columns that are not needed for modelling

train_test_data = train_test_data.drop(
    ['name', 
     'ticker', 
     'timestamp',
    'opening_price',
   'closing_price',
   'rank_24h_ago',
   'mentions_24h_ago',
   'rank', 'dividend_exists'
    ], axis=1).sort_values(
    by=['mentions'], 
    ascending=False)

In [5]:
train_test_data

Unnamed: 0,mentions,mentioning_users,upvotes,sentiment,beta,earnings_per_share_ttm,price_to_equity_ttm,return_on_equity_ttm,dividend_yield_annual,total_debt_to_equity_quarterly,revenue_growth_ttm_yoy,rank_percentage_change_24h,mentions_percentage_change_24h,label
540,3216,1370.0,30860,57.0,1.770654,1.7415,173.5333,18.66000,0.052394,0.4956,0.22,-0.666667,10.694545,0
520,2571,1191.0,33966,60.0,1.823568,1.9242,226.4381,20.88000,0.035035,0.4467,-12.40,0.000000,1.850333,0
587,1684,1076.0,10793,68.0,2.890432,-2.4261,0.0000,-22.98162,0.000000,0.0000,14.60,0.000000,1.284939,0
590,1401,856.0,8556,59.0,2.890432,-2.4261,0.0000,-22.98162,0.000000,0.0000,14.60,0.000000,-0.170515,1
541,1363,889.0,8328,68.0,2.707838,-2.4261,0.0000,-22.98162,0.000000,0.0000,14.60,1.000000,3.168196,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,15,15.0,26,55.0,0.611049,3.3704,19.1572,7.51000,4.187995,1.5801,14.73,-0.740000,2.750000,0
497,15,14.0,113,100.0,2.195313,-43.4870,3.2245,-198.10000,0.000000,14.7739,-42.45,-0.963415,14.000000,0
499,14,14.0,54,83.0,2.839289,-0.8683,222.1660,-11.87000,0.000000,3.9975,9.64,0.400000,-0.125000,0
68,12,13.0,110,0.0,0.683394,1.0993,3.2375,7.49000,0.000000,0.9927,62.15,-0.714286,0.200000,1


In [6]:
train_test_data.value_counts('label')

label
1    345
0    344
dtype: int64

In [7]:
x_train, x_test, y_train, y_test = train_test_split(train_test_data.drop('label', axis=1), train_test_data['label'], test_size=0.2, random_state=42)

In [8]:
# scitkit learn neural network

from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(30,30,30))

mlp.fit(x_train, y_train)

predictions = mlp.predict(x_test)

print(confusion_matrix(y_test,predictions))

print(classification_report(y_test,predictions))

[[34 31]
 [39 34]]
              precision    recall  f1-score   support

           0       0.47      0.52      0.49        65
           1       0.52      0.47      0.49        73

    accuracy                           0.49       138
   macro avg       0.49      0.49      0.49       138
weighted avg       0.50      0.49      0.49       138



### Hyper-parameter tuning

In [9]:
# hyper parameter tuning through random search for the neural network

from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest

hidden_layer_sizes = [(10,10,10), (20,20,20), (30,30,30), (40,40,40), (50,50,50), 
                      (60,60,60), (70,70,70), (80,80,80), (90,90,90), 
                      (100,100,100)]

# Create the random grid

random_grid = {'hidden_layer_sizes': hidden_layer_sizes,
               'learning_rate': ['constant', 'invscaling', 'adaptive'],
                'solver': ['lbfgs', 'sgd', 'adam'],
                'max_iter': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
                'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
                'momentum': [0.1, 0.2, 0.3, 0.4, 0.5],
                'nesterovs_momentum': [True, False],
                'early_stopping': [True, False],
                'validation_fraction': [0.1, 0.2, 0.3, 0.4, 0.5],
                'beta_1': [0.1, 0.2, 0.3, 0.4, 0.5],
                'beta_2': [0.1, 0.2, 0.3, 0.4, 0.5],
                'epsilon': [1e-08, 1e-07, 1e-06, 1e-05, 1e-04],
                'n_iter_no_change': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],               
               }

# Use the random grid to search for best hyperparameters

# First create the base model to tune

mlp = MLPClassifier()

# Random search of parameters, using 3 fold cross validation,

mlp_random = RandomizedSearchCV(estimator = mlp, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model

mlp_random.fit(x_train, y_train)

mlp_random.best_params_

best_random = mlp_random.best_estimator_

predictions = best_random.predict(x_test)

print(confusion_matrix(y_test,predictions))

print(classification_report(y_test,predictions))

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[[24 41]
 [32 41]]
              precision    recall  f1-score   support

           0       0.43      0.37      0.40        65
           1       0.50      0.56      0.53        73

    accuracy                           0.47       138
   macro avg       0.46      0.47      0.46       138
weighted avg       0.47      0.47      0.47       138





In [11]:
mlp_random.best_params_

{'validation_fraction': 0.4,
 'solver': 'sgd',
 'nesterovs_momentum': True,
 'n_iter_no_change': 80,
 'momentum': 0.1,
 'max_iter': 800,
 'learning_rate': 'invscaling',
 'hidden_layer_sizes': (70, 70, 70),
 'epsilon': 1e-05,
 'early_stopping': False,
 'beta_2': 0.4,
 'beta_1': 0.4,
 'alpha': 1}