In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from keras.optimizers import RMSprop
from keras.datasets import fashion_mnist
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
training_df = pd.read_csv('nn.csv', index_col=0)
training_df.drop(columns=[training_df.columns[0], training_df.columns[1], training_df.columns[2], training_df.columns[3], training_df.columns[len(training_df.columns)-1]], axis=1,  inplace=True)

# Fill in missing
def fillMissing(df, feature, method):
  if method == "mode":
    df[feature] = df[feature].fillna(df[feature].mode()[0])
  elif method == "median":
    df[feature] = df[feature].fillna(df[feature].median())
  else:
    df[feature] = df[feature].fillna(df[feature].mean())

features_missing = training_df.columns[training_df.isna().any()]
for feature in features_missing:
  fillMissing(training_df, feature = feature, method = "mean")


cols_at_end = ['Analyst']
training_df = training_df[[c for c in training_df if c not in cols_at_end] 
        + [c for c in cols_at_end if c in training_df]]
encoder = LabelEncoder()
encoder.fit(training_df['Analyst'])
encoder_values = encoder.transform(training_df['Analyst'])
training_df['analyst_predictions'] = encoder_values
training_df.drop(['Analyst'], axis=1, inplace=True)
training_df.dropna(inplace=True)

In [3]:
training_df

Unnamed: 0,headline_polarity,convo_polarity,beta,profitMargins,forwardEps,bookValue,heldPercentInstitutions,shortRatio,shortPercentOfFloat,analyst_predictions
0,0.094626,0.191632,1.734251,0.050000,2.790000,22.710000,1.000000,3.080000,0.030000,0
1,0.250000,0.210000,1.300000,0.060000,12.760000,52.440000,1.030000,6.920000,0.070000,0
2,0.094626,0.191632,1.734251,0.009524,2.948967,129.998838,0.648889,4.685499,0.063772,0
3,0.094626,0.191632,1.410000,0.920000,3.830000,16.070000,0.140000,4.980000,0.010000,0
4,0.094626,0.191632,1.080000,0.060000,1.560000,7.000000,0.060000,1.390000,0.063772,0
...,...,...,...,...,...,...,...,...,...,...
4320,0.000000,0.030000,1.734251,0.009524,2.948967,129.998838,0.648889,4.685499,0.063772,4
4321,0.180000,0.130000,0.290000,-0.570000,0.450000,16.680000,0.010000,0.190000,0.063772,4
4322,0.200000,0.040000,1.734251,0.009524,2.948967,129.998838,0.648889,4.685499,0.063772,4
4323,0.350000,0.210000,1.734251,0.070000,1.000000,1.970000,0.140000,3.930000,0.063772,4


In [4]:
X_total = training_df[[x for x in training_df.columns if x not in ['analyst_predictions']]]
y_total = training_df['analyst_predictions']
X_train, X_test, y_train, y_test = train_test_split(X_total, y_total, test_size=0.33, random_state=42)

In [5]:
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(9,)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(10, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=RMSprop(lr=0.001),
              metrics=['accuracy'])

In [6]:
history = model.fit(
    X_train, y_train,
    batch_size=32,
    epochs=200,
    verbose=0,
    validation_data=(X_test, y_test))

In [7]:
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 1.0395660400390625
Test accuracy: 0.6064425706863403


## Other models

In [69]:
df = pd.read_csv('multiclass_processed_dataset.csv', index_col=0)
df = df.drop(['Symbol'], axis=1)
df

Unnamed: 0,headline_polarity,convo_polarity,beta,profitMargins,forwardEps,bookValue,heldPercentInstitutions,shortRatio,shortPercentOfFloat,Analyst
1,0.25,0.21,1.30,0.06,12.76,52.44,1.03,6.92,0.07,Buy
5,0.19,0.27,0.53,-0.02,10.57,0.18,0.66,2.17,0.01,Buy
12,0.18,0.04,0.28,0.16,4.03,36.38,0.77,2.47,0.01,Buy
17,0.03,0.29,1.02,0.25,5.20,50.20,0.61,4.67,0.02,Buy
21,0.17,0.34,1.34,0.09,5.38,76.73,0.92,1.99,0.01,Buy
...,...,...,...,...,...,...,...,...,...,...
4298,0.16,0.32,0.65,0.26,5.11,9.17,0.93,1.80,0.01,Strong Buy
4301,-0.02,0.18,-3.73,-0.13,-1.19,2.39,0.31,0.69,0.11,Strong Sell
4307,0.13,0.32,2.73,-0.09,0.46,3.86,0.64,11.92,0.13,Strong Sell
4312,0.44,0.35,2.57,0.55,15.14,48.00,0.06,0.27,0.00,Strong Sell


In [40]:
X = df[['headline_polarity', 'convo_polarity', 'forwardEps','bookValue', 'heldPercentInstitutions', 'shortRatio', 'shortPercentOfFloat']]
y = df['Analyst']

encoder = LabelEncoder()
y = encoder.fit_transform(y)

new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [47]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(X_train, y_train)
y_pred_rtf = rf.predict(X_test)
y_pred_rtf

array([1.852, 2.332, 0.699, 1.737, 1.664, 1.93 , 2.206, 2.061, 2.411,
       1.525, 1.976, 1.155, 1.499, 2.404, 1.384, 1.632, 2.192, 1.677,
       1.098, 1.349, 1.448, 1.345, 1.406, 0.87 , 1.725, 2.625, 1.714,
       1.189, 1.343, 1.817, 2.22 , 2.318, 2.488, 2.659, 1.064, 2.254,
       1.403, 1.649, 2.788, 1.876, 2.211, 1.662, 2.372, 1.482, 1.71 ,
       1.728, 2.419, 1.083, 1.589, 2.477, 2.337, 1.602, 1.708, 2.216,
       1.529, 1.628, 2.176, 1.187, 1.149, 1.14 , 0.806, 2.022, 1.204,
       1.213, 2.566, 1.489, 2.091, 1.844, 1.231, 1.532, 1.585, 0.709,
       1.883, 1.404, 1.84 , 2.601, 2.039, 2.03 , 1.229, 1.725, 1.819,
       2.293, 2.019, 1.747, 1.616, 1.393, 1.409, 1.18 , 1.877, 1.253,
       2.38 , 1.701, 1.502, 2.624, 1.703, 1.597, 1.788, 2.311, 1.365,
       0.784, 1.516, 1.045, 1.331, 1.929, 2.058, 1.658, 1.742, 1.597,
       1.813, 1.334, 2.156, 1.371, 2.071, 2.563, 1.845, 2.107, 2.179,
       1.502, 1.781, 3.278, 1.608, 1.621, 1.328, 1.628, 1.827, 1.349,
       1.257, 1.25 ,

In [61]:
#let us build a basic model
from keras.layers import Activation, Dense, Dropout
model = Sequential()
model.add(Dense(512, input_shape=(7,)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(1))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [64]:
num_epochs =100
batch_size = 128
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=num_epochs,
                    verbose=0,
                    validation_split=0.2)

In [68]:
df['Analyst'].value_counts()

Strong Buy     420
Buy            264
Hold           168
Strong Sell     18
Sell             2
Name: Analyst, dtype: int64