In [1]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
import pickle

In [2]:
training_df = pd.read_csv('../final_dataset.csv', index_col=0)
training_df = training_df.dropna()
ground_truth = training_df['Buy']
training_df = training_df.drop(['Symbol', 'beta', 'profitMargins','Name', 'Analyst', 'agora_pred'],
                          axis=1)
X = training_df[['headline_polarity', 'convo_polarity','forwardEps','bookValue', 'heldPercentInstitutions', 
        'shortRatio', 'shortPercentOfFloat']]
y = training_df['Buy']

In [57]:
max_accuracy = -1
highest_accuracy_model = None
balanced_accuracy = -1
best_predictions = None

test_split = 0.20
best_y_test = None

for i in range(100, 201):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split)
    model = RandomForestClassifier(n_estimators=i)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    if sklearn.metrics.accuracy_score(predictions, y_test) > max_accuracy:
        max_accuracy = sklearn.metrics.accuracy_score(predictions, y_test)
        balanced_accuracy = max(balanced_accuracy, balanced_accuracy_score(predictions, y_test))
        highest_accuracy_model = model
        best_predictions = predictions
        best_y_test = y_test
        print(classification_report(y_test, predictions))
        print(confusion_matrix(y_test, predictions))
        print(log_loss(y_test, old_predictions))
        print()
        
print(max_accuracy)
print(balanced_accuracy)

              precision    recall  f1-score   support

           0       0.67      0.32      0.44        37
           1       0.82      0.95      0.88       117

    accuracy                           0.80       154
   macro avg       0.74      0.64      0.66       154
weighted avg       0.78      0.80      0.77       154

[[ 12  25]
 [  6 111]]
11.702484866596478

              precision    recall  f1-score   support

           0       0.78      0.26      0.39        27
           1       0.86      0.98      0.92       127

    accuracy                           0.86       154
   macro avg       0.82      0.62      0.65       154
weighted avg       0.85      0.86      0.83       154

[[  7  20]
 [  2 125]]
9.361987893277181

              precision    recall  f1-score   support

           0       0.62      0.43      0.51        23
           1       0.91      0.95      0.93       131

    accuracy                           0.88       154
   macro avg       0.77      0.69      0.72

In [58]:
pickle.dump(highest_accuracy_model, open('RF_pickled_final_df.pkl', 'wb'))

In [62]:
#testing previous model

"""import os
print(os.getcwd())"""
test_split = 0.20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split)
pickled_model = pickle.load(open('RF_pickled_final_old_df.pkl', 'rb'))
old_predictions = pickled_model.predict(X_test)
print(classification_report(y_test, old_predictions))
print(confusion_matrix(y_test, old_predictions))
print(log_loss(y_test, old_predictions))
print('accuracy: ', sklearn.metrics.accuracy_score(old_predictions, y_test))

              precision    recall  f1-score   support

           0       0.94      0.59      0.73        27
           1       0.92      0.99      0.95       127

    accuracy                           0.92       154
   macro avg       0.93      0.79      0.84       154
weighted avg       0.92      0.92      0.91       154

[[ 16  11]
 [  1 126]]
2.8085963679831547
accuracy:  0.922077922077922


In [63]:
pickled_model = pickle.load(open('RF_pickled_final_df.pkl', 'rb'))
new_predictions = pickled_model.predict(X_test)
print(classification_report(y_test, new_predictions))
print(confusion_matrix(y_test, new_predictions))
print(log_loss(y_test, new_predictions))
print('accuracy: ', sklearn.metrics.accuracy_score(new_predictions, y_test))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96        27
           1       1.00      0.98      0.99       127

    accuracy                           0.99       154
   macro avg       0.97      0.99      0.98       154
weighted avg       0.99      0.99      0.99       154

[[ 27   0]
 [  2 125]]
0.4680993946638593
accuracy:  0.987012987012987


In [14]:
print(classification_report(best_y_test, best_predictions))

              precision    recall  f1-score   support

           0       0.90      0.64      0.75        14
           1       0.95      0.99      0.97       102

    accuracy                           0.95       116
   macro avg       0.93      0.82      0.86       116
weighted avg       0.95      0.95      0.94       116



In [23]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# model = RandomForestClassifier(n_estimators=6)
# model.fit(X_train, y_train)
# predictions = model.predict(X_test)
# print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.67      0.60      0.64        48
           1       0.89      0.92      0.90       170

    accuracy                           0.85       218
   macro avg       0.78      0.76      0.77       218
weighted avg       0.84      0.85      0.85       218



In [15]:
from xgboost import XGBClassifier

max_accuracy = -1
highest_accuracy_model = None
balanced_accuracy = -1
best_predictions = None

test_split_ll = 0.15
test_split_ul = 0.50
test_split = test_split_ll
best_y_test = None

while test_split <= test_split_ul:
    for i in range(1, 201):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split)
        #model = RandomForestClassifier(n_estimators=i)
        model = XGBClassifier(n_estimators=i)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        if sklearn.metrics.accuracy_score(predictions, y_test) > max_accuracy:
            max_accuracy = sklearn.metrics.accuracy_score(predictions, y_test)
            balanced_accuracy = max(balanced_accuracy, balanced_accuracy_score(predictions, y_test))
            highest_accuracy_model = model
            best_predictions = predictions
            best_y_test = y_test
            print(classification_report(y_test, predictions))
            print(confusion_matrix(y_test, predictions))
            print()
            print()
    
    test_split += 0.1
        
print(max_accuracy)
print(balanced_accuracy)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.81      1.00      0.90        94

    accuracy                           0.81       116
   macro avg       0.41      0.50      0.45       116
weighted avg       0.66      0.81      0.73       116

[[ 0 22]
 [ 0 94]]


              precision    recall  f1-score   support

           0       0.67      0.33      0.44        18
           1       0.89      0.97      0.93        98

    accuracy                           0.87       116
   macro avg       0.78      0.65      0.69       116
weighted avg       0.85      0.87      0.85       116

[[ 6 12]
 [ 3 95]]


              precision    recall  f1-score   support

           0       0.75      0.40      0.52        15
           1       0.92      0.98      0.95       101

    accuracy                           0.91       116
   macro avg       0.83      0.69      0.73       116
weighted avg       0.90      0.9