In [10]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pickle

In [11]:
training_df = pd.read_csv('../final_dataset.csv', index_col=0)
training_df = training_df.dropna()
ground_truth = training_df['Buy']
training_df = training_df.drop(['Symbol', 'beta', 'profitMargins','Name', 'Analyst', 'agora_pred'],
                          axis=1)
X = training_df[['headline_polarity', 'convo_polarity','forwardEps','bookValue', 'heldPercentInstitutions', 
        'shortRatio', 'shortPercentOfFloat']]
y = training_df['Buy']

In [12]:
max_accuracy = -1
highest_accuracy_model = None
balanced_accuracy = -1
best_predictions = None

test_split_ll = 0.15
test_split_ul = 0.50
test_split = test_split_ll
best_y_test = None

while test_split <= test_split_ul:
    for i in range(1, 201):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split)
        model = RandomForestClassifier(n_estimators=i)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        if sklearn.metrics.accuracy_score(predictions, y_test) > max_accuracy:
            max_accuracy = sklearn.metrics.accuracy_score(predictions, y_test)
            balanced_accuracy = max(balanced_accuracy, balanced_accuracy_score(predictions, y_test))
            highest_accuracy_model = model
            best_predictions = predictions
            best_y_test = y_test
            print(classification_report(y_test, predictions))
            print(confusion_matrix(y_test, predictions))
            print()
            print()
    
    test_split += 0.1
        
print(max_accuracy)
print(balanced_accuracy)

              precision    recall  f1-score   support

           0       0.52      0.54      0.53        24
           1       0.88      0.87      0.87        92

    accuracy                           0.80       116
   macro avg       0.70      0.71      0.70       116
weighted avg       0.80      0.80      0.80       116

[[13 11]
 [12 80]]


              precision    recall  f1-score   support

           0       0.60      0.47      0.53        19
           1       0.90      0.94      0.92        97

    accuracy                           0.86       116
   macro avg       0.75      0.71      0.72       116
weighted avg       0.85      0.86      0.86       116

[[ 9 10]
 [ 6 91]]


              precision    recall  f1-score   support

           0       0.83      0.73      0.78        26
           1       0.92      0.96      0.94        90

    accuracy                           0.91       116
   macro avg       0.88      0.84      0.86       116
weighted avg       0.90      0.9

In [13]:
pickle.dump(highest_accuracy_model, open('RF_pickled_final_df.pkl', 'wb'))

In [14]:
print(classification_report(best_y_test, best_predictions))

              precision    recall  f1-score   support

           0       0.90      0.64      0.75        14
           1       0.95      0.99      0.97       102

    accuracy                           0.95       116
   macro avg       0.93      0.82      0.86       116
weighted avg       0.95      0.95      0.94       116



In [23]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# model = RandomForestClassifier(n_estimators=6)
# model.fit(X_train, y_train)
# predictions = model.predict(X_test)
# print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.67      0.60      0.64        48
           1       0.89      0.92      0.90       170

    accuracy                           0.85       218
   macro avg       0.78      0.76      0.77       218
weighted avg       0.84      0.85      0.85       218



In [None]:
from xgboost import XGBClassifier

max_accuracy = -1
highest_accuracy_model = None
balanced_accuracy = -1
best_predictions = None

test_split_ll = 0.15
test_split_ul = 0.50
test_split = test_split_ll
best_y_test = None

while test_split <= test_split_ul:
    for i in range(1, 201):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split)
        #model = RandomForestClassifier(n_estimators=i)
        model = XGBClassifier(n_estimators=i)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        if sklearn.metrics.accuracy_score(predictions, y_test) > max_accuracy:
            max_accuracy = sklearn.metrics.accuracy_score(predictions, y_test)
            balanced_accuracy = max(balanced_accuracy, balanced_accuracy_score(predictions, y_test))
            highest_accuracy_model = model
            best_predictions = predictions
            best_y_test = y_test
            print(classification_report(y_test, predictions))
            print(confusion_matrix(y_test, predictions))
            print()
            print()
    
    test_split += 0.1
        
print(max_accuracy)
print(balanced_accuracy)