In [23]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss, accuracy_score
import numpy as np
from collections import Counter
import pickle

In [25]:
training_df = pd.read_csv('../final_dataset.csv', index_col=0)
training_df = training_df.dropna()
ground_truth = training_df['Buy']
training_df = training_df.drop(['Symbol', 'beta', 'profitMargins','Name', 'Analyst', 'agora_pred'],
                          axis=1)
X = training_df[['headline_polarity', 'convo_polarity','forwardEps','bookValue', 'heldPercentInstitutions', 
        'shortRatio', 'shortPercentOfFloat']]
y = training_df['Buy'].values.ravel()

X2 = training_df[['headline_polarity', 'convo_polarity','forwardEps','bookValue', 'heldPercentInstitutions', 
        'shortRatio', 'shortPercentOfFloat', 'last2PolarityDeltaConvo', 'last2PolarityDeltaHead']]
y2 = training_df['Buy'].values.ravel()

In [28]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]  
    
    print("X_train size:", X_train.shape)
    print("X_test size:", X_test.shape)
    print("y_train size:", y_train.shape)
    print("y_test size:", y_test.shape)
    
    y_train_distribution = Counter(y_train)
    y_test_distribution = Counter(y_test)

    print("\ny_train distribution:", y_train_distribution)
    print("y_test distribution:", y_test_distribution)

    y_train_percent = {k: v / len(y_train) * 100 for k, v in y_train_distribution.items()}
    y_test_percent = {k: v / len(y_test) * 100 for k, v in y_test_distribution.items()}

    print("\nClass distribution in y_train (percentages):", y_train_percent)
    print("Class distribution in y_test (percentages):", y_test_percent)
    print("\n")

    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    accuracies.append(accuracy)

    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    print(f'Log Loss: {log_loss(y_test, predictions)}')

average_accuracy = np.mean(accuracies)
print(f'Average Accuracy: {average_accuracy:.4f}')

X_train size: (688, 7)
X_test size: (172, 7)
y_train size: (688,)
y_test size: (172,)
              precision    recall  f1-score   support

           0       0.72      0.50      0.59        36
           1       0.88      0.95      0.91       136

    accuracy                           0.85       172
   macro avg       0.80      0.72      0.75       172
weighted avg       0.84      0.85      0.84       172

[[ 18  18]
 [  7 129]]
Log Loss: 5.238903108883307
X_train size: (688, 7)
X_test size: (172, 7)
y_train size: (688,)
y_test size: (172,)
              precision    recall  f1-score   support

           0       0.87      0.56      0.68        36
           1       0.89      0.98      0.93       136

    accuracy                           0.89       172
   macro avg       0.88      0.77      0.81       172
weighted avg       0.89      0.89      0.88       172

[[ 20  16]
 [  3 133]]
Log Loss: 3.9815663627513134
X_train size: (688, 7)
X_test size: (172, 7)
y_train size: (688,)
y_tes

In [33]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

for train_index, test_index in kf.split(X2, y2):
    X_train, X_test = X2[train_index], X2[test_index]
    y_train, y_test = y2[train_index], y2[test_index]  
    
    print("X_train size:", X_train.shape)
    print("X_test size:", X_test.shape)
    print("y_train size:", y_train.shape)
    print("y_test size:", y_test.shape)
    
    y_train_distribution = Counter(y_train)
    y_test_distribution = Counter(y_test)

    print("\ny_train distribution:", y_train_distribution)
    print("y_test distribution:", y_test_distribution)

    y_train_percent = {k: v / len(y_train) * 100 for k, v in y_train_distribution.items()}
    y_test_percent = {k: v / len(y_test) * 100 for k, v in y_test_distribution.items()}

    print("\nClass distribution in y_train (percentages):", y_train_percent)
    print("Class distribution in y_test (percentages):", y_test_percent)
    print("\n")

    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    accuracies.append(accuracy)

    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    print(f'Log Loss: {log_loss(y_test, predictions)}')

average_accuracy = np.mean(accuracies)
print(f'Average Accuracy: {average_accuracy:.4f}')

X_train size: (800, 20)
X_test size: (200, 20)
y_train size: (800,)
y_test size: (200,)

y_train distribution: Counter({1: 400, 0: 400})
y_test distribution: Counter({1: 100, 0: 100})

Class distribution in y_train (percentages): {1: 50.0, 0: 50.0}
Class distribution in y_test (percentages): {1: 50.0, 0: 50.0}


              precision    recall  f1-score   support

           0       0.86      0.92      0.89       100
           1       0.91      0.85      0.88       100

    accuracy                           0.89       200
   macro avg       0.89      0.89      0.88       200
weighted avg       0.89      0.89      0.88       200

[[92  8]
 [15 85]]
Log Loss: 4.145020139748473
X_train size: (800, 20)
X_test size: (200, 20)
y_train size: (800,)
y_test size: (200,)

y_train distribution: Counter({1: 400, 0: 400})
y_test distribution: Counter({0: 100, 1: 100})

Class distribution in y_train (percentages): {1: 50.0, 0: 50.0}
Class distribution in y_test (percentages): {0: 50.0, 1: 50.0}


In [5]:
pickle.dump(highest_accuracy_model, open('RF_pickled_final_df.pkl', 'wb'))

In [75]:
#testing previous model and baseline models

"""import os
print(os.getcwd())"""
test_split = 0.20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split)

print("X_train size:", X_train.shape)
print("X_test size:", X_test.shape)
print("y_train size:", y_train.shape)
print("y_test size:", y_test.shape)

y_train_distribution = Counter(y_train)
y_test_distribution = Counter(y_test)

print("\ny_train distribution:", y_train_distribution)
print("y_test distribution:", y_test_distribution)

y_train_percent = {k: v / len(y_train) * 100 for k, v in y_train_distribution.items()}
y_test_percent = {k: v / len(y_test) * 100 for k, v in y_test_distribution.items()}

print("\nClass distribution in y_train (percentages):", y_train_percent)
print("Class distribution in y_test (percentages):", y_test_percent)
print("\n")

pickled_model = pickle.load(open('RF_pickled_final_old_df.pkl', 'rb'))
old_predictions = pickled_model.predict(X_test)
print("old model")
print(classification_report(y_test, old_predictions))
print(confusion_matrix(y_test, old_predictions))
print(log_loss(y_test, old_predictions))
print('accuracy: ', sklearn.metrics.accuracy_score(y_test, old_predictions))
print()

pickled_model = pickle.load(open('RF_pickled_final_old_df_defaultParam.pkl', 'rb'))
old_predictions = pickled_model.predict(X_test)
print("old default param model - not brute forced")
print(classification_report(y_test, old_predictions))
print(confusion_matrix(y_test, old_predictions))
print(log_loss(y_test, old_predictions))
print('accuracy: ', sklearn.metrics.accuracy_score(y_test, old_predictions))
print()

mostFreqBaseline = DummyClassifier(strategy="most_frequent")
mostFreqBaseline.fit(X_train, y_train)
mostFreqBaselinePred = mostFreqBaseline.predict(X_test)
print("most frequent baseline model")
print(classification_report(y_test, mostFreqBaselinePred, zero_division=0))
print(confusion_matrix(y_test, mostFreqBaselinePred))
print(log_loss(y_test, mostFreqBaselinePred))
print('accuracy: ', sklearn.metrics.accuracy_score(y_test, mostFreqBaselinePred))
print()

uniformBaseline = DummyClassifier(strategy="uniform")
uniformBaseline.fit(X_train, y_train)
uniformBaselinePred = uniformBaseline.predict(X_test)
print("uniform random baseline model")
print(classification_report(y_test, uniformBaselinePred))
print(confusion_matrix(y_test, uniformBaselinePred))
print(log_loss(y_test, uniformBaselinePred))
print('accuracy: ', sklearn.metrics.accuracy_score(y_test, uniformBaselinePred))
print()

stratifiedBaseline = DummyClassifier(strategy="stratified")
stratifiedBaseline.fit(X_train, y_train)
stratifiedBaselinePred = stratifiedBaseline.predict(X_test)
print("stratified random baseline model")
print(classification_report(y_test, stratifiedBaselinePred))
print(confusion_matrix(y_test, stratifiedBaselinePred))
print(log_loss(y_test, stratifiedBaselinePred))
print('accuracy: ', sklearn.metrics.accuracy_score(y_test, stratifiedBaselinePred))
print()

X_train size: (688, 7)
X_test size: (172, 7)
y_train size: (688,)
y_test size: (172,)

y_train distribution: Counter({1: 542, 0: 146})
y_test distribution: Counter({1: 135, 0: 37})

Class distribution in y_train (percentages): {1: 78.77906976744185, 0: 21.22093023255814}
Class distribution in y_test (percentages): {1: 78.48837209302324, 0: 21.511627906976745}


old model
              precision    recall  f1-score   support

           0       0.84      0.57      0.68        37
           1       0.89      0.97      0.93       135

    accuracy                           0.88       172
   macro avg       0.87      0.77      0.80       172
weighted avg       0.88      0.88      0.87       172

[[ 21  16]
 [  4 131]]
4.191122487106646
accuracy:  0.8837209302325582

old default param model - not brute forced
              precision    recall  f1-score   support

           0       0.88      0.62      0.73        37
           1       0.90      0.98      0.94       135

    accuracy        

In [78]:
pickled_model = pickle.load(open('RF_pickled_final_df.pkl', 'rb'))
new_predictions = pickled_model.predict(X_test)
print(classification_report(y_test, new_predictions))
print(confusion_matrix(y_test, new_predictions))
print(log_loss(y_test, new_predictions))
print('accuracy: ', sklearn.metrics.accuracy_score(new_predictions, y_test))

              precision    recall  f1-score   support

           0       0.96      0.70      0.81        37
           1       0.92      0.99      0.96       135

    accuracy                           0.93       172
   macro avg       0.94      0.85      0.88       172
weighted avg       0.93      0.93      0.93       172

[[ 26  11]
 [  1 134]]
2.5146734922639875
accuracy:  0.9302325581395349


In [14]:
print(classification_report(best_y_test, best_predictions))

              precision    recall  f1-score   support

           0       0.90      0.64      0.75        14
           1       0.95      0.99      0.97       102

    accuracy                           0.95       116
   macro avg       0.93      0.82      0.86       116
weighted avg       0.95      0.95      0.94       116



In [23]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# model = RandomForestClassifier(n_estimators=6)
# model.fit(X_train, y_train)
# predictions = model.predict(X_test)
# print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.67      0.60      0.64        48
           1       0.89      0.92      0.90       170

    accuracy                           0.85       218
   macro avg       0.78      0.76      0.77       218
weighted avg       0.84      0.85      0.85       218



In [15]:
from xgboost import XGBClassifier

max_accuracy = -1
highest_accuracy_model = None
balanced_accuracy = -1
best_predictions = None

test_split_ll = 0.15
test_split_ul = 0.50
test_split = test_split_ll
best_y_test = None

while test_split <= test_split_ul:
    for i in range(1, 201):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split)
        #model = RandomForestClassifier(n_estimators=i)
        model = XGBClassifier(n_estimators=i)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        if sklearn.metrics.accuracy_score(predictions, y_test) > max_accuracy:
            max_accuracy = sklearn.metrics.accuracy_score(predictions, y_test)
            balanced_accuracy = max(balanced_accuracy, balanced_accuracy_score(predictions, y_test))
            highest_accuracy_model = model
            best_predictions = predictions
            best_y_test = y_test
            print(classification_report(y_test, predictions))
            print(confusion_matrix(y_test, predictions))
            print()
            print()
    
    test_split += 0.1
        
print(max_accuracy)
print(balanced_accuracy)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.81      1.00      0.90        94

    accuracy                           0.81       116
   macro avg       0.41      0.50      0.45       116
weighted avg       0.66      0.81      0.73       116

[[ 0 22]
 [ 0 94]]


              precision    recall  f1-score   support

           0       0.67      0.33      0.44        18
           1       0.89      0.97      0.93        98

    accuracy                           0.87       116
   macro avg       0.78      0.65      0.69       116
weighted avg       0.85      0.87      0.85       116

[[ 6 12]
 [ 3 95]]


              precision    recall  f1-score   support

           0       0.75      0.40      0.52        15
           1       0.92      0.98      0.95       101

    accuracy                           0.91       116
   macro avg       0.83      0.69      0.73       116
weighted avg       0.90      0.9