In [29]:
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

In [12]:
# Read the trades CSV and discard the MonitorPrice column in the same step,
# so `data` never exists in a half-prepared state.
data = pd.read_csv("trades.csv").drop(columns=["MonitorPrice"])

In [13]:
# Show the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,id,member,Ticker,Type,Bought,BuyPrice,Amount,FilePrice,Spot7,Spot30,Spot90,Spot180
0,20001934,Lou Barletta,DDD,S,10/22/2014,36.669998,15000,36.669998,37.099998,35.150002,29.32,32.5
1,20000404,John A. Boehner,CAH,s,02/27/2014,53.021889,15000,53.021889,53.560749,51.661263,51.490734,54.630642
2,20000404,John A. Boehner,slB,P,02/26/2014,69.109833,50000,69.109833,68.855377,73.016212,76.451126,82.801132
3,20000404,John A. Boehner,WMT,s,02/25/2014,19.52754,50000,19.52754,20.001425,20.401081,,20.551355
4,20001087,John A. Boehner,KN,S,05/20/2014,29.540001,15000,29.540001,28.16,31.93,31.959999,19.85


In [14]:
# Number of rows before rows with missing values are removed.
data.shape[0]

5032

In [15]:
# Keep only rows in which every column has a value.
data = data.dropna(axis=0, how="any")

In [16]:
# Number of rows left after removing rows with missing values.
data.shape[0]

4255

In [38]:
# Upper-case the ticker and trade-type text so differently-cased spellings of
# the same value (e.g. "slB" / "SLB", "s" / "S") collapse into one category.
for text_column in ("Ticker", "Type"):
    data[text_column] = data[text_column].str.upper()

# Round every price column to two decimals before the labels are derived.
float_columns = ['BuyPrice', 'FilePrice', 'Spot7', 'Spot30', 'Spot90', 'Spot180']
data[float_columns] = data[float_columns].round(2)

# Binary labels: 1 when the paired Spot column is strictly greater than
# FilePrice, otherwise 0. The mapping is label column -> Spot column.
label_sources = {
    "Profitable7": "Spot7",
    "Profitable30": "Spot30",
    "Profitable90": "Spot90",
    "Profitable180": "Spot180",
}
for label_column, spot_column in label_sources.items():
    data[label_column] = (data["FilePrice"] < data[spot_column]).astype(int)

In [39]:
# Show the first five rows, now including the Profitable* label columns.
data.head(n=5)

Unnamed: 0,id,member,Ticker,Type,Bought,BuyPrice,Amount,FilePrice,Spot7,Spot30,Spot90,Spot180,Profitable7,Profitable30,Profitable90,Profitable180
0,20001934,Lou Barletta,DDD,S,10/22/2014,36.67,15000,36.67,37.1,35.15,29.32,32.5,1,0,0,0
1,20000404,John A. Boehner,CAH,S,02/27/2014,53.02,15000,53.02,53.56,51.66,51.49,54.63,1,0,0,1
2,20000404,John A. Boehner,SLB,P,02/26/2014,69.11,50000,69.11,68.86,73.02,76.45,82.8,0,1,1,1
4,20001087,John A. Boehner,KN,S,05/20/2014,29.54,15000,29.54,28.16,31.93,31.96,19.85,0,1,1,0
5,20001087,John A. Boehner,QCOM,P,05/22/2014,58.52,50000,58.52,58.99,59.05,56.52,53.54,1,1,0,0


In [40]:
# Model input columns, grouped by the preprocessing they receive.
categorical_features = ["member", "Ticker", "Type"]
numerical_features = ['BuyPrice', 'Amount']

# Categorical columns are one-hot encoded; handle_unknown="ignore" makes
# categories that were not seen during fit encode as all zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
# Numerical columns have missing values replaced by the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# (name, transformer, columns) triples applied side by side.
column_steps = [
    ("cat", categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)


In [54]:
# Feature matrix and target (the Profitable90 label), split into train and
# test sets with 15% held out and a fixed seed for reproducibility.
feature_columns = categorical_features + numerical_features
X = data[feature_columns]
y = data['Profitable90']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=69,
)


In [55]:
# End-to-end estimator: column preprocessing followed by a random forest
# with a fixed seed.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=100)),
]
model = Pipeline(steps=pipeline_steps)


In [56]:
# Fit the pipeline on the training split, then predict the held-out test split
# (fit returns the fitted pipeline, so the two calls can be chained).
y_pred = model.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.68      0.46      0.55       254
           1       0.71      0.85      0.77       385

    accuracy                           0.70       639
   macro avg       0.69      0.66      0.66       639
weighted avg       0.69      0.70      0.68       639



In [47]:
# Hyperparameter tuning for the random-forest step of the pipeline.
# Only n_estimators, max_features, bootstrap and class_weight are varied
# (4 * 3 * 2 * 2 = 48 candidates); the single-value entries pin the rest.
# Further candidates that are currently disabled:
#   max_depth 10/20/30/40, min_samples_split 5/10/15, min_samples_leaf 2/4/6.
param_grid = {
    'classifier__n_estimators': [100, 250, 500, 750],
    'classifier__max_depth': [None],
    'classifier__min_samples_split': [2],
    'classifier__min_samples_leaf': [1],
    'classifier__max_features': ['sqrt', 'log2', None],
    'classifier__bootstrap': [True, False],
    'classifier__class_weight': [None, 'balanced'],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    # With a single `scoring` metric, `refit` only acts as an on/off flag: a
    # string such as 'f1' does not change the selection metric, so candidates
    # are ranked by accuracy either way. True states that explicitly. To select
    # by F1 instead, change `scoring` itself.
    refit=True,
    verbose=2,
)

# Runs 5-fold cross-validation for every candidate, then refits the best
# candidate on the whole training split.
grid_search.fit(X_train, y_train)

print("Best parameters found:")
print(grid_search.best_params_)

# Mean cross-validated accuracy of the best candidate.
print("Best score found:")
print(grid_search.best_score_)

best_model = grid_search.best_estimator_

# Accuracy of the refit best pipeline on the held-out test split.
test_score = best_model.score(X_test, y_test)
print(f"Test score: {test_score}")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END classifier__bootstrap=True, classifier__class_weight=None, classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.8s
[CV] END classifier__bootstrap=True, classifier__class_weight=None, classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.8s
[CV] END classifier__bootstrap=True, classifier__class_weight=None, classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.8s
[CV] END classifier__bootstrap=True, classifier__class_weight=None, classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimator

In [57]:
# Save the tuned pipeline to disk.
# `best_model` is the refit best estimator produced by the grid-search cell,
# so that cell must have been run first. `model` is rebound too, so any later
# use of `model` refers to the tuned pipeline.
model = best_model

# NOTE: only unpickle model.pkl from a trusted location; loading a pickle can
# execute arbitrary code.
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)