In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report



In [10]:
data = pd.read_csv("trades.csv")
data.drop(["MonitorPrice"], axis=1, inplace=True)

In [11]:
data.head()

Unnamed: 0,id,member,Ticker,Type,Bought,BuyPrice,Amount,FilePrice,Spot7,Spot30,Spot90,Spot180,MonitorPrice
0,20001934,Lou Barletta,DDD,S,10/22/2014,36.669998,15000,36.669998,37.099998,35.150002,29.32,32.5,
1,20000404,John A. Boehner,CAH,s,02/27/2014,53.021889,15000,53.021889,53.560749,51.661263,51.490734,54.630642,
2,20000404,John A. Boehner,slB,P,02/26/2014,69.109833,50000,69.109833,68.855377,73.016212,76.451126,82.801132,
3,20000404,John A. Boehner,WMT,s,02/25/2014,19.52754,50000,19.52754,20.001425,20.401081,,20.551355,
4,20001087,John A. Boehner,KN,S,05/20/2014,29.540001,15000,29.540001,28.16,31.93,31.959999,19.85,


In [8]:
len(data)

0

In [5]:
data.dropna(inplace=True)

In [7]:
len(data)

0

In [45]:
data["Profitable7"] = (data["FilePrice"] < data["Spot7"]).astype(int)
data["Profitable30"] = (data["FilePrice"] < data["Spot30"]).astype(int)
data["Profitable90"] = (data["FilePrice"] < data["Spot90"]).astype(int)
data["Profitable180"] = (data["FilePrice"] < data["Spot180"]).astype(int)

In [46]:
data.head()

Unnamed: 0,id,member,Ticker,Type,Bought,BuyPrice,Amount,Filed,FilePrice,Spot7,Spot30,Spot90,Spot180,MonitorPrice,Profitable7,Profitable30,Profitable90,Profitable180
3,20023082,Rick W. Allen,MMM,S,05/14/2020,93.738243,15000,05/20/2020,103.174828,101.504097,107.811096,114.431389,118.816765,129.919998,0,1,1,1
4,20023082,Rick W. Allen,MMM,S,05/14/2020,93.738243,15000,06/05/2020,116.540703,101.504097,107.811096,116.828751,118.816765,129.919998,0,0,1,1
5,20023082,Rick W. Allen,MMM,P,08/22/2019,108.956505,50000,09/05/2019,111.225311,108.604324,112.938774,114.176102,109.516396,129.919998,0,1,1,0
6,20023082,Rick W. Allen,MMM,P,06/19/2019,114.59243,15000,07/05/2019,115.431313,115.350792,115.840706,105.13002,116.642151,129.919998,0,1,0,1
7,20023082,Rick W. Allen,MMM,S,05/17/2019,112.515663,15000,06/05/2019,109.961746,111.465034,111.867722,107.439468,115.505539,129.919998,1,1,0,1


In [47]:
categorical_features = ["member", "Ticker", "Type"]
numerical_features = ['BuyPrice', 'Amount']
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
numerical_transformer = SimpleImputer(strategy='mean')

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ]
)


In [59]:
X = data[categorical_features + numerical_features]
y = data['Profitable90']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [60]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=100))
])


In [61]:
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.47      0.55       106
           1       0.71      0.84      0.77       165

    accuracy                           0.69       271
   macro avg       0.68      0.65      0.66       271
weighted avg       0.69      0.69      0.68       271



In [62]:
# hyperparam tuning
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2
)

grid_search.fit(X_train, y_train)

print("Best parameters found:")
print(grid_search.best_params_)

print("Best score found:")
print(grid_search.best_score_)

best_model = grid_search.best_estimator_

test_score = best_model.score(X_test, y_test)
print(f"Test score: {test_score}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.1s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.1s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.1s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.1s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.1s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=200; total time=   0.3s
[CV] END classifier__max_depth=N