In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [19]:
data = pd.read_csv("trades v4.csv")
data.drop(["MonitorPrice"], axis=1, inplace=True)

In [20]:
data = data.sort_values(by="Bought", ascending=False)
data.head()

Unnamed: 0,id,member,Ticker,Type,Bought,Filed,Delta,BuyPrice,Amount,FilePrice,...,Earnings Per Share,Operating Income,Gross Profit,Net Income,CRC,Revenue,Spot7,Spot30,Spot90,Spot180
1037,20026650,Robert E. Latta,FMAO,P,1737349200,1737694800,345600,,15000,26.0,...,0.39,6778000.0,,5359000.0,,38654000.0,,,,
1046,20026590,Nancy Pelosi,VST,P,1736830800,1737090000,259200,171.0,1000000,171.0,...,-0.24,86000000.0,,18000000.0,,3054000000.0,191.0,,,
1043,20026590,Nancy Pelosi,NVDA,P,1736830800,1737090000,259200,132.0,500000,138.0,...,6.04,16909000000.0,20406000000.0,14881000000.0,1.324228,26044000000.0,143.0,,,
1039,20026590,Nancy Pelosi,AMZN,P,1736830800,1737090000,259200,218.0,500000,226.0,...,1.0,15307000000.0,70680000000.0,10431000000.0,1.015861,143000000000.0,235.0,,,
1045,20026590,Nancy Pelosi,TEM,P,1736830800,1737090000,259200,32.0,100000,35.0,...,-6.86,-533492000.0,,-552212000.0,,165969000.0,51.0,,,


In [21]:
print(len(data))
print(data.keys())

4201
Index(['id', 'member', 'Ticker', 'Type', 'Bought', 'Filed', 'Delta',
       'BuyPrice', 'Amount', 'FilePrice', 'State', 'Country', 'Sector', 'NM',
       'OM', 'ROA', 'RGR', 'EGR', 'CLR', 'DER', 'Assets', 'Liabilities',
       'Equity', 'Net Cashflow', 'Earnings Per Share', 'Operating Income',
       'Gross Profit', 'Net Income', 'CRC', 'Revenue', 'Spot7', 'Spot30',
       'Spot90', 'Spot180'],
      dtype='object')


In [22]:
print(data.isna().sum())
print(data.isna().any(axis=1).sum())
data.dropna(how="all")

id                       0
member                   0
Ticker                   0
Type                     0
Bought                   0
Filed                    0
Delta                    0
BuyPrice               150
Amount                   0
FilePrice              177
State                  517
Country                295
Sector                 295
NM                     611
OM                     617
ROA                    581
RGR                    646
EGR                    609
CLR                    582
DER                    605
Assets                 515
Liabilities            515
Equity                 543
Net Cashflow           539
Earnings Per Share     837
Operating Income       557
Gross Profit          1576
Net Income             511
CRC                   1578
Revenue                538
Spot7                  431
Spot30                 264
Spot90                 360
Spot180                517
dtype: int64
2468


Unnamed: 0,id,member,Ticker,Type,Bought,Filed,Delta,BuyPrice,Amount,FilePrice,...,Earnings Per Share,Operating Income,Gross Profit,Net Income,CRC,Revenue,Spot7,Spot30,Spot90,Spot180
1037,20026650,Robert E. Latta,FMAO,P,1737349200,1737694800,345600,,15000,26.000000,...,0.39,6.778000e+06,,5.359000e+06,,3.865400e+07,,,,
1046,20026590,Nancy Pelosi,VST,P,1736830800,1737090000,259200,171.000000,1000000,171.000000,...,-0.24,8.600000e+07,,1.800000e+07,,3.054000e+09,191.000000,,,
1043,20026590,Nancy Pelosi,NVDA,P,1736830800,1737090000,259200,132.000000,500000,138.000000,...,6.04,1.690900e+10,2.040600e+10,1.488100e+10,1.324228,2.604400e+10,143.000000,,,
1039,20026590,Nancy Pelosi,AMZN,P,1736830800,1737090000,259200,218.000000,500000,226.000000,...,1.00,1.530700e+10,7.068000e+10,1.043100e+10,1.015861,1.430000e+11,235.000000,,,
1045,20026590,Nancy Pelosi,TEM,P,1736830800,1737090000,259200,32.000000,100000,35.000000,...,-6.86,-5.334920e+08,,-5.522120e+08,,1.659690e+08,51.000000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1558,20001511,Scott H. Peters,MlPN,P,1360731600,1362027600,1296000,27.910000,15000,27.030001,...,,,,,,,27.389999,,29.639999,28.870001
2237,20003069,Pete Sessions,PNC,P,1359522000,1359522000,0,43.208851,15000,43.208851,...,1.45,1.092000e+09,,8.110000e+08,,3.732000e+09,44.271244,43.879837,47.756630,53.694691
2238,20003069,Pete Sessions,STT,S,1359003600,1359003600,0,40.820091,15000,40.820091,...,0.86,5.860000e+08,,4.270000e+08,,2.421000e+09,40.915657,42.047920,43.029179,52.788448
1491,20000945,Alan S. Lowenthal,gOOgL,S,1358398800,1358398800,0,17.736504,15000,17.736504,...,,,,,,,18.805952,19.770422,19.512848,22.930141


In [23]:
len(data)

4201

In [24]:
# more data wrangling
data['Ticker'] = data['Ticker'].str.upper()
data['Type'] = data['Type'].str.upper()

float_columns = ['BuyPrice', 'Bought', 'Filed', 'Delta', 'Amount', 'FilePrice',
                  'NM', 'OM', 'ROA', 'RGR', 'EGR', 'CLR', 'DER', 
                 "Assets", "Liabilities", "Equity", "Net Cashflow", "Earnings Per Share", "Operating Income",
                 'Spot7', 'Spot30', 'Spot90', 'Spot180']
# float_columns = ['BuyPrice', 'FilePrice', 'Spot7', 'Spot30', 'Spot90', 'Spot180']
data[float_columns] = data[float_columns].round(2)

data["Profitable7"] = (data["FilePrice"] < data["Spot7"]).astype(int)
data["Profitable30"] = (data["FilePrice"] < data["Spot30"]).astype(int)
data["Profitable90"] = (data["FilePrice"] < data["Spot90"]).astype(int)
data["Profitable180"] = (data["FilePrice"] < data["Spot180"]).astype(int)

In [25]:
data.head()
len(data)

4201

In [26]:
# categorical_features = ["member", "Ticker", "Type"]
categorical_features = ["member", "Ticker", "Type", "State", "Country", "Sector"]
# numerical_features = ['BuyPrice', 'Amount']
numerical_features = ['BuyPrice', 'Bought', 'Filed', 'Delta', 'Amount', 'FilePrice',
                      'NM', 'OM', 'ROA', 'RGR', 'EGR', 'CLR', 'DER',
                      "Assets", "Liabilities", "Equity", "Net Cashflow", "Earnings Per Share", "Operating Income",
                      ]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
numerical_transformer = SimpleImputer(strategy='mean')

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ]
)


In [27]:
X = data[categorical_features + numerical_features]
# 1 if profitable after 7 days, 0 otherwise
y = data['Profitable7']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=69)


In [28]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=4294967295, n_jobs=-1, warm_start=True))
])


In [30]:
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
# accuracy score
print(model.score(X_test, y_test))

import pickle
# save model

with open("model v4.pkl", "wb") as f:
    pickle.dump(model, f)

              precision    recall  f1-score   support

           0       0.63      0.74      0.68       330
           1       0.65      0.53      0.59       301

    accuracy                           0.64       631
   macro avg       0.64      0.64      0.63       631
weighted avg       0.64      0.64      0.64       631

0.6402535657686212


In [31]:
# hyperparam tuning
param_grid = {
    'classifier__n_estimators': [100, 250, 500],  
    'classifier__max_depth': [3, 5, 7, 9, 15, 20],
    'classifier__min_samples_split': [2, 4, 6, 8],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    refit='f1',
    # verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

print("Best parameters found:")
print(grid_search.best_params_)

print("Best score found:")
print(grid_search.best_score_)

best_model = grid_search.best_estimator_

test_score = best_model.score(X_test, y_test)
print(f"Test score: {test_score}")

Best parameters found:
{'classifier__max_depth': 20, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best score found:
0.638375350140056
Test score: 0.6354992076069731


In [76]:
import pickle
# save model
model = best_model

with open("model v3.pkl", "wb") as f:
    pickle.dump(model, f)

In [None]:
import joblib
joblib.dump(model, "model v3.1.pkl")