In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [13]:
data = pd.read_csv("trades v4.csv")
data.drop(["MonitorPrice"], axis=1, inplace=True)

In [14]:
data.dropna(inplace=True)

In [16]:
# sort the data by bought
data.sort_values(by="Bought", inplace=True)
data.head()

Unnamed: 0,id,member,Ticker,Type,Bought,Filed,Delta,BuyPrice,Amount,FilePrice,...,Earnings Per Share,Operating Income,Gross Profit,Net Income,CRC,Revenue,Spot7,Spot30,Spot90,Spot180
2215,20003069,Pete Sessions,BDX,S,1366084800,1366084800,0,77.790199,15000,77.790199,...,1.23,358569000.0,961463000.0,262985000.0,1.048209,1887645000.0,78.132584,81.93161,83.714333,84.663231
2211,20003069,Pete Sessions,aaPl,P,1366344000,1366344000,0,11.939035,15000,11.939035,...,14.03,17340000000.0,20703000000.0,13064000000.0,0.804604,46333000000.0,12.754375,13.333024,13.286859,15.522838
2228,20003069,Pete Sessions,IBM,P,1366603200,1366603200,0,112.201172,15000,112.201172,...,2.65,3836000000.0,11118000000.0,3066000000.0,0.99609,24673000000.0,118.963272,124.226112,116.153984,104.816299
2220,20003069,Pete Sessions,EMR,S,1366948800,1366948800,0,39.266094,15000,39.266094,...,0.5,371000000.0,2055000000.0,381000000.0,1.101106,5309000000.0,40.982716,41.356674,43.397621,48.597759
1612,20001429,Lamar Smith,QCoM,S,1367208000,1400644800,33436800,44.411404,15000,58.602509,...,0.83,1551000000.0,2927000000.0,1395000000.0,1.016534,4681000000.0,59.007065,59.050549,55.553059,52.343555


In [5]:
# more data wrangling
data['Ticker'] = data['Ticker'].str.upper()
data['Type'] = data['Type'].str.upper()

float_columns = ['BuyPrice', 'Bought', 'Filed', 'Delta', 'FilePrice', 'NM', 'OM', 'ROA', 'RGR', 'EGR', 'CLR', 'DER', 
                 "Assets", "Liabilities", "Equity", "Net Cashflow", "Earnings Per Share", "Operating Income",
                 'Spot7', 'Spot30', 'Spot90', 'Spot180']
# float_columns = ['BuyPrice', 'FilePrice', 'Spot7', 'Spot30', 'Spot90', 'Spot180']
data[float_columns] = data[float_columns].round(2)

data["Profitable7"] = (data["FilePrice"] < data["Spot7"]).astype(int)
data["Profitable30"] = (data["FilePrice"] < data["Spot30"]).astype(int)
data["Profitable90"] = (data["FilePrice"] < data["Spot90"]).astype(int)
data["Profitable180"] = (data["FilePrice"] < data["Spot180"]).astype(int)

In [5]:
data.head()

Unnamed: 0,id,member,Ticker,Type,Bought,Filed,Delta,BuyPrice,Amount,FilePrice,...,CRC,Revenue,Spot7,Spot30,Spot90,Spot180,Profitable7,Profitable30,Profitable90,Profitable180
0,20021740,Robert B. Aderholt,AAPL,P,1661486400,1663905600,2419200,162.0,15000,149.0,...,0.76746,111000000000.0,137.0,145.0,131.0,156.0,0,0,0,1
1,20022132,Robert B. Aderholt,TSLA,S,1670216400,1670907600,691200,182.0,15000,161.0,...,1.110105,10389000000.0,138.0,124.0,174.0,244.0,0,0,1,1
3,20021134,Cindy Axne,DHR,P,1652155200,1654228800,2073600,211.0,15000,233.0,...,1.082917,6858000000.0,220.0,226.0,240.0,240.0,0,0,1,1
4,20021134,Cindy Axne,DHR,P,1652155200,1654228800,2073600,211.0,15000,233.0,...,1.082917,6858000000.0,220.0,226.0,240.0,240.0,0,0,1,1
5,20021134,Cindy Axne,DHR,P,1652155200,1654228800,2073600,211.0,15000,233.0,...,1.082917,6858000000.0,220.0,226.0,240.0,240.0,0,0,1,1


In [6]:
# categorical_features = ["member", "Ticker", "Type"]
categorical_features = ["member", "Ticker", "Type", "State", "Country", "Sector"]
# numerical_features = ['BuyPrice', 'Amount']
numerical_features = ['BuyPrice', 'Bought', 'Filed', 'Delta', 'Amount', 'FilePrice',
                      'NM', 'OM', 'ROA', 'RGR', 'EGR', 'CLR', 'DER',
                      "Assets", "Liabilities", "Equity", "Net Cashflow", "Earnings Per Share", "Operating Income",
                      ]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
numerical_transformer = SimpleImputer(strategy='mean')

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ]
)


In [7]:
X = data[categorical_features + numerical_features]
y = data['Profitable7']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=69)


In [8]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # ('classifier', GradientBoostingClassifier(n_estimators=100, tol=0.001 ,min_samples_split=6, max_depth=9, learning_rate=0.01, random_state=69))
    ('classifier', GradientBoostingClassifier(n_estimators=1000000, random_state=69))
])


In [9]:
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
# accuracy score
print(model.score(X_test, y_test))

              precision    recall  f1-score   support

           0       0.67      0.66      0.67       126
           1       0.69      0.70      0.69       134

    accuracy                           0.68       260
   macro avg       0.68      0.68      0.68       260
weighted avg       0.68      0.68      0.68       260

0.6807692307692308


In [21]:
# take the bottom 15% of the data as the test data
test_data = data.tail(int(0.15 * data.shape[0]))

# make predictions, if the model was correct, and its probability
# the direction 0 if spot7 is less than fileprice, 1 otherwise
test_data["Direction"] = (test_data["Spot7"] > test_data["FilePrice"]).astype(int)
test_data["Prediction"] = model.predict(test_data[categorical_features + numerical_features])
test_data["Probability"] = model.predict_proba(test_data[categorical_features + numerical_features])[:, 1]

# save the predictions
test_data.to_csv("predictions.csv", index=False)

In [18]:
import pickle
# save model

with open("model v4.1.pkl", "wb") as f:
    pickle.dump(model, f)

In [20]:
# hyperparam tuning

# usually really good initial test for accuracy
param_grid = {
    'classifier__n_estimators': [25, 50, 100, 200, 300, 400, 500, 1000],
    'classifier__learning_rate': [0.00001, 0.001, 0.01, 0.1, 0.5],
    'classifier__max_depth': [3, 5, 7, 9, 15, 20],
    'classifier__min_samples_split': [2, 4, 6, 8],
    'classifier__tol': [1e-4, 1e-3, 1e-2],
}
grid_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=250,
    scoring='accuracy',
    cv=5,
    refit='f1',
    random_state=69,
    n_jobs=-1,
)
# param_grid = {
#     'classifier__n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 1000],
#     'classifier__learning_rate': [0.01, 0.1, 0.5],
#     'classifier__max_depth': [3, 5, 7, 9, 15, 20],
#     'classifier__min_samples_split': [2, 4, 6, 8],
#     # 'classifier__min_samples_leaf': [1, 2, 3, 4],
#     # 'classifier__subsample': [0.5, 0.75, 1],
#     # 'classifier__max_features': ['sqrt', 'log2'],
#     # 'classifier__max_leaf_nodes': [None, 10, 20, 30, 40, 50],
#     # 'classifier__warm_start': [True, False],
#     # 'classifier__validation_fraction': [0.1, 0.2, 0.3, 0.4, 0.5],
#     # 'classifier__n_iter_no_change': [5, 10, 15, 20, 25],
#     'classifier__tol': [1e-4, 1e-3, 1e-2],
#     # 'classifier__ccp_alpha': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
#     # 'classifier__loss': ['deviance', 'exponential'],
#     # 'classifier__criterion': ['friedman_mse', 'mse', 'mae'],
#     # 'classifier__min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
#     # 'classifier__min_impurity_split': [None, 0.1, 0.2, 0.3, 0.4, 0.5],
#     # 'classifier__max_samples': [None, 0.5, 0.75, 1],
#     # 'classifier__ccp_alpha': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
# }

# grid_search = RandomizedSearchCV(
#     estimator=model,
#     param_distributions=param_grid,
#     n_iter=1000,
#     scoring='accuracy',
#     cv=5,
#     refit='f1',
#     random_state=69,
#     n_jobs=-1,
# )

grid_search.fit(X_train, y_train)

print("Best parameters found:")
print(grid_search.best_params_)

print("Best score found:")
print(grid_search.best_score_)

best_model = grid_search.best_estimator_

test_score = best_model.score(X_test, y_test)
print(f"Test score: {test_score}")

Best parameters found:
{'classifier__tol': 0.0001, 'classifier__n_estimators': 500, 'classifier__min_samples_split': 2, 'classifier__max_depth': 9, 'classifier__learning_rate': 0.01}
Best score found:
0.6374331550802139
Test score: 0.6


In [76]:
import pickle
# save model
model = best_model

with open("model v1.pkl", "wb") as f:
    pickle.dump(model, f)

In [None]:
import joblib
joblib.dump(model, "model v1.1.pkl")