# Model Train

In [1]:
# !pip install optuna

In [2]:
import torch
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score

import optuna

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb

  from .autonotebook import tqdm as notebook_tqdm
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [3]:
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f9514ee9e70>

In [4]:
# Load the preprocessed data
X_train = np.load("data/baf/preprocess/X_train.npy")
y_train = np.load("data/baf/preprocess/y_train.npy")
X_valid = np.load("data/baf/preprocess/X_valid.npy")
y_valid = np.load("data/baf/preprocess/y_valid.npy")

with open("data/baf/preprocess/metadata.pkl", 'rb') as f:
    metadata = pickle.load(f)

In [5]:
# Train a logistic regression model
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)

# Train a random forest model
model_rf = RandomForestClassifier(n_estimators=100)
model_rf.fit(X_train, y_train)

# Train a xgboost model
model_xgb = xgb.XGBClassifier(max_depth=3, n_estimators=100)
model_xgb.fit(X_train, y_train)

# Evaluate the models
y_pred = model_lr.predict(X_valid)
print("Logistic Regression")
print(classification_report(y_valid, y_pred))
print("ROC-AUC:", roc_auc_score(y_valid, model_lr.predict_proba(X_valid)[:, 1]))

y_pred = model_rf.predict(X_valid)
print("Random Forest")
print(classification_report(y_valid, y_pred))
print("ROC-AUC:", roc_auc_score(y_valid, model_rf.predict_proba(X_valid)[:, 1]))

y_pred = model_xgb.predict(X_valid)
print("XGBoost")
print(classification_report(y_valid, y_pred))
print("ROC-AUC:", roc_auc_score(y_valid, model_xgb.predict_proba(X_valid)[:, 1]))


# Save models
with open("models/baf/model_lr.pkl", 'wb') as f:
    pickle.dump(model_lr, f)

with open("models/baf/model_rf.pkl", 'wb') as f:
    pickle.dump(model_rf, f)

with open("models/baf/model_xgb.pkl", 'wb') as f:
    pickle.dump(model_xgb, f)

Logistic Regression
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      5000
           1       0.73      0.47      0.58      1411

    accuracy                           0.85      6411
   macro avg       0.80      0.71      0.74      6411
weighted avg       0.84      0.85      0.83      6411

ROC-AUC: 0.8752963855421687
Random Forest
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      5000
           1       0.75      0.49      0.59      1411

    accuracy                           0.85      6411
   macro avg       0.81      0.72      0.75      6411
weighted avg       0.84      0.85      0.84      6411

ROC-AUC: 0.8760414599574772
XGBoost
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      5000
           1       0.73      0.56      0.63      1411

    accuracy                           0.86      6411
   macro avg       0.81      0

In [6]:
class ResNetBlock(torch.nn.Module):
    def __init__(self, in_features, out_features):
        super(ResNetBlock, self).__init__()
        self.bn = torch.nn.BatchNorm1d(in_features)
        self.fc1 = torch.nn.Linear(in_features, out_features)
        self.fc2 = torch.nn.Linear(out_features, out_features)
        self.dropout = torch.nn.Dropout(0.2)
        
    def forward(self, x):
        y = torch.relu(self.fc1(self.bn(x)))
        y = self.dropout(y)
        y = self.fc2(y)
        y = self.dropout(y)
        return torch.add(x, y)
    
class TabResNet(torch.nn.Module):
    def __init__(self, in_features, out_features, num_blocks=1, embedding_dim=128):
        super(TabResNet, self).__init__()
        self.embedding = torch.nn.Linear(in_features, embedding_dim)
        self.res_blocks = []
        for i in range(num_blocks):
            self.res_blocks.append(ResNetBlock(embedding_dim, embedding_dim))
        self.res_blocks = torch.nn.ModuleList(self.res_blocks)
        self.bn = torch.nn.BatchNorm1d(embedding_dim)
        self.fc = torch.nn.Linear(embedding_dim, out_features)
        
    def forward(self, x):
        x = self.embedding(x)
        for block in self.res_blocks:
            x = block(x)
        x = torch.relu(self.bn(x))
        x = self.fc(x)
        return x

In [7]:
model = TabResNet(X_train.shape[1], 2)
criterion = torch.nn.CrossEntropyLoss()

# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_valid_tensor = torch.tensor(X_valid, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid, dtype=torch.long)

# Create a DataLoader
train_data = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

model.train()
# Train the model
for epoch in range(10):
    model.train()
    train_loss = 0
    for i, (X_batch, y_batch) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {train_loss / len(train_loader)}")

    model.eval()
    valid_loss = 0
    with torch.no_grad():
        y_pred = model(X_valid_tensor)
        loss = criterion(y_pred, y_valid_tensor)
        valid_loss = loss.item()

        print(f"Validation Loss: {valid_loss}")
        y_pred = torch.argmax(y_pred, dim=1)
        print(classification_report(y_valid_tensor, y_pred))
        print("ROC-AUC:", roc_auc_score(y_valid, model(X_valid_tensor).detach().numpy()[:, 1]))


Epoch 0, Loss: 0.3864909635420051
Validation Loss: 0.3482488691806793
              precision    recall  f1-score   support

           0       0.90      0.91      0.90      5000
           1       0.66      0.63      0.64      1411

    accuracy                           0.85      6411
   macro avg       0.78      0.77      0.77      6411
weighted avg       0.84      0.85      0.84      6411

ROC-AUC: 0.8750987243090008
Epoch 1, Loss: 0.3666435921092353
Validation Loss: 0.3406158983707428
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      5000
           1       0.72      0.55      0.62      1411

    accuracy                           0.85      6411
   macro avg       0.80      0.75      0.77      6411
weighted avg       0.85      0.85      0.85      6411

ROC-AUC: 0.8748755492558469
Epoch 2, Loss: 0.3628064251758836
Validation Loss: 0.3438488245010376
              precision    recall  f1-score   support

           0       0.89  

In [8]:
torch.save(model.state_dict(), "models/baf/tabresnet.pth")

In [9]:
STOP

NameError: name 'STOP' is not defined

In [None]:
optuna_studies = []

for model_name in ["logistic_regression", "random_forest", "xgboost"]:
    def generate_objective(X_train, y_train, cv=3, scoring="roc_auc"):
        def objective(trial):
            if model_name == "logistic_regression":
                params = {
                    'C': trial.suggest_float('C', 1e-3, 10.0, log=True),
                    'solver': trial.suggest_categorical('solver', ['liblinear', 'lbfgs']),
                    'max_iter': 1000
                }
                model = LogisticRegression(**params)
            elif model_name == "random_forest":
                params = {
                    'n_estimators': trial.suggest_int('n_estimators', 50, 500),
                    'max_depth': trial.suggest_int('max_depth', 2, 20),
                    'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                    'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
                    'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
                }
                model = RandomForestClassifier(**params)
            elif model_name == "xgboost":
                params = {
                    'verbosity': 0,
                    'objective': 'binary:logistic',
                    'booster': 'gbtree',
                    'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
                    'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
                    'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
                    'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.4, 1.0),
                    'max_depth': trial.suggest_int('max_depth', 2, 10),
                    'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                    'eta': trial.suggest_float('eta', 1e-3, 0.1, log=True),
                    'n_estimators': trial.suggest_int('n_estimators', 50, 500),
                    'gamma': trial.suggest_float('gamma', 1e-3, 10.0, log=True),
                }
                model = xgb.XGBClassifier(**params)
            else:
                raise ValueError("Invalid model name")

            scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring)
            return scores.mean()

        return objective

    study = optuna.create_study(direction="maximize")
    study.optimize(generate_objective(X_train, y_train), n_trials=100)

    if model_name == "logistic_regression":
        best_model = LogisticRegression(**study.best_params)
    elif model_name == "random_forest":
        best_model = RandomForestClassifier(**study.best_params)
    elif model_name == "xgboost":
        best_model = xgb.XGBClassifier(**study.best_params)

    best_model.fit(X_train, y_train)

    res = {
        "model_name": model_name,
        "best_model" : best_model,
        "best_params": study.best_params,
        "best_score": study.best_value,
    }

    optuna_studies.append(res)

    y_pred = best_model.predict(X_valid)
    print(classification_report(y_valid, y_pred))
    print(roc_auc_score(y_valid, best_model.predict_proba(X_valid)[:, 1]))


with open("data/baf/optuna_studies.pkl", 'wb') as f:
    pickle.dump(optuna_studies, f)
    