In [3]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle
import optuna

#### Hyperparameters

In [None]:
# Discrete candidate values per hyperparameter for each model family.
# NOTE(review): none of the cells visible in this notebook reference these
# grids; the Optuna objectives below sample their own ranges.
logistic_params = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['lbfgs', 'liblinear'],
)

rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

xgb_params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 6, 10],
)

: 

#### Data Standardization

In [None]:
# TODO: the CSV path is an empty placeholder; point it at the dataset before
# running this cell.
data = pd.read_csv("")

# Fill missing values with the per-column median.
# NOTE(review): the imputer is fitted on every column and every row, which
# includes the "risk" target used below and the rows that later become the
# test split - confirm this is intended.
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(data)
data_imputed = pd.DataFrame(imputed_values, columns=data.columns)

In [None]:
# Separate the predictors from the "risk" target column.
X = data_imputed.drop("risk", axis=1)
y = data_imputed['risk']

# Split BEFORE scaling so the held-out rows never contribute to the scaler's
# mean/variance; fitting the scaler on the full matrix leaks test statistics
# into training. The split itself (test_size, random_state) is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from train only
X_test = scaler.transform(X_test_raw)        # test rows reuse the train statistics

# Full scaled matrix, kept under its original name because the
# cross-validation cell further down reads X_scaled.
X_scaled = scaler.transform(X)

#### Data Analysis

#### Model Training and Hyperparameter Tuning

In [None]:
def objective_rf(trial):
    """Optuna objective for the random forest.

    Samples one hyperparameter configuration from ``trial`` and scores it with
    5-fold cross-validation on the training split.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        Mean of the five cross-validation scores (the classifier's default
        scorer is used).
    """
    params = {
        'n_estimators': trial.suggest_int("n_estimators", 50, 300),
        'max_depth': trial.suggest_int("max_depth", 1, 32, log=True),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }

    rf_model = RandomForestClassifier(
        **params,
        random_state=42,
        n_jobs=-1
    )

    # cross_val_score clones and refits the estimator on every fold, so a
    # separate fit() beforehand is wasted work. Scoring on the training split
    # keeps the test set untouched for the final evaluation; tuning on
    # X_test/y_test would make the reported test accuracy optimistic.
    return cross_val_score(rf_model, X_train, y_train, cv=5).mean()


# Search the random-forest hyperparameter space, maximising objective_rf.
study = optuna.create_study(direction="maximize")
study.optimize(objective_rf, n_trials=100)

trial = study.best_trial


print(f"Accuracy: {trial.value}")
print(f"Best hyperparameters: {trial.params}")

# A notebook cell only renders the value of its LAST expression, so the
# figures are shown explicitly; otherwise the first three are built and
# silently discarded.
optuna.visualization.plot_optimization_history(study).show()
optuna.visualization.plot_slice(study).show()
optuna.visualization.plot_contour(study, params=["n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features"]).show()
optuna.visualization.plot_param_importances(study).show()


In [None]:
# Refit the forest with the best trial's hyperparameters.
# random_state and n_jobs are fixed arguments inside objective_rf (not sampled),
# so they are absent from trial.params and must be repeated here; without the
# seed the refit model is not the configuration that was actually tuned.
best_rf_model = RandomForestClassifier(**trial.params, random_state=42, n_jobs=-1)
best_rf_model.fit(X_train, y_train)

# Held-out accuracy of the refit model.
final_preds = best_rf_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_preds)
print(final_accuracy)

# Persist the fitted model for the evaluation section.
with open("rf_model.pkl", "wb") as file:
    pickle.dump(best_rf_model, file)

In [None]:
def objective_xgb(trial):
    """Optuna objective for XGBoost.

    Samples one hyperparameter configuration from ``trial`` and scores it with
    5-fold cross-validation on the training split, mirroring objective_rf.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        Mean of the five cross-validation scores (the classifier's default
        scorer is used).
    """
    # NOTE(review): 'gpu_hist' and use_label_encoder are reported as deprecated
    # in recent XGBoost releases - confirm against the installed version.
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'tree_method': 'gpu_hist',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),  # L2
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True)    # L1
    }

    xgb_model = XGBClassifier(**params, use_label_encoder=False)

    # Score on the training split only. Selecting hyperparameters by accuracy
    # on X_test/y_test would tune on the test set and make the final test
    # accuracy an optimistic estimate.
    return cross_val_score(xgb_model, X_train, y_train, cv=5).mean()




# Search the XGBoost hyperparameter space, maximising objective_xgb.
study = optuna.create_study(direction="maximize")
study.optimize(objective_xgb, n_trials=100)

trial = study.best_trial


print(f"Accuracy: {trial.value}")
print(f"Best hyperparameters: {trial.params}")

# A notebook cell only renders the value of its LAST expression, so the
# figures are shown explicitly; otherwise the first three are built and
# silently discarded.
optuna.visualization.plot_optimization_history(study).show()
optuna.visualization.plot_slice(study).show()
optuna.visualization.plot_contour(study, params=["n_estimators", "max_depth", "learning_rate", "subsample", "colsample_bytree", "lambda", "alpha"]).show()
optuna.visualization.plot_param_importances(study).show()


In [None]:
# Rebuild XGBoost from the best trial's sampled values plus the two fixed
# constructor arguments, then fit it on the training split.
xgb_best_kwargs = dict(trial.params)
best_xgb_model = XGBClassifier(
    **xgb_best_kwargs,
    tree_method='gpu_hist',
    use_label_encoder=False,
)
best_xgb_model.fit(X_train, y_train)

# Held-out accuracy of the refit model.
final_preds = best_xgb_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_preds)
print(final_accuracy)

# Persist the fitted model for the evaluation section.
with open("xgb_model.pkl", "wb") as model_file:
    pickle.dump(best_xgb_model, model_file)

In [None]:
def objective_lgr(trial):
    """Optuna objective for logistic regression.

    Samples ``C`` and ``penalty`` from ``trial`` (the solver is fixed to
    liblinear) and scores the configuration with 5-fold cross-validation on
    the training split, mirroring objective_rf.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        Mean of the five cross-validation scores (the classifier's default
        scorer is used).
    """
    params = {
        'C': trial.suggest_float('C', 1e-4, 10.0, log=True),  # Regularization strength
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'solver': 'liblinear'
    }

    # n_jobs is intentionally not passed: it has no effect with the liblinear
    # solver and scikit-learn emits a warning on every fit when it is set.
    lgr_model = LogisticRegression(**params, random_state=42)

    # Score on the training split only. Selecting hyperparameters by accuracy
    # on X_test/y_test would tune on the test set and make the final test
    # accuracy an optimistic estimate.
    return cross_val_score(lgr_model, X_train, y_train, cv=5).mean()



# Search the logistic-regression hyperparameter space. The objective must be
# objective_lgr: optimising objective_rf here yields random-forest parameters,
# so the "C"/"penalty" contour plot and the LogisticRegression(**trial.params)
# refit in the next cell would both fail.
study = optuna.create_study(direction="maximize")
study.optimize(objective_lgr, n_trials=100)

trial = study.best_trial


print(f"Accuracy: {trial.value}")
print(f"Best hyperparameters: {trial.params}")

# A notebook cell only renders the value of its LAST expression, so the
# figures are shown explicitly; otherwise the first three are built and
# silently discarded.
optuna.visualization.plot_optimization_history(study).show()
optuna.visualization.plot_slice(study).show()
optuna.visualization.plot_contour(study, params=["C", "penalty"]).show()
optuna.visualization.plot_param_importances(study).show()

In [None]:
# Refit logistic regression with the best trial's hyperparameters.
# trial.params holds only the SAMPLED values (C, penalty). The solver is a fixed
# entry in objective_lgr, so it has to be repeated here; the default solver
# does not support penalty='l1' and would raise when that penalty wins.
best_lgr_model = LogisticRegression(**trial.params, solver='liblinear', random_state=42)
best_lgr_model.fit(X_train, y_train)

# Held-out accuracy of the refit model.
final_preds = best_lgr_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_preds)
print(final_accuracy)

# Persist the fitted model for the evaluation section.
with open("lgr_model.pkl", "wb") as file:
    pickle.dump(best_lgr_model, file)

#### Evaluation

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    """Print a classification report and plot/save the confusion matrix.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Label used in the printed header, the plot title and the
            output file name (``<model_name>.png``).

    Returns:
        None. Side effects: prints to stdout, writes a PNG file in the current
        working directory and displays the figure.
    """
    print(f"{model_name} Evaluation:")
    print(classification_report(y_true, y_pred))
    print("Confusion Matrix:")

    # Draw on an explicit figure/axes so the save below targets this plot.
    fig, ax = plt.subplots()
    sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f"{model_name} Confusion Matrix")

    # Save BEFORE showing: once plt.show() has displayed the figure, a
    # following plt.savefig() writes a blank image.
    fig.savefig(model_name+'.png', dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
# Reload the three persisted estimators in the order lgr -> xgb -> rf.
# NOTE: pickle.load can execute code embedded in the file; only load model
# files written by this notebook.
loaded_models = {}
for model_key, pickle_path in (
    ("lgr", "lgr_model.pkl"),
    ("xgb", "xgb_model.pkl"),
    ("rf", "rf_model.pkl"),
):
    with open(pickle_path, 'rb') as file:
        loaded_models[model_key] = pickle.load(file)

lgr_model = loaded_models["lgr"]
xgb_model = loaded_models["xgb"]
rf_model = loaded_models["rf"]

# Predict on the held-out split with each reloaded model.
y_pred_lgr, y_pred_xgb, y_pred_rf = (
    estimator.predict(X_test) for estimator in (lgr_model, xgb_model, rf_model)
)

# Report every model through the shared evaluation helper.
for report_title, predictions in (
    ("LogisticRegression", y_pred_lgr),
    ("Random Forest", y_pred_rf),
    ("XGBoost", y_pred_xgb),
):
    evaluate_model(y_test, predictions, report_title)

#### Feature Importance

In [None]:
# Rank features by importance for the two tree-based models.
# The fitted estimators are named best_rf_model / best_xgb_model in the
# training cells; `best_rf` / `best_xgb` are never defined and raise NameError.
rf_importances = pd.Series(best_rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Random Forest Feature Importances: ")
print(rf_importances)

xgb_importances = pd.Series(best_xgb_model.feature_importances_, index=X.columns).sort_values(ascending=False)
print("XGBoost Feature Importances:")
print(xgb_importances)

#### Cross Validation

In [None]:
# 5-fold cross-validation of the tuned models over the full scaled dataset.
# cross_val_score clones and refits the estimator per fold, so only the
# hyperparameters of best_rf_model / best_xgb_model are reused here.
# (`best_rf` / `best_xgb` are never defined and raise NameError.)
rf_cv_scores = cross_val_score(best_rf_model, X_scaled, y, cv=5)
print("Random Forest Cross-Validation Accuracy:", np.mean(rf_cv_scores))

xgb_cv_scores = cross_val_score(best_xgb_model, X_scaled, y, cv=5)
print("XGBoost Cross-Validation Accuracy:", np.mean(xgb_cv_scores))

#### Save Best Model

In [None]:
# TODO: unfilled template cell. `model` is not assigned in any cell visible in
# this notebook, and ".pkl" is an extension with no base name. Pick the
# estimator to persist and a real file name before running.
with open(".pkl", "wb") as file:
    pickle.dump(model, file)

In [None]:
# TODO: placeholder - file_path is empty, so the open() below cannot succeed
# until a real path to a pickled model is filled in.
file_path = ''

# NOTE: unpickling can execute code embedded in the file; only load model
# files from a trusted source.
with open(file_path, 'rb') as file:
    loaded_model = pickle.load(file)
