In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle

#### Hyperparameters

In [None]:
# Search grids handed to GridSearchCV further down, one per estimator.
# Each key is an estimator constructor argument; each list holds the
# candidate values to try for it.

# Grid for the LogisticRegression estimator.
logistic_params = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['lbfgs', 'liblinear'],
)

# Grid for the RandomForestClassifier estimator.
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Grid for the XGBClassifier estimator.
xgb_params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 6, 10],
)

: 

#### Data Loading, Imputation, and Standardization

In [None]:
# Read the raw dataset.
# NOTE(review): the CSV path is an empty string here - presumably a
# placeholder that must be filled in before the notebook can run.
data = pd.read_csv("")

# Replace missing values with the per-column median. The imputer is fitted on
# every column of `data`, so the 'risk' column read later is imputed as well.
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(data)

# fit_transform returns a bare array; wrap it back into a DataFrame carrying
# the original column labels.
data_imputed = pd.DataFrame(imputed_values, columns=data.columns)

In [None]:
# Separate the feature columns from the 'risk' target.
X = data_imputed.drop("risk", axis=1)
y = data_imputed['risk']

# Split BEFORE scaling. Fitting the scaler on the full matrix would let the
# held-out rows influence the mean/variance used to transform the training
# rows (data leakage), which makes test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled version of the complete feature matrix, kept because the
# cross-validation cell further down reads `X_scaled`.
X_scaled = scaler.transform(X)

#### Data Analysis

#### Model Training and Hyperparameter Tuning

In [None]:
# Base (untuned) estimators; hyperparameters are chosen by the grid searches
# that follow. Every estimator gets the same seed used elsewhere in the
# notebook so repeated runs are reproducible (the logistic grid includes the
# 'liblinear' solver, which uses the random state).
logistic_model = LogisticRegression(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
# NOTE(review): `use_label_encoder` is reported as deprecated/unused by recent
# XGBoost releases - confirm against the installed version before removing it.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

In [None]:
# 5-fold grid search over `logistic_params`, ranked by accuracy, fitted on the
# training split.
logistic_grid = GridSearchCV(
    estimator=logistic_model,
    param_grid=logistic_params,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Take the search's best estimator and predict the held-out split with it.
best_logistic = logistic_grid.best_estimator_
y_pred_logistic = best_logistic.predict(X_test)

In [None]:
# 5-fold grid search over `rf_params`, ranked by accuracy, fitted on the
# training split.
rf_grid = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_params,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Take the search's best estimator and predict the held-out split with it.
best_rf = rf_grid.best_estimator_
y_pred_rf = best_rf.predict(X_test)

In [None]:
# 5-fold grid search over `xgb_params`, ranked by accuracy, fitted on the
# training split.
xgb_grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_params,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Take the search's best estimator and predict the held-out split with it.
best_xgb = xgb_grid.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

#### Evaluation

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    """Print a classification report and plot and save the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.
    model_name : str
        Label used in the printed header and the plot title, and as the stem
        of the saved image (``model_name + '.png'``, relative to the current
        working directory).

    Returns
    -------
    None
        The function only prints, displays a figure and writes a PNG file.
    """
    print(f"{model_name} Evaluation:")
    print(classification_report(y_true, y_pred))
    print("Confusion Matrix:")

    # Draw on a dedicated figure/axes so each call produces its own plot
    # instead of relying on pyplot's implicit "current" figure.
    fig, ax = plt.subplots()
    sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f"{model_name} Confusion Matrix")

    # Save BEFORE showing: after plt.show() the figure has typically been
    # closed, so saving afterwards can write an empty image.
    fig.savefig(model_name+'.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


In [None]:
# Report on each tuned model in turn, always against the same held-out labels.
predictions_by_model = [
    ("LogisticRegression", y_pred_logistic),
    ("Random Forest", y_pred_rf),
    ("XGBoost", y_pred_xgb),
]

for model_label, model_predictions in predictions_by_model:
    evaluate_model(y_test, model_predictions, model_label)

#### Feature Importance

In [None]:
# Label each importance score with its feature column name and list the
# features from most to least important.
feature_names = X.columns

rf_raw_importances = pd.Series(best_rf.feature_importances_, index=feature_names)
rf_importances = rf_raw_importances.sort_values(ascending=False)
print("Random Forest Feature Importances: ")
print(rf_importances)

xgb_raw_importances = pd.Series(best_xgb.feature_importances_, index=feature_names)
xgb_importances = xgb_raw_importances.sort_values(ascending=False)
print("XGBoost Feature Importances:")
print(xgb_importances)

#### Cross Validation

In [None]:
# 5-fold cross-validation of the tuned tree models on the full scaled dataset;
# the mean of the per-fold scores is reported for each model.
rf_cv_scores = cross_val_score(best_rf, X_scaled, y, cv=5)
rf_cv_mean = np.mean(rf_cv_scores)
print("Random Forest Cross-Validation Accuracy:", rf_cv_mean)

xgb_cv_scores = cross_val_score(best_xgb, X_scaled, y, cv=5)
xgb_cv_mean = np.mean(xgb_cv_scores)
print("XGBoost Cross-Validation Accuracy:", xgb_cv_mean)

#### Save Best Model

In [None]:
# `model` was never assigned anywhere in the notebook, so this cell raised
# NameError on a fresh kernel. Select the tuned estimator whose grid search
# reached the highest best cross-validated score (all three searches use
# scoring='accuracy', so the scores are directly comparable).
best_grid = max((logistic_grid, rf_grid, xgb_grid), key=lambda grid: grid.best_score_)
model = best_grid.best_estimator_

# NOTE(review): ".pkl" has no base name - presumably a placeholder; confirm
# the intended output file name.
with open(".pkl", "wb") as file:
    pickle.dump(model, file)

In [None]:
# An empty path makes open() fail unconditionally; read back the file written
# by the save cell instead.
# NOTE(review): keep this in sync with the save cell if that file is renamed.
file_path = '.pkl'

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles that were produced by this notebook or another trusted source.
with open(file_path, 'rb') as file:
    loaded_model = pickle.load(file)
