In [None]:
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import shap
import tensorflow as tf

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, precision_recall_curve

from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score, precision_score, recall_score,accuracy_score
import pickle



sys.path.append(os.path.abspath(os.path.join('..', 'src')))
from utils import print_model_score
from visualization import create_bar_chart


In [None]:
# Load the processed CSV; every train/test split in this notebook is derived from it.
data = pd.read_csv("../data/processed/data_features_selected.csv")


## Logistic Regression

#### Split data

In [None]:

# 'AR' is the target column; every other column is used as a predictor.
y = data['AR']
X = data.drop(columns='AR')

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=2022,
)

# Oversample only the training portion: SMOTE raises the minority class to
# 60% of the majority class size. The test split is left untouched.
sampling_strategy = 0.6
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=2022)
X_train, y_train = smote.fit_resample(X_train, y_train)


#### Train Model

In [None]:

# Search space for the logistic-regression step of the pipeline. The
# 'classifier__' prefix routes each parameter to the pipeline step of that name.
param_distributions = {
    'classifier__C': [0.1, 1, 10, 100, 200],
    'classifier__penalty': ['l2'],
    'classifier__solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
}

log_model = LogisticRegression(class_weight='balanced', max_iter=4000)

# Scaling lives inside the pipeline so it is re-fitted on each CV training fold.
log_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', log_model),
])

# 5 x 1 x 5 = 25 combinations, so n_iter=25 covers the whole grid.
log_random_search = RandomizedSearchCV(
    log_pipeline,
    param_distributions,
    cv=10,
    scoring='f1',
    n_iter=25,
    random_state=2022,
)
log_random_search.fit(X_train, y_train)

best_params = log_random_search.best_params_

print(f"Best Parameters: {best_params}")

# Strip the pipeline prefix so the values can be passed straight to
# LogisticRegression.
best_log_model_params = {
    key.replace('classifier__', ''): value
    for key, value in best_params.items()
}

# Refit a standalone model with the winning hyperparameters. class_weight is
# not part of the search space, so it has to be passed again explicitly;
# otherwise the final model silently differs from the configuration that was
# tuned above.
# NOTE(review): the search evaluated these hyperparameters on standardised
# features (StandardScaler in the pipeline), while this refit uses the raw
# features. Confirm that is intended; the downstream SHAP, ensemble and pickle
# cells all expect a bare LogisticRegression fitted on unscaled inputs.
log_model = LogisticRegression(
    **best_log_model_params,
    class_weight='balanced',
    max_iter=4000,
)

log_model.fit(X_train, y_train)

y_pred = log_model.predict(X_test)
y_train_pred = log_model.predict(X_train)




#### Visualization

##### SHAP Values

In [None]:
# Explain the logistic regression using the training data as background and
# compute SHAP values for the held-out rows.
explainer = shap.LinearExplainer(log_model, X_train, feature_names=X_train.columns)
shap_values = explainer(X_test)

# Force plot for the first test row. A force plot called mid-cell returns a
# JavaScript visualiser that Jupyter never displays (only a cell's last
# expression is rendered), so draw it with matplotlib instead.
shap.force_plot(
    explainer.expected_value,
    shap_values[0].values,
    X_test.iloc[0],
    feature_names=X_test.columns,
    matplotlib=True,
)

# Summary plot of the SHAP values over the whole test split.
shap.summary_plot(shap_values, X_test)

##### Model Performance

In [None]:
# Report the logistic regression's scores on the SMOTE-resampled training data
# first and on the held-out test split second.
# print_model_score is imported from the project's utils module; its exact
# output format is not visible from this notebook.
print_model_score(y_train, y_train_pred, train=True)
print_model_score(y_test, y_pred, train=False)

In [None]:
# Persist the fitted logistic regression for reuse outside this notebook.
with open('../models/logistic_regression_model.pkl', 'wb') as model_file:
    pickle.dump(log_model, model_file)

## XGB Classifier

#### Split Data

In [None]:
# Re-create the stratified 70/30 split from the original (un-resampled) X and y
# so the oversampling applied in the previous section does not carry over.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    stratify=y,
    random_state=2022,
)

# SMOTE on the training portion only: minority class raised to 60% of the
# majority class size.
sampling_strategy = 0.6
smote = SMOTE(random_state=2022, sampling_strategy=sampling_strategy)
X_train, y_train = smote.fit_resample(X_train, y_train)


#### Train Model

In [None]:


# Candidate hyperparameters sampled by the randomized search.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'colsample_bytree': [0.3, 0.7],
    'subsample': [0.8, 1.0]
}

xgb_model = XGBClassifier()

# random_state pins which 10 of the 108 combinations are sampled, so a
# Restart & Run All picks the same candidates every time (the other searches,
# splits and SMOTE calls in this notebook are seeded with 2022 as well).
xgb_random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=10,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=2022,
)

xgb_random_search.fit(X_train, y_train)

# Copy before adding tree_method so the search object's own best_params_
# dictionary keeps reporting exactly what was searched.
best_params = dict(xgb_random_search.best_params_)
best_params['tree_method'] = 'hist'
print(f"Best Parameters: {best_params}")

# Refit a fresh classifier on the full resampled training set with the
# selected hyperparameters.
xgb_model = XGBClassifier(**best_params)
xgb_model.fit(X_train, y_train)

y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)


#### Visualization

##### SHAP Values

In [None]:

# Build a SHAP explainer for the fitted XGBoost model and explain the test rows.
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_test)

# Summary plot of the SHAP values over the whole test split.
shap.summary_plot(shap_values, X_test)

# Force plot for the first test row. The default JavaScript rendering is never
# displayed here because the call is not the cell's final expression, so render
# it with matplotlib instead.
shap.force_plot(
    explainer.expected_value,
    shap_values[0].values,
    X_test.iloc[0],
    matplotlib=True,
)

##### Model Performance

In [None]:
# Report the XGBoost model's scores on the SMOTE-resampled training data first
# and on the held-out test split second.
# print_model_score is imported from the project's utils module; its exact
# output format is not visible from this notebook.
print_model_score(y_train, y_train_pred, train=True)
print_model_score(y_test, y_test_pred, train=False)

In [None]:
# Persist the fitted XGBoost classifier for reuse outside this notebook.
with open('../models/xgboost_model.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

## Random Forest

#### Split Data

In [None]:
# Stratified hold-out split for the random forest, followed by SMOTE on the
# training portion only (minority class raised to 60% of the majority class).
# NOTE(review): this split holds out 20%, whereas the logistic-regression and
# XGBoost sections hold out 30%. The later comparison cells score every model
# on this X_test/y_test, so confirm the difference is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=2022,
)
sampling_strategy = 0.6
smote = SMOTE(random_state=2022, sampling_strategy=sampling_strategy)
X_train, y_train = smote.fit_resample(X_train, y_train)


#### Train Model

In [None]:

# Candidate hyperparameters sampled by the randomized search
# (3 x 4 x 3 x 4 = 144 combinations, of which n_iter=60 are tried).
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [10, 20, 50, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
}

# Seed the forest itself: bootstrap sampling and feature sub-sampling are
# random, so without random_state the CV scores, and therefore the selected
# hyperparameters, change between runs even though the search is seeded.
rf_model = RandomForestClassifier(n_estimators=100, random_state=2022)

rf_random_search = RandomizedSearchCV(
    rf_model,
    param_distributions=param_grid,
    cv=StratifiedKFold(10),
    n_iter=60,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
    random_state=2022,
)

rf_random_search.fit(X_train, y_train)

best_params = rf_random_search.best_params_

print(f"Best Parameters: {best_params}")

# Refit a fresh, seeded forest on the full resampled training set with the
# selected hyperparameters, so the reported scores and the pickled model are
# reproducible.
rf_model = RandomForestClassifier(**best_params, random_state=2022)

rf_model.fit(X_train, y_train)

y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

#### Visualization

##### Model Performance

In [None]:
# Report the random forest's scores on the SMOTE-resampled training data first
# and on the held-out test split second.
# print_model_score is imported from the project's utils module; its exact
# output format is not visible from this notebook.
print_model_score(y_train, y_train_pred, train=True)
print_model_score(y_test, y_test_pred, train=False)

In [None]:
# Persist the fitted random forest for reuse outside this notebook.
with open('../models/random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

## Ensemble

#### Train Model

In [None]:


# Reuse the three tuned classifiers as members of a soft-voting ensemble.
# VotingClassifier.fit trains clones of these estimators on the data it is
# given, so the members are re-fitted on the current X_train/y_train (the split
# created in the Random Forest section) rather than reused as already fitted.
best_log_reg, best_xgb, best_rf = log_model, xgb_model, rf_model

voting_members = [
    ('log_reg', best_log_reg),
    ('xgb', best_xgb),
    ('rf', best_rf),
]

# 'soft' voting averages the members' predicted class probabilities.
ensemble_model = VotingClassifier(estimators=voting_members, voting='soft')
ensemble_model.fit(X_train, y_train)

y_train_pred = ensemble_model.predict(X_train)
y_test_pred = ensemble_model.predict(X_test)




#### Visualization

##### Model Performance

In [None]:
# Report the ensemble's scores on the SMOTE-resampled training data first and
# on the held-out test split second.
# print_model_score is imported from the project's utils module; its exact
# output format is not visible from this notebook.
print_model_score(y_train, y_train_pred, train=True)
print_model_score(y_test, y_test_pred, train=False)

In [None]:
# Persist the fitted voting ensemble for reuse outside this notebook.
with open('../models/ensemble_model.pkl', 'wb') as model_file:
    pickle.dump(ensemble_model, model_file)

## All Model Performance

#### ROC AUC of Models

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt

# Predict positive-class probabilities for the held-out test set. Scoring on
# the full X/y would mix in the rows the models were trained on and inflate
# every curve.
# NOTE(review): X_test/y_test here come from the Random Forest section's split
# (20% hold-out), while the logistic-regression and XGBoost models were fitted
# on an earlier 70/30 split. Confirm none of these test rows were part of
# their training data.
y_pred_proba_logistic = log_model.predict_proba(X_test)[:, 1]
y_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]
y_pred_proba_ensemble = ensemble_model.predict_proba(X_test)[:, 1]

# Calculate ROC curves
fpr_logistic, tpr_logistic, _ = roc_curve(y_test, y_pred_proba_logistic)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_proba_xgb)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)
fpr_ensemble, tpr_ensemble, _ = roc_curve(y_test, y_pred_proba_ensemble)

# Calculate AUC
roc_auc_logistic = auc(fpr_logistic, tpr_logistic)
roc_auc_xgb = auc(fpr_xgb, tpr_xgb)
roc_auc_rf = auc(fpr_rf, tpr_rf)
roc_auc_ensemble = auc(fpr_ensemble, tpr_ensemble)

# Plot ROC curve; each legend entry reports that model's own AUC.
plt.figure()
plt.plot(fpr_logistic, tpr_logistic, color='red', lw=2, label=f'Logistic Regression (AUC = {roc_auc_logistic:.2f})')
plt.plot(fpr_xgb, tpr_xgb, color='yellow', lw=2, label=f'XGBoost (AUC = {roc_auc_xgb:.2f})')
plt.plot(fpr_rf, tpr_rf, color='blue', lw=2, label=f'Random Forest (AUC = {roc_auc_rf:.2f})')
plt.plot(fpr_ensemble, tpr_ensemble, color='orange', lw=2, label=f'Ensemble (AUC = {roc_auc_ensemble:.2f})')
# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
# Save before show(): the figure may be cleared once it has been displayed.
plt.savefig("../reports/figures/roc.png")
plt.show()




#### F1-score, accuracy, recall, precision

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def _metrics_row(model_name, y_true, y_hat):
    """Return one comparison-table row of percentage scores for a model.

    Parameters
    ----------
    model_name : str
        Label stored under the "Model" key.
    y_true : array-like
        Ground-truth labels.
    y_hat : array-like
        Predicted labels for the same rows as ``y_true``.

    Returns
    -------
    dict
        Keys "Model", "Accuracy", "Precision", "Recall" and "F1-score"; the
        four scores are multiplied by 100.
    """
    return {
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_hat) * 100,
        "Precision": precision_score(y_true, y_hat) * 100,
        "Recall": recall_score(y_true, y_hat) * 100,
        "F1-score": f1_score(y_true, y_hat) * 100,
    }


# Score every model on the same held-out split so the bars are comparable.
# The random forest used to be scored on the full X/y (training rows included)
# while the other three models were scored on X_test/y_test.

# RandomForest
rf_y_test_pred = rf_model.predict(X_test)
rf_metrics = _metrics_row("RandomForest", y_test, rf_y_test_pred)

# XGBoost
xgb_y_test_pred = xgb_model.predict(X_test)
xgb_metrics = _metrics_row("XGBoost", y_test, xgb_y_test_pred)

# Logistic Regression
log_y_test_pred = log_model.predict(X_test)
log_metrics = _metrics_row("Logistic Regression", y_test, log_y_test_pred)

# Ensemble
ensemble_y_test_pred = ensemble_model.predict(X_test)
ensemble_metrics = _metrics_row("Ensemble", y_test, ensemble_y_test_pred)

metrics_df = pd.DataFrame([rf_metrics, xgb_metrics, log_metrics, ensemble_metrics])

# Long format (one row per model/metric pair) for seaborn's hue grouping.
metrics_melted = metrics_df.melt(id_vars="Model", var_name="Metric", value_name="Value")

# Create a grouped bar plot
plt.figure(figsize=(12, 8))
sns.barplot(x="Model", y="Value", hue="Metric", data=metrics_melted)
plt.title("Model Performance Comparison")
plt.ylabel("Score %")
plt.xlabel("Model")
plt.legend(loc='upper right')

# tight_layout must run before show(); calling it afterwards acts on a new,
# empty figure, which a second show() would then display.
plt.tight_layout()
plt.show()


In [None]:
# Display the per-model metrics table (values are percentages) built in the previous cell.
metrics_df

#### Precision-Recall curves

In [None]:
# (display name, fitted estimator) pairs, one subplot each.
models = [
    ('RandomForest', rf_model),
    ('XGBoost', xgb_model),
    ('Logistic Regression', log_model),
    ('Ensemble', ensemble_model),
]

# 2x2 grid, flattened so it can be zipped with the model list.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for ax, (name, model) in zip(axes, models):
    # Positive-class probability for each test row.
    y_prob = model.predict_proba(X_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

    # precision and recall each have one more entry than thresholds; drop the
    # final point so the arrays line up with the threshold axis.
    for curve, curve_label in ((precision, "Precision"), (recall, "Recall")):
        ax.plot(thresholds, curve[:-1], label=curve_label)

    ax.set(
        xlabel="Threshold",
        ylabel="Score",
        title=f"{name} Precision and Recall for Different Thresholds",
    )
    ax.legend(loc="best")
    ax.grid(True)

# Adjust layout
plt.tight_layout()
plt.show()

