In [1]:
import os
import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler

# Define function to compute the performance metrics for decision tree
def compute_tree_metrics(X, y, smote=False, model_name='model'):
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler=MinMaxScaler()
    X_train=scaler.fit_transform(X_train)
    X_test=scaler.transform(X_test)

    if smote:
        sm = SMOTE(random_state=27)
        X_train, y_train = sm.fit_resample(X_train, y_train)

    # Train decision tree model
    model = DecisionTreeClassifier(criterion="entropy")
    model.fit(X_train, y_train)
    
    # Save model with best performance
    joblib.dump(model, f'{model_name}.pkl')
    
    # Predict on test set
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Compute metrics
    auc = roc_auc_score(y_test, y_prob)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return auc, recall, precision, f1

def load_data(file_name):
    curr_file = os.path.join(os.getcwd().replace('models', 'data'), file_name)
    return pd.read_csv(curr_file, delimiter=",")

# Load data
data = load_data('data.csv')

# Extract features and labels for each dataset
X_data, y_data = data.drop('Outcome', axis=1), data['Outcome']

# Compute metrics for each dataset with decision tree
metrics_data_tree = compute_tree_metrics(X_data, y_data, model_name='decision_tree')
metrics_data_smote_tree = compute_tree_metrics(X_data, y_data, smote=True, model_name='decision_tree_smote')

# Combine metrics into a single table for decision tree
metrics_table_tree = pd.DataFrame({
    'Dataset': ['data', 'data_smote'],
    'AUC': [metrics_data_tree[0],  metrics_data_smote_tree[0]],
    'Recall': [metrics_data_tree[1],  metrics_data_smote_tree[1]],
    'Precision': [metrics_data_tree[2],  metrics_data_smote_tree[2]],
    'F1': [metrics_data_tree[3],  metrics_data_smote_tree[3]]
})

In [2]:
# Vizuallize metrics table
display(metrics_table_tree)

Unnamed: 0,Dataset,AUC,Recall,Precision,F1
0,data,0.708698,0.651163,0.608696,0.629213
1,data_smote,0.738448,0.697674,0.638298,0.666667
