In [14]:
import os
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, roc_auc_score, classification_report, cohen_kappa_score, matthews_corrcoef, precision_recall_curve, average_precision_score, log_loss, brier_score_loss

In [4]:
# Record and display the directory the kernel is currently running in.
# Note: `working_dir` is reassigned in a later cell, so this value is only
# used for the printout below.
working_dir = os.getcwd()
print("Current Working Directory:", working_dir)

Current Working Directory: c:\Users\User.ACIES35\Desktop\project\src


In [6]:
# Folder that contains the kernel's working directory (i.e. the project
# folder when the notebook is run from `project/src`).
# Derived from the current working directory instead of a hard-coded,
# user-specific absolute path so the notebook also runs on other machines;
# when started from `src` this yields the same value as before.
working_dir = os.path.dirname(os.getcwd())

In [7]:
# Base directory one level above `working_dir`. read_data() looks for input
# files under `<parent_dir>/data` and train_model() writes pickles under
# `<parent_dir>/trained_model`.
# NOTE(review): the recorded output of the next cell shows this resolving to
# the Desktop folder rather than the project folder - confirm that `data/` and
# `trained_model/` are really meant to live there.
parent_dir = os.path.dirname(working_dir)

In [9]:
# Sanity check: show the base directory that the path-building functions below rely on.
print(parent_dir)

c:\Users\User.ACIES35\Desktop


In [10]:
# Step 1: Read the data
def read_data(file_name):
    """Load a data set from ``<parent_dir>/data`` into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the file inside the ``data`` folder. The extension selects the
        reader and is matched case-insensitively: ``.csv`` uses
        ``pd.read_csv``; ``.xlsx`` / ``.xls`` use ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.

    Raises
    ------
    ValueError
        If the extension is not one of the supported formats. (Previously the
        function fell through and returned ``None``, which only surfaced as a
        confusing error further down the pipeline.)
    """
    file_path = os.path.join(parent_dir, "data", file_name)
    extension = os.path.splitext(file_name)[1].lower()

    if extension == '.csv':
        return pd.read_csv(file_path)
    if extension in ('.xlsx', '.xls'):
        return pd.read_excel(file_path)

    raise ValueError(
        f"Unsupported file type {extension!r} for {file_name!r}; "
        "expected .csv, .xlsx or .xls"
    )

In [11]:
# Step 2: Preprocess the data
def preprocess_data(df, target_column, scaler_type):
    """Split ``df`` into train/test sets and impute, scale and encode the features.

    Numerical feature columns are mean-imputed and then scaled. ``object`` /
    ``category`` feature columns are mode-imputed and one-hot encoded. Columns
    of any other dtype are passed through untouched. Every transformer is
    fitted on the training split only and then applied to the test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set: feature columns plus the target column.
    target_column : str
        Name of the column in ``df`` that holds the target.
    scaler_type : str
        ``'standard'`` for ``StandardScaler`` or ``'minmax'`` for
        ``MinMaxScaler``. Only consulted when at least one numerical feature
        column exists.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The feature frames keep the
        row index produced by ``train_test_split`` so they stay aligned with
        ``y_train`` / ``y_test``.

    Raises
    ------
    ValueError
        If numerical columns are present and ``scaler_type`` is not one of
        the supported values.
    """
    # Split features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    numerical_cols = X.select_dtypes(include=['number']).columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns

    # Validate the scaler choice up front so a typo fails with a clear message
    # instead of an unbound-variable error after the data has been split.
    scaler = None
    if len(numerical_cols) > 0:
        if scaler_type == 'standard':
            scaler = StandardScaler()
        elif scaler_type == 'minmax':
            scaler = MinMaxScaler()
        else:
            raise ValueError(
                f"Unsupported scaler_type {scaler_type!r}; expected 'standard' or 'minmax'"
            )

    # Always split, regardless of which column kinds are present; previously
    # the split only happened inside the numerical branch.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Work on copies so the column assignments below never write into a view
    # of the caller's frame.
    X_train = X_train.copy()
    X_test = X_test.copy()

    if len(numerical_cols) > 0:
        # Impute missing values for numerical columns (mean imputation)
        num_imputer = SimpleImputer(strategy='mean')
        X_train[numerical_cols] = num_imputer.fit_transform(X_train[numerical_cols])
        X_test[numerical_cols] = num_imputer.transform(X_test[numerical_cols])

        # Scale the numerical features with the scaler selected above
        X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
        X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    if len(categorical_cols) > 0:
        # Impute missing values for categorical columns (mode imputation)
        cat_imputer = SimpleImputer(strategy='most_frequent')
        X_train[categorical_cols] = cat_imputer.fit_transform(X_train[categorical_cols])
        X_test[categorical_cols] = cat_imputer.transform(X_test[categorical_cols])

        # One-hot encode categorical features. Categories that only occur in
        # the test split are encoded as all zeros instead of raising.
        encoder = OneHotEncoder(handle_unknown='ignore')
        train_encoded = encoder.fit_transform(X_train[categorical_cols])
        test_encoded = encoder.transform(X_test[categorical_cols])

        # get_feature_names was removed in scikit-learn 1.2; prefer its
        # replacement and fall back for older installations.
        if hasattr(encoder, "get_feature_names_out"):
            encoded_cols = encoder.get_feature_names_out(categorical_cols)
        else:
            encoded_cols = encoder.get_feature_names(categorical_cols)

        # Re-use the split's row index: with a default RangeIndex the concat
        # below would align on mismatched labels and fill the frame with NaNs.
        X_train_encoded = pd.DataFrame(
            train_encoded.toarray(), columns=encoded_cols, index=X_train.index
        )
        X_test_encoded = pd.DataFrame(
            test_encoded.toarray(), columns=encoded_cols, index=X_test.index
        )
        X_train = pd.concat([X_train.drop(columns=categorical_cols), X_train_encoded], axis=1)
        X_test = pd.concat([X_test.drop(columns=categorical_cols), X_test_encoded], axis=1)

    return X_train, X_test, y_train, y_test

In [12]:
# Step 3: Train the model
def train_model(X_train, y_train, model, model_name):
    """Fit ``model`` and persist it as ``<parent_dir>/trained_model/<model_name>.pkl``.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed straight to ``model.fit``.
    model
        Object exposing a ``fit(X, y)`` method; it is fitted in place.
    model_name : str
        File stem used for the pickle file.

    Returns
    -------
    The same ``model`` object after fitting.
    """
    # training the selected model
    model.fit(X_train, y_train)

    # saving the trained model; create the output folder on first use so a
    # fresh checkout does not fail with FileNotFoundError after training.
    model_dir = os.path.join(parent_dir, "trained_model")
    os.makedirs(model_dir, exist_ok=True)
    with open(os.path.join(model_dir, f"{model_name}.pkl"), 'wb') as file:
        pickle.dump(model, file)
    return model

In [15]:
# Step 4: Evaluate the model
def evaluate_model(model, X_test, y_test):
    """Compute evaluation metrics for a fitted binary classifier.

    Label-based metrics (accuracy, confusion matrix, Cohen's kappa, MCC,
    classification report) use the hard predictions from ``model.predict``.
    Probability-based metrics (ROC AUC, log loss, Brier score, precision-recall
    curve, average precision) use the positive-class probability from
    ``model.predict_proba``; scoring them on hard 0/1 predictions, as was done
    before, discards the ranking information these metrics are defined on.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict`` and ``predict_proba`` (for
        ``SVC`` this requires ``probability=True``).
    X_test
        Test features.
    y_test
        True test labels. Column 1 of ``predict_proba`` is treated as the
        positive class, so a binary target is assumed.

    Returns
    -------
    dict
        Keys: ``accuracy``, ``confusion_matrix``, ``roc_auc``, ``cohen_kappa``,
        ``matthews_corrcoef``, ``classification_report``, ``log_loss``,
        ``brier_score_loss``, ``average_precision`` and
        ``precision_recall_curve`` (a ``(precision, recall)`` tuple). Scalar
        metrics are rounded to two decimals.
    """
    # Probability of the positive class, kept 1-D as the scorers expect.
    y_score = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    # Metrics defined on predicted labels
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    # Metrics defined on predicted probabilities / scores
    roc_auc = roc_auc_score(y_test, y_score)
    log_loss_value = log_loss(y_test, y_score)
    brier_score = brier_score_loss(y_test, y_score)
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    average_precision = average_precision_score(y_test, y_score)

    metrics = {
        "accuracy": round(accuracy, 2),
        "confusion_matrix": conf_matrix,
        "roc_auc": round(roc_auc, 2),
        "cohen_kappa": round(kappa, 2),
        "matthews_corrcoef": round(mcc, 2),
        "classification_report": class_report,
        "log_loss": round(log_loss_value, 2),
        "brier_score_loss": round(brier_score, 2),
        "average_precision": round(average_precision, 2),
        "precision_recall_curve": (precision, recall)
    }

    return metrics