In [88]:
import pandas as pd
import os
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
#import random forest classifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from src.logger import logging
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [89]:
X_train = pd.read_csv("artifacts/transformed_data/train_features.csv")
y_train = pd.read_csv("artifacts/transformed_data/train_labels.csv")
X_test = pd.read_csv("artifacts/transformed_data/test_features.csv")
y_test = pd.read_csv("artifacts/transformed_data/test_labels.csv")

In [91]:
log_reg = LogisticRegression()
rf = RandomForestClassifier()
xgb = XGBClassifier()
dt = DecisionTreeClassifier()
grad_boost = GradientBoostingClassifier()
ada_boost = AdaBoostClassifier()
et = ExtraTreeClassifier()

models = {
    "Random Forest": rf,
    "XGBoost": xgb,
    "Decision Tree": dt,
    "Gradient Boosting": grad_boost,
    "Extra Tree": et
}

In [92]:
#unpack the dict
for model_name, model in models.items():
    print(model_name)

Random Forest
XGBoost
Decision Tree
Gradient Boosting
Extra Tree


In [118]:
def evaluate_models(models:dict, X_train:np.ndarray, y_train:np.ndarray, X_test:np.ndarray, y_test:np.ndarray) -> pd.DataFrame:
    """Evaluates a number of models using the same training and testing datasets.
    Args:
        models (dict): A dictionary of models to evaluate
        X_train (np.ndarray): Training features
        y_train (np.ndarray): Training labels
        X_test (np.ndarray): Testing features
        y_test (np.ndarray): Testing labels
    
        models = {"model_name" : model}
    Returns:
        pd.DataFrame: A dataframe of model names and their respective scores
    """
    accuracies = precisions = recalls = f1s = roc_aucs = np.zeros(len(models)) 

    for model_idx, (model_name, model) in enumerate(models.items()):
        logging.info(f"Evaluating {model_name}")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracies[model_idx] = accuracy_score(y_test, y_pred)
        precisions[model_idx] = precision_score(y_test, y_pred, average="weighted")
        recalls[model_idx] = recall_score(y_test, y_pred, average="weighted")
        f1s[model_idx] = f1_score(y_test, y_pred, average="weighted")
        logging.info(f"Score for {model_name} is {model.score(X_test, y_test)}")
        
    return pd.DataFrame({"Model": models.keys(), 
                         "Accuracy": accuracies, 
                         "Precision": precisions, 
                         "Recall": recalls, "F1": f1s})

In [119]:
results = evaluate_models(models=models, X_train=X_train, y_train=np.array(y_train).ravel(), 
                X_test=X_test, y_test=np.array(y_test).ravel())
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Random Forest,0.860052,0.860052,0.860052,0.860052
1,XGBoost,0.836321,0.836321,0.836321,0.836321
2,Decision Tree,0.775618,0.775618,0.775618,0.775618
3,Gradient Boosting,0.765564,0.765564,0.765564,0.765564
4,Extra Tree,0.716918,0.716918,0.716918,0.716918


0