In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline
import joblib


In [2]:
# Utility function to load datasets
def load_data(train_input_path, train_target_path, test_input_path, test_target_path):
    """Read the four dataset CSV files into DataFrames.

    Args:
        train_input_path: Path to the CSV holding the training inputs.
        train_target_path: Path to the CSV holding the training targets.
        test_input_path: Path to the CSV holding the test inputs.
        test_target_path: Path to the CSV holding the test targets.

    Returns:
        A 4-tuple ``(X_train, Y_train, X_test, Y_test)`` of DataFrames, one
        per path, in the same order as the arguments.
    """
    # Files are read in argument order, so the first unreadable path is the
    # one that surfaces in the raised exception.
    csv_paths = (train_input_path, train_target_path, test_input_path, test_target_path)
    return tuple(pd.read_csv(csv_path) for csv_path in csv_paths)

In [3]:
# Utility function to drop unnecessary columns
def drop_columns(df, columns):
    """Return a copy of ``df`` without the given columns.

    Args:
        df: Source DataFrame; it is not modified.
        columns: A single column label or a list-like of labels to remove.

    Returns:
        A new DataFrame containing every column of ``df`` except ``columns``.

    Raises:
        KeyError: If any requested label is not a column of ``df``.
    """
    # `columns=` already selects the column axis; passing `axis=1` as well is
    # redundant (pandas ignores `axis` when `columns` is supplied).
    return df.drop(columns=columns)

In [4]:
# Utility function to impute missing values
def impute_missing_values(df, strategy='median'):
    """Fill missing values in ``df`` using a freshly fitted ``SimpleImputer``.

    The imputer is fitted on ``df`` itself on every call, so the fill
    statistics always come from the frame that is passed in.

    Args:
        df: DataFrame whose missing values should be filled.
        strategy: Strategy name forwarded to ``SimpleImputer`` (default
            ``'median'``).

    Returns:
        A new DataFrame with the imputed values, carrying the same column
        labels and the same row index as ``df``.
    """
    imputer = SimpleImputer(strategy=strategy)
    imputed_values = imputer.fit_transform(df)
    # Keep the caller's row labels. Without `index=` the result silently gets
    # a fresh 0..n-1 index, which misaligns it with targets or sibling frames
    # that still carry the original labels (e.g. after filtering rows).
    # NOTE(review): SimpleImputer can drop columns that contain no observed
    # values, in which case the column count would not match `df.columns` —
    # confirm for the installed scikit-learn version.
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

In [5]:
# Utility function to standardize features
def scale_features(df):
    """Standardize every column of ``df`` with a freshly fitted ``StandardScaler``.

    Args:
        df: DataFrame of features to scale.

    Returns:
        A tuple ``(scaled_df, scaler)`` where ``scaled_df`` is a new DataFrame
        with the same column labels and row index as ``df``, and ``scaler`` is
        the fitted ``StandardScaler`` so the same transform can be reused on
        other data.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df)
    # Keep the caller's row labels. Without `index=` the result silently gets
    # a fresh 0..n-1 index, which misaligns it with targets or sibling frames
    # that still carry the original labels.
    scaled_df = pd.DataFrame(scaled_values, columns=df.columns, index=df.index)
    return scaled_df, scaler

In [6]:
# Utility function to plot correlation matrix
def plot_correlation_matrix(df):
    """Show a heatmap of the pairwise correlations between numeric columns.

    Args:
        df: DataFrame to analyse. Columns that are neither numeric nor
            boolean are left out of the correlation matrix.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Restrict to numeric/bool columns before correlating: recent pandas
    # versions no longer skip non-numeric columns in `corr()` and raise on
    # them instead, which would break this helper on mixed-type frames.
    numeric_df = df.select_dtypes(include=["number", "bool"])
    correlation_matrix = numeric_df.corr()

    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm')
    plt.title("Correlation Matrix")
    plt.show()

In [7]:
# Utility function to visualize the distribution of target variable
def plot_target_distribution(target):
    """Show a count plot of the values in ``target``.

    Args:
        target: Values to count; passed to ``sns.countplot`` as ``x``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # countplot draws on the current axes and hands them back, so the title
    # can be set directly on the returned object.
    count_axes = sns.countplot(x=target)
    count_axes.set_title("Distribution of Target Variable")
    plt.show()

In [8]:
# Utility function to plot histograms of all numeric features
def plot_histograms(df):
    """Show one histogram per column that ``DataFrame.hist`` can plot.

    Subplots are arranged four per row; the number of rows is derived from
    how many columns end up being plotted.

    Args:
        df: DataFrame whose columns should be plotted.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # A fixed 5x4 grid fails as soon as more than 20 columns are plotted.
    # `-1` asks pandas to infer the row count for four subplots per row.
    df.hist(bins=20, figsize=(15, 10), layout=(-1, 4))
    plt.suptitle("Histograms for Numeric Features")
    plt.show()

In [9]:
# Utility function to calculate VIF
def calculate_vif(df):
    """Compute the variance inflation factor of every column in ``df``.

    The values of ``df`` are passed to statsmodels exactly as given; no
    intercept column is added here.

    Args:
        df: DataFrame of features.

    Returns:
        A DataFrame with a ``feature`` column (the column labels of ``df``)
        and a ``VIF`` column (the factor computed for each column).
    """
    # Imported here so statsmodels is only required when VIF is requested.
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    design_matrix = df.values
    vif_scores = [
        variance_inflation_factor(design_matrix, column_position)
        for column_position in range(df.shape[1])
    ]
    return pd.DataFrame({"feature": df.columns, "VIF": vif_scores})

In [10]:
# Utility function to visualize feature importance
def plot_feature_importance(model, feature_names):
    """Show a horizontal bar chart of a model's feature importances.

    Args:
        model: Fitted estimator exposing a ``feature_importances_`` attribute.
        feature_names: Labels for the features, in the same order as the
            entries of ``model.feature_importances_``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Largest importance first so the most influential feature is the top bar.
    ranked_features = pd.DataFrame(
        {'Feature': feature_names, 'Importance': model.feature_importances_}
    ).sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(12, 8))
    bar_axes = sns.barplot(data=ranked_features, x='Importance', y='Feature')
    bar_axes.set_title('Feature Importance')
    plt.show()

In [11]:
# Utility function for ROC Curve visualization
def plot_roc_curve(y_true, y_proba):
    """Show the ROC curve for a set of predicted scores.

    Args:
        y_true: Ground-truth labels, forwarded to ``roc_curve``.
        y_proba: Predicted scores or probabilities, forwarded to ``roc_curve``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    false_positive_rate, true_positive_rate, _thresholds = roc_curve(y_true, y_proba)

    _figure, roc_axes = plt.subplots(figsize=(8, 6))
    roc_axes.plot(false_positive_rate, true_positive_rate, label="ROC Curve")
    roc_axes.set_xlabel("False Positive Rate")
    roc_axes.set_ylabel("True Positive Rate")
    roc_axes.set_title("ROC Curve")
    roc_axes.legend()
    plt.show()

In [12]:
# Utility function to evaluate model performance
def evaluate_model(y_true, y_pred, y_proba=None):
    """Print a summary of classification metrics.

    Accuracy, the confusion matrix and the classification report are always
    printed. The ROC-AUC score is printed as well when ``y_proba`` is given.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_proba: Optional predicted scores or probabilities used for the
            ROC-AUC score.

    Returns:
        None. All results are written to standard output.
    """
    # Compute every label-based metric before printing, so a failure in any
    # of them leaves no partial report behind.
    accuracy_value = accuracy_score(y_true, y_pred)
    confusion = confusion_matrix(y_true, y_pred)
    report_text = classification_report(y_true, y_pred)

    print(
        f"Accuracy: {accuracy_value:.2f}",
        "\nConfusion Matrix:",
        confusion,
        "\nClassification Report:",
        report_text,
        sep="\n",
    )

    if y_proba is None:
        return

    auc_value = roc_auc_score(y_true, y_proba)
    print(f"\nROC-AUC Score: {auc_value:.2f}")

In [13]:
# Utility function to save model
def save_model(model, filename):
    """Persist ``model`` to ``filename`` with joblib and report the location.

    Args:
        model: Object to serialize.
        filename: Destination passed to ``joblib.dump``.

    Returns:
        None. A confirmation line is printed after the dump completes.
    """
    joblib.dump(model, filename)
    print("Model saved as {}".format(filename))

In [14]:

# Utility function to load model
def load_model(filename):
    """Load a joblib-persisted object from ``filename`` and report the source.

    Args:
        filename: Location passed to ``joblib.load``.

    Returns:
        The object stored in the file.
    """
    # NOTE(review): joblib persistence is pickle-based, so this should only be
    # pointed at files from a trusted source — confirm where callers get
    # `filename` from.
    restored_model = joblib.load(filename)
    print("Model loaded from {}".format(filename))
    return restored_model