# MLOps Implementation and Model Evaluation
## Credit Risk Classification — Analysis of the South German Credit Dataset

### Table of Contents
1. [Libraries](#libraries)
2. [Dataset Preview](#dataset)
3. [Data Analysis & Cleaning](#data)
4. [EDA](#eda)
5. [Feature Engineering and Preprocessing Pipelines](#feature)
6. [Machine Learning Models](#ml)
7. [Logistic Regression Model](#lr)
8. [k-Nearest Neighbors Model](#knn)
9. [Decision Tree Model](#dt)
10. [Random Forest](#rf)
11. [XGBoost Model](#xgb)
12. [MLP Model](#mlp)
13. [SVC Model](#svc)
14. [Models Comparison Summary](#result)



#Libraries

In [None]:
!pip install category_encoders

# --- Google Colab específicos ---
from google.colab import userdata
from google.colab import drive

# --- Manejo de datos ---
import pandas as pd
import numpy as np

# --- Visualización ---
import matplotlib.pyplot as plt
import seaborn as sns

# --- Preprocesamiento y transformación ---
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# --- Modelos clásicos ---
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# --- Métricas y evaluación ---
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    make_scorer
)

# --- Balanceo de clases ---
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.metrics import geometric_mean_score

# --- Codificación categórica ---
import category_encoders as ce

# --- XGBoost ---
from xgboost import XGBClassifier


#Dataset Preview

In [None]:
import os

# Mount Google Drive (forcing a remount) so the project folder is reachable.
drive.mount('/content/drive', force_remount=True)

# The folder id is read from the Colab user secrets instead of being hard-coded.
FOLDER_ID = userdata.get('FOLDER_ID')
TARGET = "/content/drive/.shortcut-targets-by-id/" + str(FOLDER_ID)

# Every relative path used afterwards is resolved from this directory.
os.chdir(TARGET)

In [None]:
# Load the raw CSV and preview it transposed (one row per column) so that
# every field is visible without horizontal scrolling.
DATA_PATH = 'trabajo_grupal_mlops/data/german_credit_modified.csv'
df = pd.read_csv(DATA_PATH)
df.transpose()

In [None]:
# Replace the original headers with English column names (positional mapping:
# the list must contain exactly one name per column of the DataFrame).
col_ing = [
    'status', 'duration', 'credit_history', 'purpose', 'amount',
    'savings', 'employment_duration', 'installment_rate', 'personal_status_sex',
    'other_debtors', 'present_residence', 'property', 'age', 'other_installment_plans',
    'housing', 'number_credits', 'job', 'people_liable', 'telephone', 'foreign_worker',
    'credit_risk', 'mixed',
]

df = df.set_axis(col_ing, axis="columns")

# Preview the first ten records, transposed for readability.
df.head(10).transpose()


#Data Analysis & Cleaning


In [None]:
# Remove the "mixed" column: it only contains junk values.
df = df.drop(columns='mixed')

In [None]:
#Show general information of the dataset
df.info()

In [None]:
# Classify the input columns by measurement level and align their dtypes:
# numeric columns are parsed as numbers (unparseable entries become NaN),
# while ordinal and nominal/binary columns are kept as generic objects.

# Numeric variables
num_col = ['duration', 'amount', 'age']

# Ordinal variables
ord_col = ['employment_duration', 'installment_rate', 'present_residence',
           'property', 'number_credits', 'job']

# Nominal & binary variables
cat_col = ['status', 'credit_history', 'purpose', 'savings', 'people_liable',
           'personal_status_sex', 'other_debtors', 'other_installment_plans',
           'housing', 'telephone', 'foreign_worker']

print("Para los datos de entrada, veamos la cantidad de cada tipo de variable obtenida:")
for label, group in (
    ("Variables numéricas:", num_col),
    ("Variables ordinales:", ord_col),
    ("Variables nominales & binarias:", cat_col),
):
    print(label, len(group))

df[num_col] = df[num_col].apply(pd.to_numeric, errors="coerce")

# Ordinal and nominal columns receive the same treatment, so cast them together.
non_numeric_cols = ord_col + cat_col
df[non_numeric_cols] = df[non_numeric_cols].astype("object")

In [None]:
#Show general information of the dataset
#This helps verify that the previous corrections were successfully applied
df.info()

In [None]:
# Display the count of distinct values for each column (sorted from highest to lowest)
df.nunique().sort_values(ascending=False)

In [None]:
# Generate descriptive statistics for all numeric columns
df.select_dtypes(include="number").describe().T

In [None]:
# Print, for every numeric column, how often each value occurs (NaN included),
# ordered by value, to spot inconsistencies or data-entry errors.
numeric_columns = df.select_dtypes(include="number").columns
separator = "-" * 50
for name in numeric_columns:
    frequencies = df[name].value_counts(dropna=False).sort_index()
    print(f"== {name} (by value ascending) ==")
    print(frequencies)
    print(separator)


In [None]:
# Values above these upper bounds were flagged as errors in the previous
# analysis; they are replaced by missing values (nullable Float64 columns).
upper_limits = {"duration": 72, "age": 75, "amount": 25000}
for name, limit in upper_limits.items():
    df[name] = df[name].astype("Float64").mask(df[name] > limit, other=pd.NA)

In [None]:
# Re-print the value frequencies of every numeric column (NaN included, ordered
# by value) to confirm that the out-of-range corrections were applied.
separator = "-" * 50
for name in df.select_dtypes(include="number").columns:
    frequencies = df[name].value_counts(dropna=False).sort_index()
    print(f"== {name} (by value ascending) ==")
    print(frequencies)
    print(separator)

In [None]:
# Print the value frequencies (NaN included) of every object-typed column
# to spot inconsistencies or data-entry errors.
object_columns = [name for name in df.columns if df[name].dtype == "object"]
for name in object_columns:
    print(df[name].value_counts(dropna=False))
    print('-' * 50)

In [None]:
# Correct column value errors identified in the previous analysis
obj_cols = df.select_dtypes(include=["object", "string"]).columns

# Remove every whitespace run inside text values so that e.g. " 1 " and "1"
# are counted as the same category.
df[obj_cols] = df[obj_cols].replace(r"\s+", "", regex=True)

# For each cell, look up how many times its value occurs in its own column;
# values seen fewer than 7 times are treated as entry errors and set to NaN.
counts = df[obj_cols].apply(lambda s: s.map(s.value_counts(dropna=False)))
df[obj_cols] = df[obj_cols].mask(counts < 7)

# Keep only rows whose target is a valid binary label (0 or 1).
df['credit_risk'] = pd.to_numeric(df['credit_risk'], errors='coerce')
# .copy() makes the filtered frame independent of the original one, so the
# in-place edits and column assignments performed on `df` afterwards act on a
# real DataFrame instead of a slice (avoids chained-assignment warnings).
df = df[df['credit_risk'].isin([0.0, 1.0])].copy()

In [None]:
# Re-print the value frequencies (NaN included) of every object-typed column
# to confirm that the previous corrections were applied.
object_columns = [name for name in df.columns if df[name].dtype == "object"]
for name in object_columns:
    print(df[name].value_counts(dropna=False))
    print('-' * 50)

#EDA

In [None]:
#Show general information of the dataset
df.info()

In [None]:
# Generate summary statistics for all numeric variables as part of the EDA process
df.select_dtypes(include="number").describe().T

In [None]:
# Generate summary statistics for all non numeric variables as part of the EDA process
df.select_dtypes(exclude=np.number).describe().T

In [None]:
# Check for missing values in each column
df.isna().sum()

In [None]:
# Discard every row whose target ('credit_risk') is missing: the model output
# must not contain NaN values.
df = df.dropna(subset=["credit_risk"])

In [None]:
# Generate and visualize the correlation matrix for numerical and encoded categorical variables
# This helps identify relationships and potential multicollinearity among features
#
# The ordinal/categorical columns are stored with object dtype at this point,
# so they are converted to numbers explicitly instead of relying on an implicit
# cast inside `corr()` (which fails on any non-numeric string, or silently
# drops object columns on older pandas). Unparseable entries become NaN and
# are excluded from the computation.
numeric_view = df.apply(pd.to_numeric, errors="coerce")
corr_matrix = numeric_view.corr()

plt.figure(figsize=(15,12))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap="plasma",
    fmt=".2f",
    cbar=True
)
plt.title("Matriz de correlación (numéricas + categóricas codificadas)")
plt.show()

There is a slight positive correlation between some pairs of variables, for example:

*   Amount & Duration
*   Number of Credits & Credit History
*   Property & Duration
*   Property & Amount
*   Housing & Property
*   Housing & Age
*   Telephone & Job

Some pairs of variables are inversely correlated, for example:

*   Installment rate & Amount
*   Credit risk & duration


In [None]:
# Plot each numeric variable twice: a boxplot (to reveal outliers) on the
# upper row and a histogram with KDE (overall shape and skewness) directly
# underneath it.
rows, cols = 2, 3
fig, axes = plt.subplots(rows, cols, figsize=(25, 8))

for position, feature in enumerate(num_col):
    band, col_box = divmod(position, cols)
    row_box = band * 2
    box_ax = axes[row_box, col_box]
    hist_ax = axes[row_box + 1, col_box]

    # Boxplot.
    sns.boxplot(x=df[feature], ax=box_ax)
    box_ax.set_title(f"Boxplot de {feature}")
    box_ax.set_xlabel("")

    # Histogram.
    sns.histplot(df[feature], kde=True, ax=hist_ax, bins=20)
    hist_ax.set_title(f"Histograma de {feature}")
    hist_ax.set_xlabel("")

In [None]:
# Visualize frequency distributions for all categorical variables
# This helps identify dominant categories, class imbalance, and potential data entry issues
categorical_atts = df.select_dtypes(include=['object']).columns.tolist()

# Size the grid from the number of columns (ceiling division) instead of a
# fixed 6x3 layout, keeping the same height per row as the original figure.
n_cols = 3
n_rows = max(1, -(-len(categorical_atts) // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, n_rows * 20 / 6), squeeze=False)
plt.subplots_adjust(wspace=.5, hspace=.5)
axes = axes.ravel()

for ax, col in zip(axes, categorical_atts):
    # Category codes are converted to numbers once so the bars can be ordered
    # numerically (non-numeric entries become NaN and are left out).
    values = pd.to_numeric(df[col], errors="coerce")
    order = sorted(values.dropna().unique(), reverse=True)  # descending numeric order

    sns.countplot(y=values, ax=ax, order=order)
    ax.set_title(f'Frecuencia de {col}')

    # --- Add counts at the end of each bar ---
    for container in ax.containers:
        ax.bar_label(container, fmt='%d', label_type='edge', padding=3)

    # Leave extra room on the right so the count labels are not clipped.
    xmin, xmax = ax.get_xlim()
    ax.set_xlim(xmin, xmax + 80)

# Hide the grid cells that did not receive a plot.
for unused_ax in axes[len(categorical_atts):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Display the count of each class in the target variable 'credit_risk'
# This helps evaluate class balance before training the model
print(df['credit_risk'].value_counts())

In [None]:
# Fill the remaining gaps instead of dropping rows (the dataset is small):
#   * numeric columns     -> column median (less sensitive to outliers)
#   * object columns      -> column mode (most frequent value)
# NOTE(review): these statistics are computed on the whole dataset, i.e.
# before the train/test split performed later in the notebook.

fill_values = {}

# Median for numeric columns that contain at least one NaN
for name in df.select_dtypes(include=["number"]).columns:
    if df[name].isna().any():
        fill_values[name] = df[name].median()

# Mode for object columns that contain at least one NaN
for name in df.select_dtypes(include=["object"]).columns:
    if df[name].isna().any():
        fill_values[name] = df[name].mode().iloc[0]

for name, value in fill_values.items():
    df[name] = df[name].fillna(value)


In [None]:
# Reset the DataFrame index after cleaning to maintain sequential order
# This ensures consistent indexing after any row removals or modifications
df = df.reset_index(drop=True)

In [None]:
#Show general information of the modified dataset
df.info()

In [None]:
# Convert all DataFrame columns to numeric type (no invalid values expected)
# Then cast the data to int64 for consistent numerical representation
df = df.apply(pd.to_numeric).astype("int64")

# Feature Engineering and Preprocessing Pipelines

In [None]:
# Flip the target labels: rows labelled 1 become 0 and every other row becomes 1,
# so that the labeling matches the scheme expected by the models.
# The class counts are printed afterwards to confirm the change.
df['credit_risk'] = (df['credit_risk'] != 1).astype('int64')
print(df['credit_risk'].value_counts())

In [None]:
# Hold out 30% of the rows for testing, stratifying on 'credit_risk' so both
# partitions keep the class proportions of the full dataset. The partition
# sizes and the share of each class in the training set are then reported.
#
# Note: no separate validation set is created because the dataset is small.
# With more data, a train/validation/test split would be preferable to avoid
# data leakage and to obtain a better estimate of generalization.

y = df['credit_risk']
X = df.drop(columns=['credit_risk'])
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.30, random_state=1, stratify=y
)

print("Dimensiones:")
print("Entrenamiento:", Xtrain.shape, ytrain.shape)
print("Prueba:", Xtest.shape, ytest.shape)

# Fraction of training rows labelled 1 (the positive class).
tmp = ytrain.sum() / len(ytrain)
print("\nPorcentaje clases Positiva:%.2f%%, y Negativa:%.2f%%" % (tmp*100,100*(1-tmp)))

In [None]:
# ============================================
# Feature Transformation Pipelines
# ============================================
# In this section, we define preprocessing pipelines for different variable types.
# Each pipeline handles missing values, scaling, and encoding appropriately for its data type.
# A custom BinaryEncoderWrapper is also implemented to integrate binary encoding within a scikit-learn pipeline.
# This step ensures that all features are properly scaled and encoded before model training.
#
# Note: Since the dataset is relatively small, efficient encoding (like Binary Encoding)
# helps reduce dimensionality compared to One-Hot Encoding, minimizing overfitting risk.
# ============================================

# Crear un Transformer personalizado para Binary Encoding
class BinaryEncoderWrapper(BaseEstimator, TransformerMixin):
    """Adapter exposing ``category_encoders.BinaryEncoder`` as a pipeline step.

    The step receives a plain array (e.g. the output of ``SimpleImputer``),
    rebuilds a DataFrame with the configured column names, and delegates the
    encoding to ``BinaryEncoder``. The encoded result is returned as a NumPy
    array.

    Parameters
    ----------
    cols : list of str
        Names assigned to the incoming columns, in order; all of them are
        binary-encoded.

    Attributes
    ----------
    encoder_ : category_encoders.BinaryEncoder
        Encoder created and fitted by :meth:`fit`.
    feature_names_out_ : list or array of str
        Output column names reported by the fitted encoder.
    """

    def __init__(self, cols):
        # Following the scikit-learn estimator contract, the constructor only
        # stores its arguments; everything learned or derived is created in
        # fit(). This keeps clone()/set_params() consistent and prevents the
        # estimator from looking "fitted" before fit() has been called.
        self.cols = cols

    @property
    def encoder(self):
        """Backward-compatible alias for the fitted encoder (``encoder_``)."""
        return self.encoder_

    def fit(self, X, y=None):
        """Fit a fresh ``BinaryEncoder`` on ``X`` and record its output names."""
        X_df = pd.DataFrame(X, columns=self.cols)  # array -> DataFrame with names
        self.encoder_ = ce.BinaryEncoder(cols=self.cols)
        self.encoder_.fit(X_df)
        self.feature_names_out_ = self.encoder_.get_feature_names_out()
        return self

    def transform(self, X):
        """Binary-encode ``X`` with the fitted encoder and return a NumPy array."""
        X_df = pd.DataFrame(X, columns=self.cols)  # array -> DataFrame with names
        return self.encoder_.transform(X_df).to_numpy()

    def get_feature_names_out(self, input_features=None):
        """Return the output column names recorded during :meth:`fit`."""
        return self.feature_names_out_


# Numeric variables: median imputation, scaling to [0, 1], then a
# Yeo-Johnson power transform.
numericas_pipe_nombres = num_col
numericas_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('power_transform', PowerTransformer(method='yeo-johnson')),
])

# Nominal variables: most-frequent imputation followed by binary encoding
# (one-hot encoding was the alternative considered for this step).
nominales_pipe_nombres = cat_col
nominales_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('binary_encoder', BinaryEncoderWrapper(cols=cat_col)),
])

# Ordinal variables: most-frequent imputation followed by ordinal encoding.
ordinales_pipe_nombres = ord_col
ordinales_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder()),
])

# Route every group of columns to its own pipeline.
columnasTransformer = ColumnTransformer(transformers=[
    ('num', numericas_pipe, numericas_pipe_nombres),
    ('nom', nominales_pipe, nominales_pipe_nombres),
    ('cat', ordinales_pipe, ordinales_pipe_nombres),
])

# Sanity check on the training partition: compare the shape before and after
# the transformations.
Xtmp = Xtrain.copy()
tmp = columnasTransformer.fit_transform(Xtmp)
print("Dimensión de los datos de entrada:")
print("antes de aplicar las transformaciones:", Xtmp.shape)
print("después de aplicar las transformaciones:", tmp.shape)

In [None]:
# Stack the training and testing partitions back together and run the
# ColumnTransformer on the result, only to verify how the transformations
# (imputing, scaling, encoding) change the dimensions of the input.
# `Xtraintest` / `ytraintest` are the inputs given to the cross-validated
# model evaluations further below.

Xtraintest = pd.concat((Xtrain, Xtest), axis=0)
ytraintest = pd.concat((ytrain, ytest), axis=0)

Xtmp = Xtraintest.copy()
tmp = columnasTransformer.fit_transform(Xtmp)
print("Dimensión de las variables de entrada ANTES de las transformaciones:", Xtmp.shape)
print("Dimensión de las variables de entrada DESPUÉS de las transformaciones:", tmp.shape)

In [None]:
# Model Training, Evaluation, and Validation Function
# This function (`mi_fun`) automates the entire training and evaluation process
# for a given machine learning model within a unified pipeline.

def mi_fun(modelo, nombre, Xtraintest, ytraintest, metodo_uo=None):
    """Cross-validate a model inside the shared preprocessing pipeline.

    The estimator is wrapped in an imbalanced-learn pipeline made of the
    module-level ``columnasTransformer``, the optional resampler and the model
    itself. It is scored with repeated stratified 5-fold cross-validation
    (3 repeats) on Accuracy, Precision, Recall, F1, AUC and G-mean. The summary
    table is printed, and a confusion matrix built from out-of-fold predictions
    (single stratified 5-fold split) is plotted.

    Parameters
    ----------
    modelo : estimator
        Classifier placed as the last step of the pipeline.
    nombre : str
        Label used in the results table and in the plot title.
    Xtraintest : DataFrame
        Input features passed to the cross-validation routines.
    ytraintest : array-like
        Target values; flattened with ``np.ravel`` before use.
    metodo_uo : sampler or None, default None
        Under/over-sampling step inserted between preprocessing and model.

    Returns
    -------
    pandas.DataFrame
        One row per metric with columns MODEL, TEST NAME, TRAIN and TEST,
        where TRAIN/TEST hold "mean (±std)" strings.
    """
    y_flat = np.ravel(ytraintest)

    estimator = ImbPipeline(steps=[
        ('preprocesador', columnasTransformer),
        ('sub_sobre_muestreo', metodo_uo),
        ('model', modelo),
    ])

    scoring = {
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1': 'f1',
        'AUC': 'roc_auc',
        'Gmean': make_scorer(geometric_mean_score),
    }
    repeated_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=5)

    cv_scores = cross_validate(
        estimator,
        Xtraintest,
        y_flat,
        scoring=scoring,
        cv=repeated_cv,
        return_train_score=True,
    )

    def _summarise(split, metric):
        # "mean (±std)" over all folds, ignoring NaN scores.
        values = cv_scores[f'{split}_{metric}']
        return f"{np.nanmean(values):.4f} (±{np.nanstd(values):.3f})"

    # --- Comparison table: one row per metric ---
    df_results = pd.DataFrame([
        {
            "MODEL": nombre,
            "TEST NAME": metric,
            "TRAIN": _summarise('train', metric),
            "TEST": _summarise('test', metric),
        }
        for metric in scoring
    ])

    # Show the results of this model
    print(f"\n>> Resultados de {nombre}")
    print(df_results.to_string(index=False))

    # --- Confusion matrix from out-of-fold predictions ---
    oof_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
    oof_predictions = cross_val_predict(
        estimator, Xtraintest, y_flat, cv=oof_cv, method='predict'
    )

    class_labels = np.unique(y_flat)
    matrix = confusion_matrix(y_flat, oof_predictions, labels=class_labels)

    fig, ax = plt.subplots(figsize=(3, 3))
    display = ConfusionMatrixDisplay(matrix, display_labels=class_labels)
    display.plot(ax=ax, cmap="cividis", values_format="d", colorbar=False)
    ax.set_title(f"Matriz de Confusión — {nombre}")
    plt.show()

    return df_results

#Machine Learning Models

# Logistic Regression Model

In [None]:
nombre = "Regresión_Logística"

# L2-penalised logistic regression fitted with the 'saga' solver.
logreg_params = {
    "penalty": 'l2',
    "C": 0.1,
    "max_iter": 5000,
    "solver": 'saga',
    "random_state": 1,
}
modelo = LogisticRegression(**logreg_params)

# SMOTE over-sampling is applied as the resampling step of the pipeline.
metodo_uo = SMOTE(random_state=1)
df_logreg = mi_fun(modelo, nombre, Xtraintest, ytraintest, metodo_uo=metodo_uo)

#K-Nearest Neighbors Model



In [None]:
nombre = "KNN"

# 25 uniformly weighted neighbours with Manhattan distance.
knn_params = {
    "n_neighbors": 25,
    "weights": 'uniform',
    "metric": 'manhattan',
    "p": 1,
    "algorithm": 'auto',
}
modelo = KNeighborsClassifier(**knn_params)

# NearMiss (version 2) under-sampling is applied as the resampling step.
metodo_uo = NearMiss(version=2)
df_KNN = mi_fun(modelo, nombre, Xtraintest, ytraintest, metodo_uo=metodo_uo)

#Decision Tree Model


In [None]:
nombre = "DTree"

# Gini tree limited by depth, split/leaf sizes and cost-complexity pruning,
# with class weights balanced.
dtree_params = {
    "criterion": "gini",
    "max_depth": 20,
    "min_samples_split": 20,
    "min_samples_leaf": 5,
    "max_features": "sqrt",
    "ccp_alpha": 0.01,
    "class_weight": "balanced",
    "random_state": 1,
    "splitter": "best",
}
modelo = DecisionTreeClassifier(**dtree_params)

# SMOTETomek (combined over/under-sampling) is the resampling step.
metodo_uo = SMOTETomek(random_state=1)
df_DTree = mi_fun(modelo, nombre, Xtraintest, ytraintest, metodo_uo=metodo_uo)

#Random Forest Model

In [None]:
nombre = "RF"

# 180 shallow trees with balanced class weights; bootstrapping is disabled.
rf_params = {
    "n_estimators": 180,
    "max_depth": 6,
    "min_samples_split": 20,
    "min_samples_leaf": 25,
    "max_features": "log2",
    "class_weight": "balanced",
    "bootstrap": False,
    "random_state": 1,
}
modelo = RandomForestClassifier(**rf_params)

# SMOTEENN (combined over/under-sampling) is the resampling step.
metodo_uo = SMOTEENN(random_state=1)
df_RF = mi_fun(modelo, nombre, Xtraintest, ytraintest, metodo_uo=metodo_uo)

#Extreme Gradient Boosting (XGBoost) Model

In [None]:
nombre = "XGBoost"

# Gradient-boosted trees: 200 depth-2 trees, low learning rate, row/column
# subsampling and L1/L2 regularisation.
xgb_params = {
    "booster": 'gbtree',
    "n_estimators": 200,
    "max_depth": 2,
    "learning_rate": 0.01,
    "subsample": 0.4,
    "colsample_bytree": 0.5,
    "reg_alpha": 1.0,
    "reg_lambda": 4.0,
    "objective": 'binary:logistic',
    "tree_method": 'hist',
    "random_state": 1,
    "n_jobs": -1,
}
modelo = XGBClassifier(**xgb_params)

# NearMiss (version 3) under-sampling is applied as the resampling step.
metodo_uo = NearMiss(version=3)
df_XGBoost = mi_fun(modelo, nombre, Xtraintest, ytraintest, metodo_uo=metodo_uo)

#Multi-Layer Perceptron (MLP) Model

In [None]:
nombre = "MLP"

# Two hidden layers (8 and 4 units, ReLU), Adam optimiser, L2 penalty alpha=5
# and early stopping after 15 iterations without improvement.
mlp_params = {
    "hidden_layer_sizes": (8, 4),
    "activation": 'relu',
    "solver": 'adam',
    "max_iter": 1500,
    "alpha": 5,
    "learning_rate": 'adaptive',
    "learning_rate_init": 0.007,
    "tol": 1e-4,
    "early_stopping": True,
    "n_iter_no_change": 15,
    "random_state": 1,
}
modelo = MLPClassifier(**mlp_params)

# SMOTETomek (combined over/under-sampling) is the resampling step.
metodo_uo = SMOTETomek(random_state=1)
df_MLP = mi_fun(modelo, nombre, Xtraintest, ytraintest, metodo_uo=metodo_uo)

#Support Vector Classifier (SVC) Model

In [None]:
nombre = "SVC"

# RBF-kernel SVC with balanced class weights; probability estimates enabled.
svc_params = {
    "kernel": 'rbf',
    "C": 4,
    "gamma": 0.0025,
    "class_weight": 'balanced',
    "probability": True,
    "random_state": 1,
}
modelo = SVC(**svc_params)

# SMOTE over-sampling with sampling_strategy=0.7 is the resampling step.
metodo_uo = SMOTE(sampling_strategy=0.7, random_state=1)
df_SVC = mi_fun(modelo, nombre, Xtraintest, ytraintest, metodo_uo=metodo_uo)

# Models Comparison Summary


In [None]:
# Comparative summary: gather the per-model result tables (Logistic
# Regression, KNN, Decision Tree, Random Forest, XGBoost, MLP and SVC) into a
# single table with one row per metric and one column per model.

# Stack all model reports
model_reports = [df_logreg, df_KNN, df_DTree, df_RF, df_XGBoost, df_MLP, df_SVC]
df_final = pd.concat(model_reports, ignore_index=True)

# Pivot: rows = metrics, columns = models, values = TEST scores
tabla_comparativa = df_final.pivot(index="TEST NAME", columns="MODEL", values="TEST")

# Display options: show every column, no width-based wrapping, centered headers
display_options = (
    ("display.max_columns", None),
    ("display.width", None),
    ("display.colheader_justify", "center"),
)
for option_name, option_value in display_options:
    pd.set_option(option_name, option_value)

print("\n=== Comparativa por métricas (TEST) ===")
print(tabla_comparativa.to_string(line_width=2000))



In [None]:
# Heatmap of the TEST scores: models on the rows, metrics on the columns.


def _leading_float(cell):
    # "0.7063 (±0.024)" -> 0.7063; non-string cells are returned untouched.
    return float(cell.split()[0]) if isinstance(cell, str) else cell


# Work on a copy so the formatted comparison table stays intact
tabla_numeric = tabla_comparativa.copy()
for model_name in tabla_numeric.columns:
    tabla_numeric[model_name] = tabla_numeric[model_name].map(_leading_float)

# Transpose: models as rows, metrics as columns
tabla_numeric_T = tabla_numeric.T

plt.figure(figsize=(12,6))
heatmap_ax = sns.heatmap(tabla_numeric_T, annot=True, fmt=".3f", cmap="plasma", cbar=True)

heatmap_ax.set_title("Comparativa de Modelos por Métricas (TEST)", fontsize=14)
heatmap_ax.set_ylabel("Modelos")
heatmap_ax.set_xlabel("Métricas")
plt.xticks(rotation=45, ha="right")
plt.show()
