In [1]:
import pandas as pd
import os

from datetime import datetime

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

import mlflow
import mlflow.sklearn


In [2]:
class DataHandler:
    """Load semicolon-separated CSV datasets from a data directory.

    Parameters
    ----------
    data_path : str or os.PathLike, optional
        Directory that contains the dataset files. When omitted,
        ``DEFAULT_DATA_PATH`` is used instead.
    """

    # Fallback directory used when no ``data_path`` is supplied.
    DEFAULT_DATA_PATH = r'C:\Users\hzapi\OneDrive\Documents\Hans Files\07_MNA\13_TC5044_Operaciones de aprendizaje automático\98_Git\MLOps_Project\data\raw'

    def __init__(self, data_path = None):
        self.data_path = data_path

    def load_dataset(self, filename):
        """Read ``filename`` from the configured directory into a DataFrame.

        The file is parsed with ``;`` as the field separator. The full path is
        built with ``os.path.join`` and the process working directory is left
        untouched.

        Parameters
        ----------
        filename : str
            Name of the CSV file inside the data directory.

        Returns
        -------
        pandas.DataFrame
            The parsed dataset.
        """
        if self.data_path is None:
            file_path = os.path.join(self.DEFAULT_DATA_PATH, filename)
            df_read = pd.read_csv(file_path, sep = ';')
            print('Read OS success')
        else:
            # Read from the configured directory, not from the current working directory.
            file_path = os.path.join(self.data_path, filename)
            df_read = pd.read_csv(file_path, sep = ';')
            print('Read path success')
        return df_read

In [3]:
class MLFlowLogger:
    """Record a fitted model and its evaluation metrics as an MLflow run."""

    def __init__(self, experiment_name, tracking_uri = 'http://localhost:5000'):
        """Configure the MLflow tracking server and the active experiment.

        Parameters
        ----------
        experiment_name : str
            Name passed to ``mlflow.set_experiment``.
        tracking_uri : str, optional
            URI passed to ``mlflow.set_tracking_uri``.
        """
        mlflow.set_tracking_uri(tracking_uri)
        mlflow.set_experiment(experiment_name)

    def log_experiment(self, best_model, accuracy, precision, recall, X_sample, run_name = None):
        """Log one model plus its accuracy, precision and recall in a single run.

        Parameters
        ----------
        best_model
            Fitted estimator stored under the artifact path ``'best_model'``.
        accuracy, precision, recall : float
            Metric values logged under the keys of the same names.
        X_sample
            Example input attached to the logged model as ``input_example``.
        run_name : str, optional
            Display name of the MLflow run.
        """
        # The ``with`` block ends the run on exit, so no explicit
        # ``mlflow.end_run()`` call is needed inside it.
        with mlflow.start_run(run_name = run_name):
            mlflow.sklearn.log_model(best_model, 'best_model', input_example = X_sample)

            mlflow.log_metric('accuracy', accuracy)
            mlflow.log_metric('precision', precision)
            mlflow.log_metric('recall', recall)

In [4]:
class FeatureEngineer:
    """Filter rows, encode the target column, and build scaled train/test/validation sets."""

    def __init__(self):
        # Fitted on the target column inside ``process``.
        self.label_encoder = LabelEncoder()

    @staticmethod
    def _single_column(target):
        """Return the single column label described by ``target``.

        A one-element list or ``pandas.Index`` is unwrapped to its only label;
        any other value is returned unchanged.

        Raises
        ------
        ValueError
            If a list/Index does not contain exactly one label.
        """
        if isinstance(target, (list, pd.Index)):
            if len(target) != 1:
                raise ValueError(f'target must name exactly one column, got {len(target)}')
            return target[0]
        return target

    def process(self, df_interest, target):
        """Drop high-age rows and replace the target with an integer-encoded column.

        Rows whose ``'Age at enrollment'`` exceeds the column's 90th percentile
        are removed. Missing target values become ``'Unknown'`` before encoding.
        The encoded labels are stored in ``'encoded_target'`` and the original
        target column is dropped.

        Parameters
        ----------
        df_interest : pandas.DataFrame
            Input data; it is not modified.
        target : label or one-element list of labels
            Name of the target column.

        Returns
        -------
        tuple
            ``(processed_frame, 'encoded_target')``.
        """
        target_col = self._single_column(target)
        age = df_interest['Age at enrollment']
        df_filter = df_interest[age <= age.quantile(0.90)].copy()
        df_filter[target_col] = df_filter[target_col].fillna('Unknown')
        # Pass a 1-D Series: a one-column DataFrame makes the encoder emit a
        # DataConversionWarning about a column-vector ``y``.
        df_filter['encoded_target'] = self.label_encoder.fit_transform(df_filter[target_col])
        df_filter = df_filter.drop(target_col, axis = 1)
        return df_filter, 'encoded_target'

    def model_data_prep (self, df_interest, features, target, train_split = 0.3):
        """Split the data into train/test/validation parts and standardise the features.

        Parameters
        ----------
        df_interest : pandas.DataFrame
            Frame holding both the feature and target columns.
        features : list-like of labels
            Feature column names.
        target : label or list of labels
            Target column name(s).
        train_split : float, optional
            Despite its name, this is the fraction of rows held out from
            training (it is passed as ``test_size``). The held-out rows are
            then split evenly into validation and test sets.

        Returns
        -------
        tuple
            ``(df_interest, scaler, X_train, X_test, X_val, y_train, y_test, y_val)``
            where the ``X_*`` arrays are scaled and the ``y_*`` arrays are flattened.
        """
        features_df = df_interest[features].copy()
        target_df = df_interest[target].copy()

        X_train, X_holdout, y_train, y_holdout = train_test_split(features_df, target_df, test_size = train_split, random_state = 42)
        X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size = 0.5, random_state = 42)

        # Fit the scaler on the training rows only, then reuse it for the other splits.
        scaler = StandardScaler()

        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        X_val = scaler.transform(X_val)

        y_train = y_train.values.ravel()
        y_test = y_test.values.ravel()
        y_val = y_val.values.ravel()

        return df_interest, scaler, X_train, X_test, X_val, y_train, y_test, y_val

In [5]:
class ModelTrainer:
    """Grid-search a classifier, score its predictions, and optionally log it to MLflow.

    Parameters
    ----------
    model
        Estimator handed to ``GridSearchCV``.
    param_grid : dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    experiment_name : str, optional
        When given, an ``MLFlowLogger`` for that experiment is created;
        otherwise ``log`` does nothing.
    """

    def __init__(self, model, param_grid, experiment_name = None):
        self.model = model
        self.param_grid = param_grid
        self.logger = None
        if experiment_name:
            self.logger = MLFlowLogger(experiment_name)

    def train(self, X_train, X_test, y_train, y_test):
        """Run a 3-fold grid search on the training data and predict on ``X_test``.

        ``y_test`` is accepted but not used here.

        Returns
        -------
        tuple
            ``(best_estimator, y_pred)``.
        """
        grid_search = GridSearchCV(self.model, self.param_grid, cv = 3, n_jobs = -1, verbose = 2)
        grid_search.fit(X_train, y_train.ravel())
        best_mod = grid_search.best_estimator_
        y_pred = best_mod.predict(X_test)
        return  best_mod, y_pred

    def evaluate(self, model, y_test, y_pred):
        """Compute and print accuracy plus weighted precision and recall.

        ``model`` is accepted but not used here.

        Returns
        -------
        tuple
            ``(accuracy, precision, recall)``.
        """
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average = 'weighted')
        recall = recall_score(y_test, y_pred, average = 'weighted')
        print(f'Dropout Model Accuracy: {accuracy}, Precision: {precision}, Recall: {recall})')
        return accuracy, precision, recall

    def log(self, X_train, model, accuracy, precision, recall, model_name = None):
        """Send the model and its metrics to the configured logger, if any.

        The run name is ``<model_name>_<YYYY-MM-DD HH:MM:SS>_HZR``. When
        ``model_name`` is ``None`` the class name of ``model`` is used instead.
        The first row of ``X_train`` is passed along as the example input.
        """
        if not self.logger:
            # No experiment configured: nothing to log.
            return
        label = model_name if model_name is not None else type(model).__name__
        datetime_string = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        model_hash = f'{label}_{datetime_string}_HZR'
        X_sample = X_train[0:1]
        self.logger.log_experiment(model, accuracy, precision, recall, X_sample, model_hash)


In [6]:
# Load the raw student dataset.
data_handler = DataHandler()
df_student = data_handler.load_dataset('data.csv')

# Use a plain column label rather than a one-element list so the target stays
# 1-D when it is label-encoded (a list selects a one-column DataFrame).
model_target = 'Target'
# ``drop`` already returns a new frame, so no explicit copy is needed.
model_features = df_student.drop(model_target, axis = 1).columns

# Encode the target, then split and scale the features.
feature_engineer = FeatureEngineer()
df_processed, target = feature_engineer.process(df_student, model_target)
df_interest, scaler, X_train, X_test, X_val, y_train, y_test, y_val = feature_engineer.model_data_prep(df_processed, model_features, target)



Read OS success


  y = column_or_1d(y, warn=True)


In [7]:
# Candidate classifiers keyed by display name. The same keys are used to look
# up each model's hyper-parameter grid in ``param_grids``.
models = {
    'Random Forest': RandomForestClassifier(random_state = 42),
    'Gradient Boosting': GradientBoostingClassifier(random_state = 42),
    'XGBoost': XGBClassifier(random_state = 42),
    'Light GBM': LGBMClassifier(random_state = 42, verbosity = -1),
    'SVC': SVC(),
    'KNN': KNeighborsClassifier(),
}

In [8]:
# Hyper-parameter search space for each candidate, keyed by the same display
# names as ``models``.
param_grids = {
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
    },
    'XGBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
    },
    'Light GBM': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'gamma': [0.01, 0.1, 1],
        'kernel': ['linear', 'rbf'],
    },
    'KNN': {
        'n_neighbors': [3, 5, 10],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree'],
    },
}

In [9]:
model_trainers = {}

experiment_name = 'Student Dropout Prediction'

# Build one trainer per candidate model. Each trainer is an instance of a
# dynamically created ``ModelTrainer`` subclass named after the model
# (for example ``RandomForestTrainer``).
for model_name, model in models.items():
    param_grid = param_grids[model_name]
    class_name = model_name.replace(' ', '') + 'Trainer'
    model_trainer_class = type(class_name, (ModelTrainer,), {})
    model_trainers[model_name] = model_trainer_class(model, param_grid, experiment_name)

# Tune, score and log every candidate in turn.
for model_name, trainer in model_trainers.items():
    print('Model Training: ', model_name)
    best_mod, y_pred = trainer.train(X_train, X_test, y_train, y_test)
    accuracy, precision, recall = trainer.evaluate(best_mod, y_test, y_pred)
    trainer.log(X_train, best_mod, accuracy, precision, recall, model_name)

Model Training:  Random Forest
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Dropout Model Accuracy: 0.7575250836120402, Precision: 0.7371038672186181, Recall: 0.7575250836120402)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/10/12 14:10:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest_2024-10-12 14:09:55_HZR at: http://localhost:5000/#/experiments/3/runs/e495483f52ad48e9ac916068f3e03694.
2024/10/12 14:10:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/3.


Model Training:  Gradient Boosting
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Dropout Model Accuracy: 0.7725752508361204, Precision: 0.7578945764669582, Recall: 0.7725752508361204)




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/10/12 14:10:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run Gradient Boosting_2024-10-12 14:10:09_HZR at: http://localhost:5000/#/experiments/3/runs/705d4efebab64ea6b86518f22bd19562.
2024/10/12 14:10:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/3.


Model Training:  XGBoost
Fitting 3 folds for each of 27 candidates, totalling 81 fits




Dropout Model Accuracy: 0.7692307692307693, Precision: 0.7507798298652871, Recall: 0.7692307692307693)




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/10/12 14:10:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBoost_2024-10-12 14:10:20_HZR at: http://localhost:5000/#/experiments/3/runs/1a661fde59b64916abecad424b59f2d6.
2024/10/12 14:10:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/3.


Model Training:  Light GBM
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Dropout Model Accuracy: 0.7692307692307693, Precision: 0.7536317274274268, Recall: 0.7692307692307693)




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/10/12 14:10:39 INFO mlflow.tracking._tracking_service.client: 🏃 View run Light GBM_2024-10-12 14:10:36_HZR at: http://localhost:5000/#/experiments/3/runs/9771ae3ce7324eb48e833d86b617e8f9.
2024/10/12 14:10:39 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/3.


Model Training:  SVC
Fitting 3 folds for each of 18 candidates, totalling 54 fits
Dropout Model Accuracy: 0.774247491638796, Precision: 0.7599055274643832, Recall: 0.774247491638796)




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/10/12 14:10:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVC_2024-10-12 14:10:40_HZR at: http://localhost:5000/#/experiments/3/runs/d773eadc9e3e4bde98ef700090fa3fda.
2024/10/12 14:10:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/3.


Model Training:  KNN
Fitting 3 folds for each of 18 candidates, totalling 54 fits
Dropout Model Accuracy: 0.7224080267558528, Precision: 0.7025216501553763, Recall: 0.7224080267558528)




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/10/12 14:10:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run KNN_2024-10-12 14:10:42_HZR at: http://localhost:5000/#/experiments/3/runs/42d5b1c91d634288b89723ebb61f04c0.
2024/10/12 14:10:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/3.
