In [1]:
import pandas as pd
import os

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score


In [2]:
def load_dataset (filename, path = None):
    if path is None:
        os.chdir(r'C:\Users\hzapi\OneDrive\Documents\Hans Files\07_MNA\13_TC5044_Operaciones de aprendizaje automático\98_Git\MLOps_Project\data\raw')
        df_read = pd.read_csv(filename, sep=';')
        print ('Read OS success')
    else:
        path_dir = path + '\\' + filename
        df_read = pd.read_csv(filename, sep=';')
        print ('Read path success')
    return  df_read

In [3]:
def save_model (name,prefix, path):
        header = get_df_name(df_interest).lower()
        name = 'model_demand_' + header + '_rf.pkl'
        save_demand_models(best_model, scaler, path, name)
        print (f'Model Saved as: {name}')

In [4]:
def feature_engineering (df_interest, target):
    df_filter = df_interest[df_interest['Age at enrollment'] <= df_interest['Age at enrollment'].quantile(0.90)].copy()
    df_filter[target] = df_filter[target].fillna('Unknown')
    
    label_encoder = LabelEncoder()
    df_filter['encoded_target'] = label_encoder.fit_transform(df_filter[target])
    
    #encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
    #one_hot_encoded = encoder.fit_transform(df_filter[[target[0]]])
    #new_target = encoder.get_feature_names_out([target[0]])
    #one_hot_df = pd.DataFrame(one_hot_encoded, columns = new_target, index = df_filter.index)

    #df_encoded = pd.concat([df_filter, one_hot_df], axis = 1)
    #df_encoded = df_encoded.drop(target, axis = 1)

    #if df_encoded.isnull().sum().sum() > 0:
    #    print("Warning: NaN values found after encoding.")
    #    print(df_encoded.isnull().sum())
    #    return

    df_filter = df_filter.drop(target, axis = 1)

    return df_filter, df_filter['encoded_target'].name

In [5]:
def model_data_prep(df_interest, features, target, train_split = 0.3):
    alt_df, new_target = feature_engineering(df_interest.copy(), target)

    features_df = alt_df[features].copy()
    target_df = alt_df[new_target].copy()
    
    X_train, X_test, y_train, y_test = train_test_split(features_df, target_df, test_size = train_split, random_state = 42)
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 42)

    scaler = StandardScaler()

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    X_val = scaler.transform(X_val)

    y_train = y_train.values.ravel()
    y_test = y_test.values.ravel()
    y_val = y_val.values.ravel()

    return alt_df, scaler, X_train, X_test, X_val, y_train, y_test, y_val

In [6]:
def model_grid_search(model, param_grid, X_train, X_test, y_train, y_test):
    grid_search = GridSearchCV(estimator = model, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
    grid_search.fit(X_train, y_train.ravel())
    best_mod = grid_search.best_estimator_
    y_pred = best_mod.predict(X_test)
    return  best_mod, y_pred

In [7]:
def performance_metrics(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average = 'weighted')
    recall = recall_score(y_test, y_pred, average = 'weighted')
    print(f'Dropout Model Accuracy: {accuracy}, Precision: {precision}, Recall: {recall})')
    

In [8]:
def setup_model (df_interest, model_name, model_type, features, target, param_grid, save_model = False, path = None):
    print(f'Model Training: ', model_name)
    alt_df, scaler, X_train, X_test, X_val, y_train, y_test, y_val = model_data_prep(df_interest, features, target)
    best_model, y_pred = model_grid_search(model_type, param_grid, X_train, X_test, y_train, y_test)
    performance_metrics(y_test, y_pred)

    if save_model:
        header = get_df_name(df_interest).lower()
        name = 'model_demand_' + header + '_knn.pkl'
        save_demand_models(best_model, scaler, path, name)
        print (f'Model Saved as: {name}')

    return best_model, X_train, scaler, alt_df

In [9]:
def predict_dropout(input_data, model_name, model_type, features, target,  param_grid, save_model = False, path = None):
    model, X_train, scaler, features_refined = setup_model(input_data, model_name, model_type, features, target, param_grid, save_model, path) 

In [10]:
models = {
        'Random Forest': RandomForestClassifier(random_state = 42),
        'Gradient Boosting': GradientBoostingClassifier(random_state = 42),
        'XGBoost': XGBClassifier(random_state = 42),
        'Light GBM': LGBMClassifier(random_state = 42, verbosity = -1),
        'SVC': SVC(),
        'KNN': KNeighborsClassifier()
}

param_grid = {
        'Random Forest': 
        {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10]
        },
        
        'Gradient Boosting': 
        {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 4, 5] 
        },

        'XGBoost': 
        {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 4, 5]
        },
        
        'Light GBM': 
        {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 4, 5]
        },
        'SVC': {
                'C': [0.1, 1, 10],
                'gamma': [0.01, 0.1, 1],
                'kernel': ['linear', 'rbf']
        },        
        'KNN': {
                'n_neighbors': [3, 5, 10],
                'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree']
        }
}

In [11]:
df_student = load_dataset('data.csv')
model_target = ['Target']
model_features = df_student.copy().drop(model_target, axis = 1).columns

Read OS success


In [12]:
for index, (model_name, model) in enumerate(models.items()):
    model_name, model_param = list(param_grid.items()) [index]
    predict_dropout(df_student, model_name, model, model_features, model_target, model_param)

Model Training:  Random Forest
Fitting 3 folds for each of 36 candidates, totalling 108 fits


  y = column_or_1d(y, warn=True)


Dropout Model Accuracy: 0.7575250836120402, Precision: 0.7371038672186181, Recall: 0.7575250836120402)
Model Training:  Gradient Boosting
Fitting 3 folds for each of 27 candidates, totalling 81 fits


  y = column_or_1d(y, warn=True)


Dropout Model Accuracy: 0.7725752508361204, Precision: 0.7578945764669582, Recall: 0.7725752508361204)
Model Training:  XGBoost
Fitting 3 folds for each of 27 candidates, totalling 81 fits


  y = column_or_1d(y, warn=True)


Dropout Model Accuracy: 0.7692307692307693, Precision: 0.7507798298652871, Recall: 0.7692307692307693)
Model Training:  Light GBM
Fitting 3 folds for each of 27 candidates, totalling 81 fits


  y = column_or_1d(y, warn=True)


Dropout Model Accuracy: 0.7692307692307693, Precision: 0.7536317274274268, Recall: 0.7692307692307693)
Model Training:  SVC
Fitting 3 folds for each of 18 candidates, totalling 54 fits


  y = column_or_1d(y, warn=True)


Dropout Model Accuracy: 0.774247491638796, Precision: 0.7599055274643832, Recall: 0.774247491638796)
Model Training:  KNN
Fitting 3 folds for each of 18 candidates, totalling 54 fits


  y = column_or_1d(y, warn=True)


Dropout Model Accuracy: 0.7224080267558528, Precision: 0.7025216501553763, Recall: 0.7224080267558528)


In [13]:
for index, (model_name, model) in enumerate(models.items()):
    model_name, model_param = list(param_grid.items()) [index]
    print(model_name)

Random Forest
Gradient Boosting
XGBoost
Light GBM
SVC
KNN
