# *Data Manager Class*

In [1]:
import yaml
import pandas as pd
from sklearn.model_selection import train_test_split

class Data_Manager():
    """Load the raw dataset named in a YAML config and split it into train/test CSVs.

    Configuration keys read by this class:
      - ``data_load.dataset``: path of the ';'-separated source CSV.
      - ``data_load.target``:  name of the target column.
      - ``data_load.test``:    value forwarded to ``train_test_split`` as ``test_size``.
      - ``datasets.path_xtrain`` / ``path_xtest`` / ``path_ytrain`` / ``path_ytest``:
        destination CSV paths for the four splits.
    """

    def __init__(self, path):
        """Read the YAML configuration at ``path`` and load the dataset.

        Args:
            path: Path to the YAML configuration file.
        """
        with open(path) as config:
            self.config = yaml.safe_load(config)
        self.data = self.load_data()

    def load_data(self):
        """Read the configured CSV, store it on ``self.data`` and return it.

        Returning the frame matters: ``__init__`` assigns the result of this
        method to ``self.data``, so returning nothing would leave ``self.data``
        set to ``None`` right after construction.

        Returns:
            pandas.DataFrame: the loaded dataset.
        """
        path = self.config['data_load']['dataset']
        data = pd.read_csv(path, sep=';')
        self.data = data
        return data

    def split_data(self, random_state=None):
        """Create a stratified train/test split and write the four parts to CSV.

        Args:
            random_state: Optional seed forwarded to ``train_test_split``.
                The default (``None``) keeps the split unseeded, as before;
                pass an int to make the split reproducible between runs.
        """
        target = self.config['data_load']['target']
        test = self.config['data_load']['test']
        X = self.data.drop([target], axis=1)
        y = self.data[target]
        Xtrain, Xtest, ytrain, ytest = train_test_split(
            X, y, test_size=test, stratify=y, random_state=random_state
        )
        print("La dimension del conjunto de entrenamiento es: ",Xtrain.shape)
        print("La dimension del conjunto de prueba es: ",Xtest.shape)

        # Persist every split with the same separator used for reading.
        outputs = self.config['datasets']
        Xtrain.to_csv(outputs['path_xtrain'], sep=';', index=False)
        Xtest.to_csv(outputs['path_xtest'], sep=';', index=False)
        ytrain.to_csv(outputs['path_ytrain'], sep=';', index=False)
        ytest.to_csv(outputs['path_ytest'], sep=';', index=False)
        


# *Feature Processor Class*

In [2]:
import yaml
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from warnings import filterwarnings
filterwarnings('ignore')

class Feature_Manager():
    """Build the model features: PCA over transformed numeric columns plus one-hot categoricals.

    Configuration keys read by this class:
      - ``features.log``:          columns that go through ``np.log1p``.
      - ``features.scaler``:       columns that go through ``StandardScaler``.
      - ``datasets.path_xtrain``:  training features CSV (';'-separated).
      - ``data_load.dataset``:     full dataset CSV (';'-separated).

    Every column of the training CSV that is not listed under ``features.log``
    or ``features.scaler`` is treated as categorical and one-hot encoded.

    Fitting happens exactly once: the categorical encoder is fitted on the
    full dataset at construction time, and the numeric transformer and the PCA
    are fitted on the training CSV by ``get_PCA_components``.
    ``process_features`` only *transforms*, so train and test frames are
    projected into the same feature space.
    """

    def __init__(self, path):
        """Read the YAML configuration at ``path`` and build the transformers.

        Args:
            path: Path to the YAML configuration file.
        """
        with open(path) as config:
            self.config = yaml.safe_load(config)
        self.pca = PCA()
        self.pca_components = None
        self.log_pipe = Pipeline([('Log', FunctionTransformer(np.log1p, feature_names_out='one-to-one'))] )
        self.log_pipe_nombres = self.config['features']['log']
        self.scaler_pipe = Pipeline([('scaler', StandardScaler())] )
        self.scaler_pipe_nombres = self.config['features']['scaler']
        self.catOHE_pipeline = Pipeline( [('OneHot', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False))] )
        # Keep the CSV column order: a plain set difference has no stable
        # order, which made the one-hot output column order vary between runs.
        numeric_names = set(self.log_pipe_nombres) | set(self.scaler_pipe_nombres)
        self.catOHE_pipeline_nombres = [
            column for column in self.get_allcolumns() if column not in numeric_names
        ]
        self.ct_numericas = ColumnTransformer( transformers=[('log_transformer', self.log_pipe, self.log_pipe_nombres),('standard_Scaler', self.scaler_pipe, self.scaler_pipe_nombres)])
        self.ct_categoricas = ColumnTransformer( transformers=[('cat', self.catOHE_pipeline, self.catOHE_pipeline_nombres)])
        self.all_categories = self.get_total_categories()

    def get_allcolumns(self):
        """Return the column names of the training features CSV.

        Returns:
            numpy.ndarray: column labels of ``datasets.path_xtrain``.
        """
        path = self.config['datasets']['path_xtrain']
        data = pd.read_csv(path, sep=';')
        return data.columns.values

    def get_total_categories(self):
        """Fit the categorical encoder on the full dataset and list its output columns.

        Returns:
            numpy.ndarray: one-hot feature names produced by the fitted encoder.
        """
        path = self.config['data_load']['dataset']
        data = pd.read_csv(path, sep=';')
        # Only the fitted state is needed here; the transformed matrix is not used.
        self.ct_categoricas.fit(data)
        total_categories = self.ct_categoricas.named_transformers_['cat'].get_feature_names_out()
        return total_categories

    def sync_columns(self, reference, data):
        """Return ``data`` restricted to, and ordered like, ``reference``'s columns.

        Args:
            reference: DataFrame whose column order is the target layout.
            data: DataFrame to re-order; must contain every reference column.

        Returns:
            pandas.DataFrame: ``data[reference.columns]``.
        """
        data = data[reference.columns]
        return data

    def complete_features(self, data):
        """Return a copy of ``data`` with a zero column for every missing category.

        Missing columns are appended in the order they appear in
        ``self.all_categories``, so the result layout is deterministic. The
        input frame is left untouched.

        Args:
            data: DataFrame of one-hot encoded columns.

        Returns:
            pandas.DataFrame: ``data`` plus zero-filled columns for the
            categories in ``self.all_categories`` it did not contain.
        """
        present = set(data.columns)
        missing_categories = [
            category for category in self.all_categories if category not in present
        ]
        completed = data.copy()
        for category in missing_categories:
            completed[category] = 0
        return completed

    def get_PCA_components(self, variance_threshold=0.9):
        """Fit the numeric transformer and the PCA on the training CSV.

        Stores in ``self.pca_components`` the smallest number of leading
        components whose cumulative explained-variance ratio is strictly
        greater than ``variance_threshold`` (``None`` if no prefix exceeds it).

        Args:
            variance_threshold: Cumulative explained-variance ratio to exceed.
                Defaults to 0.9, the value that was previously hard-coded.
        """
        path = self.config['datasets']['path_xtrain']
        data = pd.read_csv(path, sep=';')
        processed = self.ct_numericas.fit_transform(data)
        self.pca.fit(processed)
        cumulative = np.cumsum(self.pca.explained_variance_ratio_)
        above = np.flatnonzero(cumulative > variance_threshold)
        components = int(above[0]) + 1 if above.size else None
        if components is not None:
            print(f'El número de componentes que explican el {variance_threshold:.0%} de la varianza son: {components}')
        self.pca_components = components

    def process_features(self, data):
        """Transform ``data`` into the final feature frame.

        The frame is built from the first ``self.pca_components`` principal
        components of the transformed numeric columns, followed by the one-hot
        encoded categorical columns. Nothing is re-fitted here: the
        transformers fitted by ``get_PCA_components`` and at construction time
        are reused, so every frame passed in (train or test) is scaled,
        projected and encoded identically.

        Args:
            data: DataFrame containing the configured numeric and categorical columns.

        Returns:
            pandas.DataFrame: processed features with string column labels.

        Raises:
            Exception: if ``get_PCA_components`` has not been run yet.
        """
        if self.pca_components is None:
            raise Exception("PCA components has not been calculated, please first run the calculation to process the features.")

        # Numeric branch: reuse the fitted scaler/log transformer and PCA axes.
        data_nums_processed = self.ct_numericas.transform(data)
        x_projected = pd.DataFrame(self.pca.transform(data_nums_processed))
        componentes = x_projected.iloc[:, 0:self.pca_components]

        # Categorical branch: reuse the encoder fitted on the full dataset so
        # the dropped ('first') category is the same for every frame.
        data_cat_processed = self.ct_categoricas.transform(data)
        onehot_columns = self.ct_categoricas.named_transformers_['cat'].get_feature_names_out()
        data_cat_processed_df = pd.DataFrame(data_cat_processed, columns=onehot_columns)
        data_cat_processed_df = self.complete_features(data_cat_processed_df)

        # Both frames were just created with a default RangeIndex, so they
        # align row by row without any index reset.
        data_final = pd.concat([componentes, data_cat_processed_df], axis=1)
        data_final.columns = data_final.columns.astype(str)
        return data_final

# *Models Class*

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
import joblib 

class Models():
    """Tune, cross-validate, train and evaluate a fixed set of classifiers.

    Configuration keys read by this class:
      - ``tuning.parameters``:  mapping from model short name (``'LR'``,
        ``'KNN'``, ``'DTree'``, ``'RF'``, ``'MLP'``, ``'SVC'``) to the
        parameter grid passed to ``GridSearchCV``.
      - ``evaluation.metrics``: value passed as ``scoring`` to ``cross_validate``.
      - ``models.path_models``: file the fitted model list is dumped to.

    Targets may be given as a single-column DataFrame, a Series or an array;
    every method flattens them to one dimension before calling scikit-learn.
    """

    def __init__(self, path):
        """Read the YAML configuration at ``path`` and instantiate the models.

        Args:
            path: Path to the YAML configuration file.
        """
        with open(path) as config:
            self.config = yaml.safe_load(config)
        self.models, self.names = self.generate_models()
        self.params = self.config['tuning']['parameters']

    def generate_models(self):
        """Create the candidate estimators and their short names.

        Returns:
            tuple[list, list]: ``(models, names)``, index-aligned.
        """
        catalogue = [
            ('LR', LogisticRegression(max_iter=1000)),
            ('KNN', KNeighborsClassifier()),
            ('DTree', DecisionTreeClassifier()),
            ('RF', RandomForestClassifier(n_jobs=-1)),
            ('MLP', MLPClassifier( hidden_layer_sizes=(20,20),max_iter=10000)),
            ('SVC', SVC()),
        ]
        names = [name for name, _ in catalogue]
        models = [model for _, model in catalogue]
        return models, names

    def finetune_parameters(self, data, y):
        """Grid-search every model and apply its best parameters in place.

        Args:
            data: Feature matrix used for the search.
            y: Target values used for the search.
        """
        best_parameters = self.get_hyperparameters(data, y)
        for model, best in zip(self.models, best_parameters):
            model.set_params(**best)

    def get_hyperparameters(self, data, y):
        """Run a grid search per model and collect the best parameter sets.

        Args:
            data: Feature matrix used for the search.
            y: Target values used for the search.

        Returns:
            list[dict]: best parameters, index-aligned with ``self.models``.
        """
        return [
            self.model_grid_search(model, self.params[name], data, y)
            for model, name in zip(self.models, self.names)
        ]

    def model_grid_search(self, model, params, X_train, y_train):
        """Run a 3-fold ``GridSearchCV`` for one model.

        Args:
            model: Estimator to tune.
            params: Parameter grid for ``GridSearchCV``.
            X_train: Training features.
            y_train: Training target (DataFrame, Series or array).

        Returns:
            dict: ``best_params_`` of the fitted search.
        """
        grid_search = GridSearchCV(estimator = model, param_grid = params, n_jobs = -1, verbose = 2, cv=3)
        # np.ravel accepts pandas objects and plain arrays alike.
        grid_search.fit(X_train, np.ravel(y_train))
        return grid_search.best_params_

    def validate_models(self, data, y):
        """Cross-validate every model and print mean (std) of each score.

        Uses a repeated stratified 5-fold scheme (15 repeats, fixed seed) and
        the metrics configured under ``evaluation.metrics``.

        Args:
            data: Feature matrix.
            y: Target values.
        """
        # The CV scheme, metrics and flattened target are the same for every model.
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=15, random_state=5)
        metrics = self.config['evaluation']['metrics']
        target = np.ravel(y)
        timing_keys = ('fit_time', 'score_time')

        for model, name in zip(self.models, self.names):
            pipeline = Pipeline(steps=[('m', model)])
            scores = cross_validate(pipeline,
                                    data,
                                    target,
                                    scoring=metrics,
                                    cv=cv,
                                    return_train_score=True,
                                    error_score = 0,
                                    )

            print('>> %s' % name)
            for key, values in scores.items():
                # Report scores only; skip the timing entries by name rather
                # than by their position in the result dict.
                if key in timing_keys:
                    continue
                print('\t %s %.3f (%.3f)' % (key, np.mean(values), np.std(values)))

    def train_models(self, Xtrain, ytrain):
        """Fit every model on the training data and dump the list with joblib.

        Args:
            Xtrain: Training features.
            ytrain: Training target (DataFrame, Series or array).
        """
        # Flatten once so estimators receive a 1-D target, consistent with
        # the grid search and the cross-validation.
        target = np.ravel(ytrain)
        for model in self.models:
            model.fit(Xtrain, target)
        file = self.config['models']['path_models']
        joblib.dump(self.models, file)

    def evaluate(self, data, ytest):
        """Print a classification report on the test set for every model.

        Args:
            data: Test features.
            ytest: True test labels.
        """
        for model, name in zip(self.models, self.names):
            y_pred = model.predict(data)
            print(f"\n>>Reporte final Test de {name}:")
            print(classification_report(ytest, y_pred))
    

# *Running the code*

In [4]:
# Move one directory up before running the pipeline. The next cell opens
# 'params.yaml' by relative path, so this presumably switches to the project
# root where that file lives -- TODO confirm the notebook's folder layout.
%cd ..

f:\Maestria\MLOps\Proyecto\MLOps_Project


In [5]:
# Data loading: read the shared configuration, load the raw dataset and
# write the train/test splits to the CSV paths listed under 'datasets'.
path = 'params.yaml'
with open(path) as con:
    config = yaml.safe_load(con)
data_manager = Data_Manager(path)
data_manager.load_data()
data_manager.split_data()

# Feature engineering: Feature_Manager reads the training CSV written above,
# so it must be created after split_data(). get_PCA_components() has to run
# before process_features(), which raises otherwise.
features = Feature_Manager(path)
features.get_PCA_components()
file = config['datasets']['path_xtrain']
Xtrain = pd.read_csv(file, sep=';')
Xtrain = features.process_features(Xtrain)
Xtrain.to_csv(config['processed']['path_xtrain'], sep=';', index=False)
file = config['datasets']['path_xtest']
Xtest = pd.read_csv(file, sep=';')
Xtest = features.process_features(Xtest)
# Give the test frame the same column order as the processed training frame.
Xtest = features.sync_columns(Xtrain, Xtest)
Xtest.to_csv(config['processed']['path_xtest'], sep=';', index=False)

# Model training and validation: reload the processed features and the
# targets from disk, tune hyper-parameters, cross-validate, fit on the full
# training set and report on the held-out test set.
models = Models(path)
file = config['processed']['path_xtrain']
Xtrain = pd.read_csv(file, sep=';')
file = config['datasets']['path_ytrain']
ytrain = pd.read_csv(file, sep=';')
models.finetune_parameters(Xtrain, ytrain)
models.validate_models(Xtrain, ytrain)
models.train_models(Xtrain, ytrain)
file = config['processed']['path_xtest']
Xtest = pd.read_csv(file, sep=';')
file = config['datasets']['path_ytest']
ytest = pd.read_csv(file, sep=';')
models.evaluate(Xtest, ytest)

La dimension del conjunto de entrenamiento es:  (3539, 36)
La dimension del conjunto de prueba es:  (885, 36)
El número de componentes que explican el 90% de la varianza son: 7
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 210 candidates, totalling 630 fits
Fitting 3 folds for each of 480 candidates, totalling 1440 fits
Fitting 3 folds for each of 864 candidates, totalling 2592 fits
Fitting 3 folds for each of 72 candidates, totalling 216 fits
Fitting 3 folds for each of 40 candidates, totalling 120 fits
>> LR
	 test_accuracy 0.759 (0.013)
	 train_accuracy 0.795 (0.004)
	 test_precision_macro 0.709 (0.021)
	 train_precision_macro 0.762 (0.006)
	 test_recall_macro 0.672 (0.017)
	 train_recall_macro 0.715 (0.006)
	 test_f1_macro 0.681 (0.019)
	 train_f1_macro 0.729 (0.006)
>> KNN
	 test_accuracy 0.675 (0.011)
	 train_accuracy 1.000 (0.000)
	 test_precision_macro 0.625 (0.027)
	 train_precision_macro 1.000 (0.000)
	 test_recall_macro 0.538 (0.013