# Pipeline of Training and Evaluation Machine Learning Models

### Requirements

In [148]:
from dataclasses import dataclass
from google.colab import drive
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
import pickle

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [149]:
model_path = "/content/drive/MyDrive/Aulas/AprendizadoDeMáquina/Projeto/Dataset/Models"
data_path = "/content/drive/MyDrive/Aulas/AprendizadoDeMáquina/Projeto/Dataset/CrossValidation"
results_path = "/content/drive/MyDrive/Aulas/AprendizadoDeMáquina/Projeto/Dataset/Results"

## Configurations

In [150]:
#@markdown ### Algoritmo:
selected_model = "LR" #@param ["GB", "PW", "KNN", "LR"]
step = "hyper" #@param ["hyper", "train", "test"]
dataset_name = "mfeat_fac" #@param ["mfeat_fac", "mfeat_fou", "mfeat_zer"]
repeat_id_start = 1 # @param {type:"slider", min:1, max:30, step:1}
repeat_id_end = 2 # @param {type:"slider", min:1, max:30, step:1}
fold_id_start = 1 # @param {type:"slider", min:1, max:10, step:1}
fold_id_end = 2 # @param {type:"slider", min:1, max:10, step:1}

actual_model = None

### Models

In [151]:
from numpy.core.fromnumeric import repeat
@dataclass
class Model:
  model: any = None
  name: str = ""

  def params(self):
    pass

  def load_params(self):
    complete_path = f"{model_path}/{dataset_name}"
    with open(f'{complete_path}/best_params.pkl', 'rb') as f:
      loaded_best_params = pickle.load(f)
    self.model.set_params(**loaded_best_params)


  def run_hyper(self, x, y):
    grid_search = GridSearchCV(self.model, self.params(), cv=5)
    grid_search.fit(x, y)
    best_params = grid_search.best_params_
    print(best_params)
    self.model.set_params(**best_params)

    complete_path = f"{model_path}/{dataset_name}"
    path = Path(complete_path)
    path.mkdir(parents=True, exist_ok=True)

    with open(f'{complete_path}/best_params.pkl', 'wb') as f:
      pickle.dump(best_params, f)

  def load_model(self, repeat):
    self.model = joblib.load(f"{model_path}/{dataset_name}/{repeat}/{self.name}.pkl")

  def save_model(self, repeat):
    complete_path = f"{model_path}/{dataset_name}/{repeat}"
    path = Path(complete_path)
    path.mkdir(parents=True, exist_ok=True)

    _ = joblib.dump(self.model, f"{complete_path}/{self.name}.pkl", compress=4)
    print(self.model, f"{complete_path}/{self.name}")

  def train(self, x, y, repeat):
    self.load_params()
    self.model.fit(x, y)
    self.save_model(repeat)

  def test(self, x, repeat):
    self.load_model(repeat)
    return self.model.predict(x)

In [152]:
from sklearn.linear_model import LogisticRegression as LR

class VotingClassifierModel:
  def __post_init__(self, clf1, clf2, clf3):
    self.ensemble_clf = VotingClassifier(estimators=[
        ('gnb1', clf1), ('gnb2', clf2), ('gnb3', clf3)], voting='hard')

  def train(x, y):
    return self.ensemble_clf.fit(x, y)

  def test(x):
    return self.ensemble_clf.predict(x)

In [153]:
from sklearn.linear_model import LogisticRegression as LR

@dataclass
class LogisticRegressionModel(Model):
  def __post_init__(self):
    self.model = LR()
    self.name = "LR"

  def params(self):
    return {
        'penalty':["l2"],
        'C':[1, 10]
        }

In [154]:
storage_models = [
    LogisticRegressionModel(),
    # Others models
    ]

In [155]:
for model in storage_models:
  if model.name == selected_model:
    actual_model = model

### Dataset

In [156]:
class Dataset:
  @staticmethod
  def get_hiper_sample():
    df = pd.read_csv(f"{data_path}/{dataset_name}/repeat1/split1/train_fold.csv")
    y = df.iloc[:, 0]
    x = df.iloc[:, 1:]
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
    return x_val, y_val

  @staticmethod
  def get_sample_per_repeat_and_fold(repead_id: int, fold_id: int, is_train: bool = True):
    split_type = "train_fold" if is_train else "test_fold"
    df = pd.read_csv(f"{data_path}/{dataset_name}/repeat{repead_id}/split{fold_id}/{split_type}.csv")
    y = df.iloc[:, 0]
    x = df.iloc[:, 1:]
    return x, y

### Save results

In [157]:
class SaveResults:

  @staticmethod
  def save_test_results(y_predicted, y_real, repetition, fold):

    accuracy = accuracy_score(y_real, y_predicted)
    f1 = f1_score(y_real, y_predicted, average='weighted')
    recall = recall_score(y_real, y_predicted, average='weighted')
    precision = precision_score(y_real, y_predicted, average='weighted')

    result_df = pd.DataFrame({
        'Repetition': [repetition],
        'Fold': [fold],
        'Accuracy': [accuracy],
        'F1': [f1],
        'Recall': [recall],
        'Precision': [precision]
    })

    try:
        existing_df = pd.read_csv(f"{results_path}/results.csv")
        result_df = pd.concat([existing_df, result_df], ignore_index=True)
    except FileNotFoundError:
        pass

    result_df.to_csv(f"{results_path}/results.csv", index=False)

## Hyperparametrization (Grid Search)

In [158]:
x, y = Dataset.get_hiper_sample()
actual_model.run_hyper(x, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 1, 'penalty': 'l2'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Train

In [161]:
for repeat in range(repeat_id_start, repeat_id_end):
  for fold in range(fold_id_start, fold_id_end):
    x, y = Dataset.get_sample_per_repeat_and_fold(repeat, fold)
    actual_model.train(x, y, repeat)


LogisticRegression(C=1) /content/drive/MyDrive/Aulas/AprendizadoDeMáquina/Projeto/Dataset/Models/mfeat_fac/1/LR


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Test

In [163]:
for repeat in range(repeat_id_start, repeat_id_end):
  for fold in range(fold_id_start, fold_id_end):
    x, y = Dataset.get_sample_per_repeat_and_fold(repeat, fold, is_train=False)
    y_predicted = actual_model.test(x,repeat)
    SaveResults.save_test_results(y_predicted, y, repeat, fold)

    #print (y_predicted)