### LogClassifier

In [2]:
import os

import yaml
import pandas as pd
import numpy as np
from joblib import dump

# Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling Libraries
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import mlflow
import mlflow.sklearn
from mlflow import MlflowClient
import mlflow.pyfunc

# Setting Parent Folder
%cd ..

# Local Functions
from src.data.data_transformation import data_transformer

print('Libraries loaded')

c:\Users\meo1slp\Desktop\ITESM MAI\MLOps\itesm_tc5044_10_mlops_equipo22
Libraries loaded


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


### 1. Load Config

In [3]:
# Load the run configuration from params.yaml.
# The encoding is passed explicitly: without it open() falls back to the
# platform locale, which may not be UTF-8.
with open('params.yaml', encoding='utf-8') as conf_file:
    config = yaml.safe_load(conf_file)

print(config)

{'base': None, 'data': {'input_data': 'data/Steel_industry_data.csv'}, 'train': {'test_size': 0.2, 'random_state': 42, 'activation': 'relu', 'activation_2': 'softmax', 'optimizer': 'adam', 'loss': 'sparse_categorical_crossentropy', 'epochs': 50, 'batch_size': 10, 'verbose': 1, 'axis': -1}, 'reports': {'model': 'models/steel_industry_model.keras'}}


### 2. Load Dataset

In [4]:
# Read the dataset from the path declared in the configuration
input_path = config['data']['input_data']
data = pd.read_csv(input_path)
data.head()

Unnamed: 0,date,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type
0,01/01/2018 00:15,3.17,2.95,0.0,0.0,73.21,100.0,900,Weekday,Monday,Light_Load
1,01/01/2018 00:30,4.0,4.46,0.0,0.0,66.77,100.0,1800,Weekday,Monday,Light_Load
2,01/01/2018 00:45,3.24,3.28,0.0,0.0,70.28,100.0,2700,Weekday,Monday,Light_Load
3,01/01/2018 01:00,3.31,3.56,0.0,0.0,68.09,100.0,3600,Weekday,Monday,Light_Load
4,01/01/2018 01:15,3.82,4.5,0.0,0.0,64.72,100.0,4500,Weekday,Monday,Light_Load


In [5]:
# Separate the target column from the feature columns
y = data['Load_Type']
X = data.drop(columns=['date', 'Load_Type'])

# Hold-out split; size and seed come from the configuration
split_params = config['train']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=split_params['test_size'],
    random_state=split_params['random_state'],
)

In [6]:
# Column names routed to the numeric and to the categorical preprocessing branch
numeric_features = [
    'Usage_kWh',
    'Lagging_Current_Reactive.Power_kVarh',
    'Leading_Current_Reactive_Power_kVarh',
    'CO2(tCO2)',
    'Lagging_Current_Power_Factor',
    'Leading_Current_Power_Factor',
    'NSM',
]
categorical_features = [
    'WeekStatus',
    'Day_of_week',
]

In [8]:
# Numeric branch: standardise, then apply PCA with n_components=0.95
numeric_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
])

# Categorical branch: one-hot encode, dropping the first level of each feature
categorical_encoder = OneHotEncoder(drop='first')

# Route each group of columns to its branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_encoder, categorical_features),
])

In [9]:
# End-to-end model: preprocessing followed by a random forest.
# The forest is seeded from the configuration so that the fitted model and the
# metrics reported below are reproducible across runs.
rfc_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=config['train']['random_state']))
])

rfc_model.fit(X_train, y_train)

In [10]:
# Predict on the held-out test set
y_pred = rfc_model.predict(X_test)

# Accuracy plus macro-averaged precision, recall and F1
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

# Report every metric with four decimals
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{label}: {value:.4f}")

Accuracy: 0.8713
Precision: 0.8424
Recall: 0.8400
F1 Score: 0.8412


In [11]:
import pickle

# Persist the fitted pipeline (`rfc_model`) to a local file.
# The target folder is created first so the write does not fail when
# ./models does not exist yet.
os.makedirs('./models', exist_ok=True)
with open('./models/rfc_model.pkl', 'wb') as file:
    pickle.dump(rfc_model, file)

### Load the saved rfc_model

In [15]:
import pickle

# Restore the pipeline that was pickled to disk
with open('./models/rfc_model.pkl', 'rb') as model_file:
    loaded_pipeline = pickle.load(model_file)

# The restored pipeline predicts straight from the raw feature frame,
# shown here on the test set
y_pred = loaded_pipeline.predict(X_test)

y_pred

array(['Light_Load', 'Maximum_Load', 'Maximum_Load', ..., 'Maximum_Load',
       'Light_Load', 'Maximum_Load'], dtype=object)

### Log model in MLflow

In [None]:
# Set up the MLflow experiment
mlflow.set_experiment("Steel_Industry_Classification")

# Hyperparameters are declared once and reused both to build the classifier and
# to log the run, so the logged values cannot drift from the trained model.
rf_params = {
    "max_depth": None,
    "min_samples_split": 2,
    "n_estimators": 200,
    "random_state": config['train']['random_state'],
}

with mlflow.start_run():
    # Train a Random Forest classifier behind the preprocessing step.
    # X_train still holds the raw string columns listed in
    # `categorical_features`, which a bare RandomForestClassifier cannot fit,
    # so they are encoded first. The preprocessor is cloned so this run does
    # not refit the instance that `rfc_model` already owns.
    model = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', RandomForestClassifier(**rf_params)),
    ])
    model.fit(X_train, y_train)

    # Make predictions and evaluate the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Log parameters and metrics to MLflow
    mlflow.log_param("model_type", "Random Forest Classifier")
    mlflow.log_params(rf_params)
    mlflow.log_metric("accuracy", accuracy)

    # Log the whole pipeline so the stored model accepts the raw feature frame
    mlflow.sklearn.log_model(model, "model")

    print(f"Model accuracy: {accuracy}")