In [None]:
import itertools
import time

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import mlflow

from refactoring import data_loading, preprocess


# Point the MLflow client at the tracking server and select the experiment
# that runs started from this notebook are recorded under.
TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "Wikipedia"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/4', creation_time=1729483784033, experiment_id='4', last_update_time=1729483784033, lifecycle_stage='active', name='Wikipedia', tags={}>

In [None]:
def train_new_model(num_estimators: int, max_depth, min_samples_split, min_samples_leaf, max_features, random_seed=None,
                    data_path="notebooks/1.EDA_Gender_Gap_in_Spanish_WP/data/data.csv"):
    """Train a SMOTE -> StandardScaler -> RandomForest pipeline inside one MLflow run.

    The data set is loaded with ``data_loading.load_wikipedia``, rows whose
    ``gender`` equals 0 are dropped, and the remainder is split 70/30 into
    train and test sets. The pipeline is fitted on the train split and scored
    on the test split. Hyper-parameters, the seed, the resulting accuracy and
    the fitted pipeline are logged to the active MLflow experiment.

    Args:
        num_estimators: Forwarded to ``RandomForestClassifier(n_estimators=...)``.
        max_depth: Forwarded to ``RandomForestClassifier(max_depth=...)``.
        min_samples_split: Forwarded to ``RandomForestClassifier(min_samples_split=...)``.
        min_samples_leaf: Forwarded to ``RandomForestClassifier(min_samples_leaf=...)``.
        max_features: Forwarded to ``RandomForestClassifier(max_features=...)``.
        random_seed: Seed shared by the train/test split, SMOTE and the forest.
            When ``None`` a seed is derived from the current time.
        data_path: Path handed to ``data_loading.load_wikipedia``. Defaults to
            the CSV this notebook has always used.

    Returns:
        None. Results are only recorded in MLflow.
    """
    if random_seed is None:
        # Millisecond wall-clock time folded into [0, 2**32) so it is usable as an integer seed.
        random_seed = int(time.time() * 1000) % 4294967296

    main_columns = ['NEds', 'NActDays', 'pagesWomen', 'wikiprojWomen']
    hyperparameters = {
        "n_estimators": num_estimators,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "max_features": max_features,
        "random_state": random_seed,
    }

    with mlflow.start_run():
        # Log the configuration first so it is kept even if loading or fitting fails.
        mlflow.log_params(hyperparameters)

        raw_data = data_loading.load_wikipedia(data_path)
        # Only rows with a non-zero gender value are used; preprocess.remove_outliers is not applied here.
        transformed_data = raw_data[raw_data.gender != 0]

        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            transformed_data[main_columns],
            transformed_data["gender"],
            test_size=0.3,
            random_state=random_seed,
        )

        pipe = Pipeline(steps=[
            ('smote', SMOTE(random_state=random_seed)),
            ('scaler', StandardScaler()),
            ('random_forest', RandomForestClassifier(n_estimators=num_estimators, max_depth=max_depth,
                                                     min_samples_split=min_samples_split,
                                                     min_samples_leaf=min_samples_leaf,
                                                     max_features=max_features,
                                                     random_state=random_seed))
        ])
        pipe.fit(X_train, y_train)

        accuracy = pipe.score(X_test, y_test)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(pipe, "model")

In [4]:
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [None, 10, 20, 30],  # Maximum tree depth
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],    # Minimum number of samples in a leaf node
    'bootstrap': [True, False],       # Whether to sample with replacement
    'max_features': ['sqrt', 'log2'], # Number of features considered for the best split
}

# Keys that are actually searched, in the positional order train_new_model expects.
# NOTE(review): 'bootstrap' is declared in param_grid but train_new_model has no
# matching parameter, so it is not part of the search.
searched_params = ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features']

# Exhaustive grid search: one MLflow run per combination. itertools.product varies the
# right-most key fastest, which is the same order the nested loops used.
for n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features in itertools.product(
        *(param_grid[name] for name in searched_params)):
    train_new_model(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features)

2024/10/20 22:09:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run bald-ram-289 at: http://dev.logaritmia.mx:5000/#/experiments/4/runs/a039487f0d97438f878ad12b2079c46c.
2024/10/20 22:09:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://dev.logaritmia.mx:5000/#/experiments/4.
2024/10/20 22:09:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run omniscient-doe-832 at: http://dev.logaritmia.mx:5000/#/experiments/4/runs/fbae28563cb94d9d9fb4ea4cd328f977.
2024/10/20 22:09:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://dev.logaritmia.mx:5000/#/experiments/4.
2024/10/20 22:09:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run bold-calf-400 at: http://dev.logaritmia.mx:5000/#/experiments/4/runs/57e539a0288c453fb7824d80a00a5ee4.
2024/10/20 22:09:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://dev.logaritmia.mx:5000/#/experiments/4.
2024/10/20 22:09:57 INFO mlflow.tracking._tra

In [11]:
# Train and log a single random-forest configuration (seed is derived from the clock).
train_new_model(
    num_estimators=200,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=2,
    max_features='sqrt',
)

2024/11/03 22:00:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run omniscient-shark-372 at: http://dev.logaritmia.mx:5000/#/experiments/4/runs/0ad45d083ed1487798d02269bf831f00.
2024/11/03 22:00:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://dev.logaritmia.mx:5000/#/experiments/4.


In [3]:
from sklearn.linear_model import LogisticRegression


# Baseline: SMOTE -> StandardScaler -> LogisticRegression, tracked as one MLflow run.
# No random_state is passed to the split or to SMOTE, so results vary between executions.
# X_train / X_test / y_train / y_test stay in the notebook namespace and are read by later cells.
with mlflow.start_run() as run:
    main_columns = ['NEds', 'NActDays', 'pagesWomen', 'wikiprojWomen']
    raw_data = data_loading.load_wikipedia("notebooks/1.EDA_Gender_Gap_in_Spanish_WP/data/data.csv")
    transformed_data = preprocess.remove_outliers(raw_data, main_columns)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(transformed_data[main_columns],
                                                                        transformed_data["gender"],
                                                                        test_size=0.3)
    solver = 'lbfgs'
    multi_class = 'multinomial'
    pipe = Pipeline(steps=[
        ('smote', SMOTE()),
        ('scaler', StandardScaler()),
        ('logistic_regression', LogisticRegression(solver=solver, multi_class=multi_class))
    ])
    pipe.fit(X_train, y_train)
    accuracy = pipe.score(X_test, y_test)
    # Log every estimator setting exactly once ("solver" used to be logged twice
    # while the multi_class setting was never recorded).
    mlflow.log_param("solver", solver)
    mlflow.log_param("multi_class", multi_class)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.sklearn.log_model(pipe, "model")

2024/11/03 21:44:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run inquisitive-rat-448 at: http://dev.logaritmia.mx:5000/#/experiments/4/runs/280de4da305941c090da3934f169509a.
2024/11/03 21:44:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://dev.logaritmia.mx:5000/#/experiments/4.


In [19]:
# Deep Neural Network Model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
import keras
import mlflow.keras
import numpy as np

# Let MLflow capture Keras training parameters and per-epoch metrics automatically.
mlflow.keras.autolog()

# Relies on X_train / X_test / y_train / y_test already existing in the notebook namespace.
with mlflow.start_run() as run:
    input_dim = X_train.shape[1]
    num_classes = len(np.unique(y_train))

    # Fully connected classifier: 128 -> 64 -> 32 hidden units with dropout after
    # the first two hidden layers, and a softmax output with one unit per class.
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    loss = 'sparse_categorical_crossentropy'
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    # Fit for 50 epochs; the test split doubles as the validation set.
    history = model.fit(
        X_train,
        y_train,
        epochs=50,
        batch_size=64,
        validation_data=(X_test, y_test),
        verbose=1,
    )

    # Final score on the test split; note that `loss` is rebound to the numeric loss here.
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    mlflow.log_metric("accuracy", accuracy)

    mlflow.keras.log_model(model=model, artifact_path="model")



Epoch 1/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4187 - loss: 1.0734 - val_accuracy: 0.3821 - val_loss: 1.0684
Epoch 2/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4992 - loss: 1.0163 - val_accuracy: 0.4474 - val_loss: 1.0333
Epoch 3/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4990 - loss: 1.0062 - val_accuracy: 0.4396 - val_loss: 1.0292
Epoch 4/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5179 - loss: 0.9900 - val_accuracy: 0.4540 - val_loss: 1.0179
Epoch 5/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5114 - loss: 0.9829 - val_accuracy: 0.4585 - val_loss: 1.0088
Epoch 6/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5196 - loss: 0.9757 - val_accuracy: 0.4673 - val_loss: 1.0087
Epoch 7/50
[1m99/99[0m [32m━━━━━━━━━━

2024/10/20 02:32:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run dashing-horse-353 at: http://localhost:5000/#/experiments/3/runs/dff792fed79140979cc431ab48f163ef.
2024/10/20 02:32:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/3.


In [20]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# NOTE(review): `mlflow_helper` is not imported in any cell visible here, and `np`,
# `X_test` and `y_test` come from earlier cells; confirm they exist on a fresh kernel.
loaded_model, last_run = mlflow_helper.load_last_mlflow_model("http://localhost:5000", "wikipedia")

# Per-class scores for every test row; the column index of the largest score is
# taken as the predicted class label.
y_pred_proba = loaded_model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)

# Classification metrics against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report accuracy first, then the per-class breakdown under its heading.
print("Accuracy:", accuracy)
print("Classification Report:", report, sep="\n")

Latest run ID: dff792fed79140979cc431ab48f163ef


Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.4673311184939092
Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.54      0.49       307
           1       0.70      0.42      0.52       528
           2       0.16      0.51      0.24        68

    accuracy                           0.47       903
   macro avg       0.44      0.49      0.42       903
weighted avg       0.57      0.47      0.49       903



In [7]:
# Register the "model" artifact of one specific, already-logged run in the
# MLflow Model Registry under the name "wikipedia_genre".
run_id = "09e5ebd61aa14e42a900adf9bc2c7dd4"
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="wikipedia_genre")

Registered model 'wikipedia_genre' already exists. Creating a new version of this model...
2024/10/20 22:27:06 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: wikipedia_genre, version 3
Created version '3' of model 'wikipedia_genre'.


<ModelVersion: aliases=[], creation_timestamp=1729484826656, current_stage='None', description='', last_updated_timestamp=1729484826656, name='wikipedia_genre', run_id='09e5ebd61aa14e42a900adf9bc2c7dd4', run_link='', source='mlflow-artifacts:/4/09e5ebd61aa14e42a900adf9bc2c7dd4/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='3'>

In [9]:
# API TEST
# docker build -t fastapi-mlflow-app .
# docker run -p 8000:8000 fastapi-mlflow-app
import requests
import json

# Define the API endpoint
url = "http://localhost:8000/train"

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# indefinitely if the service never answers and the cell would hang.
request_timeout = (5, 120)

# Define the data payload
payload = {
    "X_train": [[1.0, 2.0], [3.0, 4.0]],
    "y_train": [0, 1],
    "X_test": [[5.0, 6.0]],
    "y_test": [1],
    "classifier_type": "logistic_regression",
    "hyperparameters": {"C": 1.0, "max_iter": 100}
}

# Set the headers
headers = {
    "Content-Type": "application/json"
}

# Send the POST request
response = requests.post(url, data=json.dumps(payload), headers=headers, timeout=request_timeout)

# Check if the request was successful
if response.status_code == 200:
    print("Request was successful!")
    print(f"Response: {response.json()}")
else:
    print(f"Request failed with status code {response.status_code}")
    print(f"Error: {response.text}")

Request was successful!
Response: {'status': 'Model trained and logged successfully'}


In [12]:
# NOTE(review): `mlflow_helper` and `evaluation` are not imported in any cell visible
# here, and X_test / y_test come from an earlier cell; confirm on a fresh kernel.
tracking_uri = "http://localhost:5000"
experiment_name = "wikipedia"

# Fetch the model from the helper, then evaluate it on the test split.
loaded_model, last_run = mlflow_helper.load_last_mlflow_model(tracking_uri, experiment_name)
evaluation.evaluate_model(loaded_model, X_test, y_test, last_run)

Latest run ID: 2e9f29d8e20a4588ab25163618de1b17


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

 - mlflow (current: 2.16.2, required: mlflow==2.17.0)
 - cloudpickle (current: 3.0.0, required: cloudpickle==3.1.0)
 - numpy (current: 1.26.4, required: numpy==2.1.2)
 - pandas (current: 2.2.2, required: pandas==2.2.3)
 - scikit-learn (current: 1.5.0, required: scikit-learn==1.5.2)
 - scipy (current: 1.13.1, required: scipy==1.14.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


Error Cuadrático Medio (MSE) - Pipeline: 0.918
Accuracy - Pipeline: 0.4463
Informe de clasificación - Pipeline:
              precision    recall  f1-score   support

           0       0.46      0.46      0.46       307
           1       0.67      0.42      0.52       528
           2       0.14      0.56      0.23        68

    accuracy                           0.45       903
   macro avg       0.43      0.48      0.40       903
weighted avg       0.56      0.45      0.48       903



https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
