In [1]:
import itertools

import mlflow
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

from refactoring import data_loading, preprocess


# Tracking server and experiment that every run started from this notebook is recorded under.
TRACKING_URI = "http://dev.logaritmia.mx:5000"
EXPERIMENT_NAME = "/wikipedia"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/2', creation_time=1728864738995, experiment_id='2', last_update_time=1728864738995, lifecycle_stage='active', name='/wikipedia', tags={}>

In [4]:
def train_new_model(num_estimators: int, max_depth=None, min_samples_split=2, min_samples_leaf=1,
                    max_features='sqrt', random_state=None):
    """Train a SMOTE + scaler + random-forest pipeline and record it as one MLflow run.

    The Wikipedia CSV is loaded, passed through ``preprocess.remove_outliers`` for the
    four feature columns, and split 70/30 with ``gender`` as the target. The pipeline is
    fitted on the training part, scored on the test part, and the accuracy, all
    hyperparameters and the fitted pipeline are logged to the active MLflow experiment.

    Args:
        num_estimators: Number of trees in the forest (logged as ``n_estimators``).
        max_depth: Maximum tree depth; ``None`` places no limit.
        min_samples_split: Minimum number of samples required to split a node.
        min_samples_leaf: Minimum number of samples required at a leaf.
        max_features: Number of features considered when looking for the best split.
        random_state: Optional seed forwarded to the train/test split, SMOTE and the
            forest. ``None`` (the default) keeps the run unseeded, as before.

    Returns:
        None. Results are available through the MLflow run.
    """
    with mlflow.start_run():
        main_columns = ['NEds', 'NActDays', 'pagesWomen', 'wikiprojWomen']
        raw_data = data_loading.load_wikipedia("notebooks/1.EDA_Gender_Gap_in_Spanish_WP/data/data.csv")
        transformed_data = preprocess.remove_outliers(raw_data, main_columns)
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            transformed_data[main_columns],
            transformed_data["gender"],
            test_size=0.3,
            random_state=random_state,
        )

        # NOTE(review): SMOTE runs before the scaler, so it sees unscaled features.
        # Confirm this ordering is intended.
        pipe = Pipeline(steps=[
            ('smote', SMOTE(random_state=random_state)),
            ('scaler', StandardScaler()),
            ('random_forest', RandomForestClassifier(n_estimators=num_estimators,
                                                     max_depth=max_depth,
                                                     min_samples_split=min_samples_split,
                                                     min_samples_leaf=min_samples_leaf,
                                                     max_features=max_features,
                                                     random_state=random_state)),
        ])
        pipe.fit(X_train, y_train)
        accuracy = pipe.score(X_test, y_test)

        # Log every hyperparameter, not just the tree count, so that runs produced by a
        # grid search can be told apart and compared in the MLflow UI.
        mlflow.log_params({
            "n_estimators": num_estimators,
            "max_depth": max_depth,
            "min_samples_split": min_samples_split,
            "min_samples_leaf": min_samples_leaf,
            "max_features": max_features,
            "random_state": random_state,
        })
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(pipe, "model")

In [3]:
# Baseline run with 100 trees. The remaining hyperparameters are passed explicitly
# (using scikit-learn's usual random-forest defaults) because calling train_new_model
# with only num_estimators fails when the function declares them as required.
train_new_model(
    num_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
)

2024/10/20 12:56:06 INFO mlflow.tracking._tracking_service.client: 🏃 View run whimsical-zebra-93 at: http://dev.logaritmia.mx:5000/#/experiments/2/runs/34c9db5239b34c239ed134c3337fafa4.
2024/10/20 12:56:06 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://dev.logaritmia.mx:5000/#/experiments/2.


In [5]:
param_grid = {
    'n_estimators': [100, 200, 300],   # number of trees
    'max_depth': [None, 10, 20, 30],   # maximum tree depth
    'min_samples_split': [2, 5, 10],   # minimum number of samples needed to split a node
    'min_samples_leaf': [1, 2, 4],     # minimum number of samples in a leaf node
    'bootstrap': [True, False],        # whether to sample with replacement
    'max_features': ['sqrt', 'log2'],  # number of features considered for the best split
}

# Grid keys that are forwarded to train_new_model, listed in the order of its positional
# arguments. NOTE(review): 'bootstrap' is declared in the grid above but is not forwarded
# here, so it is not part of the search.
searched_keys = ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features']

# itertools.product varies the right-most key fastest, which is the same visiting order
# as nesting one for-loop per key.
for combination in itertools.product(*(param_grid[key] for key in searched_keys)):
    train_new_model(*combination)

2024/10/20 13:14:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run silent-flea-394 at: http://dev.logaritmia.mx:5000/#/experiments/2/runs/8c440c3803854fa18440fbb14623789c.
2024/10/20 13:14:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://dev.logaritmia.mx:5000/#/experiments/2.
2024/10/20 13:14:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run illustrious-bird-793 at: http://dev.logaritmia.mx:5000/#/experiments/2/runs/8072a2c7fe6c4218a9f06aac33989a4b.
2024/10/20 13:14:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://dev.logaritmia.mx:5000/#/experiments/2.
2024/10/20 13:14:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run bold-skink-334 at: http://dev.logaritmia.mx:5000/#/experiments/2/runs/d2fa9f7e2ecb44fb8137de428568813f.
2024/10/20 13:14:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://dev.logaritmia.mx:5000/#/experiments/2.
2024/10/20 13:14:24 INFO mlflow.trackin

In [6]:
# Run whose logged "model" artifact is promoted to the model registry.
run_id = "b8048c7133264eb684d8a95299555b1f"
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="wikipedia_genre")

Successfully registered model 'wikipedia_genre'.
2024/10/20 14:07:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: wikipedia_genre, version 1
Created version '1' of model 'wikipedia_genre'.


<ModelVersion: aliases=[], creation_timestamp=1729454855659, current_stage='None', description='', last_updated_timestamp=1729454855659, name='wikipedia_genre', run_id='b8048c7133264eb684d8a95299555b1f', run_link='', source='mlflow-artifacts:/2/b8048c7133264eb684d8a95299555b1f/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [2]:
# Setup training data.
# An earlier, narrower feature set (E_NEds, E_Bpag, NEds, NDays, NActDays, NPages,
# NPcreated, pagesWomen, wikiprojWomen, weightIJ, NIJ) was replaced by the list below.
columns = [
    "E_NEds", "E_Bpag",
    "firstDay", "lastDay",
    "NEds", "NDays", "NActDays",
    "NPages", "NPcreated",
    "pagesWomen", "wikiprojWomen",
    "ns_user", "ns_wikipedia", "ns_talk", "ns_userTalk", "ns_content",
    "weightIJ", "NIJ",
]
X_train, X_test, y_train, y_test = preprocess.get_data(columns)

In [3]:
# Random Forest model: fit, score on the test split, and log metric + model in one MLflow run.

with mlflow.start_run() as run:
    rf_model = Pipeline(steps=[
        (
            'random_forest',
            RandomForestClassifier(
                n_estimators=200,
                max_depth=None,
                max_features='sqrt',
                min_samples_split=2,
                min_samples_leaf=2,
                bootstrap=True,
                random_state=42,
            ),
        ),
    ])
    rf_model.fit(X_train, y_train)

    accuracy = rf_model.score(X_test, y_test)
    mlflow.log_metric("accuracy", accuracy)

    # Store the fitted pipeline together with its input/output schema and a one-row example.
    mlflow.sklearn.log_model(
        sk_model=rf_model,
        artifact_path="model",
        signature=infer_signature(X_train, rf_model.predict(X_train)),
        input_example=X_train.head(1),
    )


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/10/20 02:20:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run capricious-loon-471 at: http://localhost:5000/#/experiments/3/runs/f889c8df9fa543a9b45993ab45e0f598.
2024/10/20 02:20:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/3.


In [4]:

# Load a model and its run from the "wikipedia" experiment on the local tracking server,
# then evaluate it on the held-out split.
# NOTE(review): presumably the helper returns the model of the most recent run (it prints
# "Latest run ID"), so this cell depends on which training cell was executed last.
loaded_model,last_run = mlflow_helper.load_last_mlflow_model("http://localhost:5000", "wikipedia")
evaluation.evaluate_model(loaded_model, X_test, y_test, last_run)

Latest run ID: f889c8df9fa543a9b45993ab45e0f598


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Error Cuadrático Medio (MSE) - RandomForestClassifier: 0.7389
Accuracy - RandomForestClassifier: 0.557
Informe de clasificación - RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.47      0.42      0.45       307
           1       0.64      0.68      0.66       528
           2       0.22      0.24      0.23        68

    accuracy                           0.56       903
   macro avg       0.45      0.44      0.44       903
weighted avg       0.55      0.56      0.55       903



In [5]:
# Logistic Regression model: fit, score on the test split, and log metric + model in one MLflow run.
with mlflow.start_run() as run:
    logistic_model = Pipeline(steps=[
        ('logistic_regression', LogisticRegression(random_state=42, solver='lbfgs', multi_class='multinomial'))
    ])

    logistic_model.fit(X_train, y_train)
    logistic_accuracy = logistic_model.score(X_test, y_test)
    mlflow.log_metric("logistic_accuracy", logistic_accuracy)

    # Log the model to MLflow. The signature is inferred from this model's own
    # predictions, so the stored output schema describes the model being logged rather
    # than a different estimator left in the kernel.
    mlflow.sklearn.log_model(
        sk_model=logistic_model,
        artifact_path="model",
        signature=infer_signature(X_train, logistic_model.predict(X_train)),
        input_example=X_train.head(1))



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/10/20 02:20:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run likeable-stag-755 at: http://localhost:5000/#/experiments/3/runs/2b441c6cf87b4ab8bc3895f282422365.
2024/10/20 02:20:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/3.


In [6]:
# Load a model and its run from the "wikipedia" experiment on the local tracking server,
# then evaluate it on the held-out split.
# NOTE(review): presumably the helper returns the model of the most recent run (it prints
# "Latest run ID"), so this cell depends on which training cell was executed last.
loaded_model, last_run = mlflow_helper.load_last_mlflow_model("http://localhost:5000", "wikipedia")
evaluation.evaluate_model(loaded_model, X_test, y_test, last_run)

Latest run ID: 2b441c6cf87b4ab8bc3895f282422365


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Error Cuadrático Medio (MSE) - LogisticRegression: 0.918
Accuracy - LogisticRegression: 0.4463
Informe de clasificación - LogisticRegression:
              precision    recall  f1-score   support

           0       0.46      0.46      0.46       307
           1       0.67      0.42      0.52       528
           2       0.14      0.56      0.23        68

    accuracy                           0.45       903
   macro avg       0.43      0.48      0.40       903
weighted avg       0.56      0.45      0.48       903



In [19]:
# Deep Neural Network Model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
import keras
import mlflow.keras
import numpy as np

# Enable MLflow autologging for Keras before the run starts.
mlflow.keras.autolog()

with mlflow.start_run() as run:
    input_dim = X_train.shape[1]
    num_classes = len(np.unique(y_train))

    # Seed before any layer is created so weight initialisation is repeatable.
    keras.utils.set_random_seed(42)

    # Three hidden ReLU layers (128 -> 64 -> 32) with dropout after the first two,
    # ending in a softmax over the classes found in y_train.
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Train the model; the test split is passed as validation data for per-epoch monitoring.
    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        batch_size=64,
        epochs=50,
        verbose=1,
    )

    # Evaluate the model on the test split and record the final accuracy.
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    mlflow.log_metric("accuracy", accuracy)

    mlflow.keras.log_model(model=model, artifact_path="model")



Epoch 1/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4187 - loss: 1.0734 - val_accuracy: 0.3821 - val_loss: 1.0684
Epoch 2/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4992 - loss: 1.0163 - val_accuracy: 0.4474 - val_loss: 1.0333
Epoch 3/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4990 - loss: 1.0062 - val_accuracy: 0.4396 - val_loss: 1.0292
Epoch 4/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5179 - loss: 0.9900 - val_accuracy: 0.4540 - val_loss: 1.0179
Epoch 5/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5114 - loss: 0.9829 - val_accuracy: 0.4585 - val_loss: 1.0088
Epoch 6/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5196 - loss: 0.9757 - val_accuracy: 0.4673 - val_loss: 1.0087
Epoch 7/50
[1m99/99[0m [32m━━━━━━━━━━

2024/10/20 02:32:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run dashing-horse-353 at: http://localhost:5000/#/experiments/3/runs/dff792fed79140979cc431ab48f163ef.
2024/10/20 02:32:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/3.


In [20]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

loaded_model, last_run = mlflow_helper.load_last_mlflow_model("http://localhost:5000", "wikipedia")

# The model returns one score per class for each row; the column with the highest
# score is taken as the predicted label.
y_pred_proba = loaded_model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)

# Compute the three classification summaries from the same (y_test, y_pred) pair.
accuracy, report, conf_matrix = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

# Print metrics
print("Accuracy:", accuracy)
print("Classification Report:", report, sep="\n")

Latest run ID: dff792fed79140979cc431ab48f163ef


Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.4673311184939092
Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.54      0.49       307
           1       0.70      0.42      0.52       528
           2       0.16      0.51      0.24        68

    accuracy                           0.47       903
   macro avg       0.44      0.49      0.42       903
weighted avg       0.57      0.47      0.49       903

