In [1]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import mlflow

from refactoring import data_loading, preprocess


# Tracking server and experiment that receive every run started below.
mlflow.set_tracking_uri("http://dev.logaritmia.mx:5000")
mlflow.set_experiment("/wikipedia")

# One seed shared by the split, SMOTE and the forest so that
# "Restart Kernel -> Run All" reproduces the same split, model and metric.
SEED = 42
TEST_SIZE = 0.3
N_ESTIMATORS = 100

with mlflow.start_run() as run:
    # Feature columns used both for outlier removal and as model inputs.
    main_columns = ['NEds', 'NActDays', 'pagesWomen', 'wikiprojWomen']
    raw_data = data_loading.load_wikipedia("notebooks/1.EDA_Gender_Gap_in_Spanish_WP/data/data.csv")
    transformed_data = preprocess.remove_outliers(raw_data, main_columns)

    # Hold out a test set; X_test / y_test are reused by the evaluation cell.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        transformed_data[main_columns],
        transformed_data["gender"],
        test_size=TEST_SIZE,
        random_state=SEED,
    )

    # imblearn's Pipeline is required here (not sklearn's) because it accepts
    # a sampler step such as SMOTE ahead of the scaler and the classifier.
    pipe = Pipeline(steps=[
        ('smote', SMOTE(random_state=SEED)),
        ('scaler', StandardScaler()),
        ('random_forest', RandomForestClassifier(n_estimators=N_ESTIMATORS,
                                                 random_state=SEED)),
    ])
    pipe.fit(X_train, y_train)

    # Record the knobs that determine the result next to the metric itself.
    mlflow.log_params({
        "seed": SEED,
        "test_size": TEST_SIZE,
        "n_estimators": N_ESTIMATORS,
    })
    mlflow.log_metric("accuracy", pipe.score(X_test, y_test))
    mlflow.sklearn.log_model(pipe, "model")


2024/10/13 18:12:18 INFO mlflow.tracking.fluent: Experiment with name '/wikipedia' does not exist. Creating a new experiment.
2024/10/13 18:12:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run painted-hound-910 at: http://dev.logaritmia.mx:5000/#/experiments/2/runs/383df3ba47344888a977962cc54a572b.
2024/10/13 18:12:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://dev.logaritmia.mx:5000/#/experiments/2.


In [2]:
import mlflow

from refactoring import evaluation


# Build the model URI from the run object bound by the training cell
# (`with mlflow.start_run() as run`) instead of pasting a run id from an old
# output: every execution of the training cell creates a new run, so a
# hard-coded id would load a stale model that does not match X_test / y_test.
logged_model = f"runs:/{run.info.run_id}/model"
loaded_model = mlflow.pyfunc.load_model(logged_model)

# X_test / y_test are the hold-out split produced by the training cell.
evaluation.evaluate_model(loaded_model, X_test, y_test)

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 16.32it/s] 

Error Cuadrático Medio (MSE) - Random Forest: 0.8441
Accuracy - Random Forest: 0.4213
Informe de clasificación - Random Forest:
              precision    recall  f1-score   support

           0       0.35      0.33      0.34       469
           1       0.60      0.48      0.54       808
           2       0.13      0.35      0.19       102

    accuracy                           0.42      1379
   macro avg       0.36      0.39      0.35      1379
weighted avg       0.48      0.42      0.44      1379




