In [1]:
from imblearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import mlflow
from mlflow.models.signature import infer_signature
from refactoring import data_loading, preprocess, mlflow_helper, evaluation
# Connection settings for the tracking server and the experiment that the
# runs started in this notebook are recorded under.
MLFLOW_TRACKING_URI = "http://localhost:5000"
MLFLOW_EXPERIMENT = "wikipedia"

# Ask the project helper to start MLflow, then point the client at it.
mlflow_helper.start_mlflow()
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Kept as the final expression so the selected Experiment is displayed.
mlflow.set_experiment(MLFLOW_EXPERIMENT)

Docker Desktop is already running.
MLflow is already running.


<Experiment: artifact_location='mlflow-artifacts:/3', creation_time=1729384150209, experiment_id='3', last_update_time=1729384150209, lifecycle_stage='active', name='wikipedia', tags={}>

In [2]:
# Feature columns requested from the preprocessing helper. An earlier
# iteration used a smaller subset that left out firstDay/lastDay and the
# ns_* columns.
columns = [
    "E_NEds", "E_Bpag",
    "firstDay", "lastDay",
    "NEds", "NDays", "NActDays",
    "NPages", "NPcreated",
    "pagesWomen", "wikiprojWomen",
    "ns_user", "ns_wikipedia", "ns_talk", "ns_userTalk", "ns_content",
    "weightIJ", "NIJ",
]

# Train/test features and targets as produced by the project's preprocessing.
X_train, X_test, y_train, y_test = preprocess.get_data(columns)

# Enable scikit-learn autologging, with post-training metric logging disabled.
mlflow.sklearn.autolog(log_post_training_metrics=False)


In [3]:
# Random Forest model: fit, score on the test split, and log to MLflow.

# Hyperparameters are kept together so they are easy to read and tweak.
rf_params = dict(
    n_estimators=200,
    max_depth=None,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=2,
    bootstrap=True,
    random_state=42,
)

with mlflow.start_run() as run:
    # Single-step pipeline wrapping the classifier.
    rf_model = Pipeline(steps=[('random_forest', RandomForestClassifier(**rf_params))])
    rf_model.fit(X_train, y_train)

    # Test-split score, recorded under the "accuracy" metric.
    accuracy = rf_model.score(X_test, y_test)
    mlflow.log_metric("accuracy", accuracy)

    # Signature inferred from the training inputs and this model's predictions.
    rf_signature = infer_signature(X_train, rf_model.predict(X_train))

    # Log the fitted pipeline with its signature and a one-row input example.
    mlflow.sklearn.log_model(
        sk_model=rf_model,
        artifact_path="model",
        signature=rf_signature,
        input_example=X_train.head(1),
    )


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/10/20 02:20:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run capricious-loon-471 at: http://localhost:5000/#/experiments/3/runs/f889c8df9fa543a9b45993ab45e0f598.
2024/10/20 02:20:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/3.


In [4]:

# Fetch the model and run returned by the project helper for the "wikipedia"
# experiment (the helper's name suggests the most recent run), then hand both
# to the evaluation helper together with the held-out split.
latest_logged = mlflow_helper.load_last_mlflow_model("http://localhost:5000", "wikipedia")
loaded_model, last_run = latest_logged
evaluation.evaluate_model(loaded_model, X_test, y_test, last_run)

Latest run ID: f889c8df9fa543a9b45993ab45e0f598


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Error Cuadrático Medio (MSE) - RandomForestClassifier: 0.7389
Accuracy - RandomForestClassifier: 0.557
Informe de clasificación - RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.47      0.42      0.45       307
           1       0.64      0.68      0.66       528
           2       0.22      0.24      0.23        68

    accuracy                           0.56       903
   macro avg       0.45      0.44      0.44       903
weighted avg       0.55      0.56      0.55       903



In [5]:
# Logistic Regression model: fit, score on the test split, and log to MLflow.
with mlflow.start_run() as run:
    # Single-step pipeline wrapping the classifier.
    logistic_model = Pipeline(steps=[
        ('logistic_regression', LogisticRegression(random_state=42,solver='lbfgs',multi_class='multinomial'))
    ])

    logistic_model.fit(X_train, y_train)

    # Test-split score, recorded under the "logistic_accuracy" metric.
    logistic_accuracy = logistic_model.score(X_test, y_test)
    mlflow.log_metric("logistic_accuracy", logistic_accuracy)

    # Infer the signature from THIS model's predictions. The previous version
    # called rf_model.predict here, which described a different model and made
    # the cell depend on the random forest cell having been executed first.
    logistic_signature = infer_signature(X_train, logistic_model.predict(X_train))

    # Log the model to MLflow
    mlflow.sklearn.log_model(
        sk_model=logistic_model,
        artifact_path="model",
        signature=logistic_signature,
        input_example=X_train.head(1),
    )



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/10/20 02:20:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run likeable-stag-755 at: http://localhost:5000/#/experiments/3/runs/2b441c6cf87b4ab8bc3895f282422365.
2024/10/20 02:20:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/3.


In [6]:
# Reload whatever model/run pair the project helper returns for the
# "wikipedia" experiment and evaluate it on the held-out split. This cell is
# meant to run right after the training cell above it.
latest_logged = mlflow_helper.load_last_mlflow_model("http://localhost:5000", "wikipedia")
loaded_model, last_run = latest_logged
evaluation.evaluate_model(loaded_model, X_test, y_test, last_run)

Latest run ID: 2b441c6cf87b4ab8bc3895f282422365


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Error Cuadrático Medio (MSE) - LogisticRegression: 0.918
Accuracy - LogisticRegression: 0.4463
Informe de clasificación - LogisticRegression:
              precision    recall  f1-score   support

           0       0.46      0.46      0.46       307
           1       0.67      0.42      0.52       528
           2       0.14      0.56      0.23        68

    accuracy                           0.45       903
   macro avg       0.43      0.48      0.40       903
weighted avg       0.56      0.45      0.48       903



In [19]:
# Deep Neural Network Model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
import keras
import mlflow.keras
import numpy as np

# Enable MLflow autologging for Keras.
mlflow.keras.autolog()

# Training settings, gathered here so they are easy to find and tweak.
dnn_epochs = 50
dnn_batch_size = 64
loss_name = 'sparse_categorical_crossentropy'

with mlflow.start_run() as run:
    # Network size follows the data: one input per feature column and one
    # softmax output per distinct label found in the training targets.
    input_dim = X_train.shape[1]
    num_classes = len(np.unique(y_train))

    # Seed before any layer is created. Layers are built and added one at a
    # time, in this exact order, so the seeded initialisation stays the same.
    keras.utils.set_random_seed(42)
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss=loss_name, metrics=['accuracy'])

    # Train the model.
    # NOTE(review): the test split is also passed as validation data here.
    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        batch_size=dnn_batch_size,
        epochs=dnn_epochs,
        verbose=1,
    )

    # Evaluate on the test split; `loss` and `accuracy` hold the final values.
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    mlflow.log_metric("accuracy", accuracy)

    # Log the trained network explicitly under the "model" artifact path.
    mlflow.keras.log_model(model=model, artifact_path="model")



Epoch 1/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4187 - loss: 1.0734 - val_accuracy: 0.3821 - val_loss: 1.0684
Epoch 2/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4992 - loss: 1.0163 - val_accuracy: 0.4474 - val_loss: 1.0333
Epoch 3/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4990 - loss: 1.0062 - val_accuracy: 0.4396 - val_loss: 1.0292
Epoch 4/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5179 - loss: 0.9900 - val_accuracy: 0.4540 - val_loss: 1.0179
Epoch 5/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5114 - loss: 0.9829 - val_accuracy: 0.4585 - val_loss: 1.0088
Epoch 6/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5196 - loss: 0.9757 - val_accuracy: 0.4673 - val_loss: 1.0087
Epoch 7/50
[1m99/99[0m [32m━━━━━━━━━━

2024/10/20 02:32:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run dashing-horse-353 at: http://localhost:5000/#/experiments/3/runs/dff792fed79140979cc431ab48f163ef.
2024/10/20 02:32:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/3.


In [20]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Reload the model/run pair returned by the project helper for the
# "wikipedia" experiment.
latest_logged = mlflow_helper.load_last_mlflow_model("http://localhost:5000", "wikipedia")
loaded_model, last_run = latest_logged

# Per-class scores from the loaded model; the predicted label for each row is
# the index of its highest score.
y_pred_proba = loaded_model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)

# Classification metrics against the held-out labels. The confusion matrix is
# computed and kept in `conf_matrix` but is not printed by this cell.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report the metrics.
print("Accuracy:", accuracy)
print("Classification Report:", report, sep="\n")

Latest run ID: dff792fed79140979cc431ab48f163ef


Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.4673311184939092
Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.54      0.49       307
           1       0.70      0.42      0.52       528
           2       0.16      0.51      0.24        68

    accuracy                           0.47       903
   macro avg       0.44      0.49      0.42       903
weighted avg       0.57      0.47      0.49       903

