In [33]:
import csv
import requests
import pandas as pd
import dagshub
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import dvc.api

from dotenv import load_dotenv
import os

# Charger les variables d'environnement
load_dotenv()

# Récupérer les variables d'environnement
dagshub_token = os.getenv("DAGSHUB_TOKEN")
aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")


df = pd.read_csv("s3://mlprodprojectbucket/files/Food_Preference.csv")
print(df.head())




                     Timestamp Participant_ID  Gender Nationality  Age  \
0  2019/05/07 2:59:13 PM GMT+8         FPS001    Male      Indian   24   
1  2019/05/07 2:59:45 PM GMT+8         FPS002  Female      Indian   22   
2  2019/05/07 3:00:05 PM GMT+8         FPS003    Male      Indian   31   
3  2019/05/07 3:00:11 PM GMT+8         FPS004  Female      Indian   25   
4  2019/05/07 3:02:50 PM GMT+8         FPS005    Male      Indian   27   

               Food              Juice Dessert  
0  Traditional food        Fresh Juice   Maybe  
1      Western Food  Carbonated drinks     Yes  
2      Western Food        Fresh Juice   Maybe  
3  Traditional food        Fresh Juice   Maybe  
4  Traditional food        Fresh Juice   Maybe  


In [34]:
# DagsHub initialization
repo_owner = "thuylinh.co"
repo_name = "MLProduction_project"
dagshub.init(repo_owner=repo_owner, repo_name=repo_name, mlflow=True)
mlflow.set_tracking_uri(f"https://dagshub.com/{repo_owner}/{repo_name}.mlflow")




Token Dagshub OAuth token, valid until 2025-01-03 10:15:00+00:00 does not exist in the storage




Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=faa42abc-725f-4f76-9e18-540f32884dbf&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=fc1d1e005c297c3e1a91f1aa5e2aab922c502bcebd236bef600a1ff6817705fb




In [35]:
# Afficher les premières lignes pour vérifier
print(df.head())


                     Timestamp Participant_ID  Gender Nationality  Age  \
0  2019/05/07 2:59:13 PM GMT+8         FPS001    Male      Indian   24   
1  2019/05/07 2:59:45 PM GMT+8         FPS002  Female      Indian   22   
2  2019/05/07 3:00:05 PM GMT+8         FPS003    Male      Indian   31   
3  2019/05/07 3:00:11 PM GMT+8         FPS004  Female      Indian   25   
4  2019/05/07 3:02:50 PM GMT+8         FPS005    Male      Indian   27   

               Food              Juice Dessert  
0  Traditional food        Fresh Juice   Maybe  
1      Western Food  Carbonated drinks     Yes  
2      Western Food        Fresh Juice   Maybe  
3  Traditional food        Fresh Juice   Maybe  
4  Traditional food        Fresh Juice   Maybe  


In [46]:
print(df['Nationality'])

0      3
1      3
2      3
3      3
4      3
      ..
283    3
284    3
285    3
286    3
287    3
Name: Nationality, Length: 288, dtype: int64


In [None]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_model(X, y, target_name):
    # 1) Split entraînement / test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # 2) Suréchantillonner uniquement le TRAIN
    ros = RandomOverSampler(random_state=42)
    X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

    # 3) Définir le préprocesseur (OneHotEncoder pour 'Gender', 'Nationality')
    categorical_features = ['Gender', 'Nationality']
    numeric_features = ['Age']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ]
    )

    # 4) Construire le pipeline (prétraitement + classifier)
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(
            n_estimators=100,
            random_state=42,
            class_weight='balanced'  # vous l’aviez déjà ajouté
        ))
    ])

    # 5) Entraîner sur le jeu suréchantillonné
    model.fit(X_train_res, y_train_res)

    # 6) Prédire sur le vrai jeu de test (non suréchantillonné)
    y_pred = model.predict(X_test)

    # 7) Calculer les métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # 8) Enregistrer et logger dans MLflow (optionnel, selon vos besoins)
    with mlflow.start_run():
        mlflow.log_param("target", target_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.sklearn.log_model(model, f"{target_name}_model")

        cr = classification_report(y_test, y_pred, output_dict=True)
        mlflow.log_dict(cr, f"{target_name}_classification_report.json")

        registered_model = mlflow.register_model(
            model_uri=f"runs:/{mlflow.active_run().info.run_id}/{target_name}_model",
            name=f"{target_name}_model"
        )
        
        client = mlflow.tracking.MlflowClient()
        client.transition_model_version_stage(
            name=f"{target_name}_model",
            version=registered_model.version,
            stage="Production"
        )

        print(f"{target_name}_model saved in Model Registry")

    # 9) Afficher le rapport dans la console
    print(f"---- {target_name} Model Evaluation ----")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Ensuite, appelez votre fonction comme avant
evaluate_model(X, y_food, "Food")
evaluate_model(X, y_juice, "Juice")
evaluate_model(X, y_dessert, "Dessert")


Registered model 'Food_model' already exists. Creating a new version of this model...
2025/01/03 16:46:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Food_model, version 8
Created version '8' of model 'Food_model'.
  client.transition_model_version_stage(


Food_model saved in Model Registry
🏃 View run popular-ray-887 at: https://dagshub.com/thuylinh.co/MLProduction_project.mlflow/#/experiments/0/runs/87a9a3170182496daf7bda449c454e69
🧪 View experiment at: https://dagshub.com/thuylinh.co/MLProduction_project.mlflow/#/experiments/0
---- Food Model Evaluation ----
Accuracy: 0.7931
Precision: 0.8331
Recall: 0.7931
F1 Score: 0.8095

Classification Report:
                  precision    recall  f1-score   support

Traditional food       0.91      0.84      0.88        50
    Western Food       0.33      0.50      0.40         8

        accuracy                           0.79        58
       macro avg       0.62      0.67      0.64        58
    weighted avg       0.83      0.79      0.81        58



Registered model 'Juice_model' already exists. Creating a new version of this model...
2025/01/03 16:46:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Juice_model, version 8
Created version '8' of model 'Juice_model'.
  client.transition_model_version_stage(


Juice_model saved in Model Registry
🏃 View run skittish-asp-420 at: https://dagshub.com/thuylinh.co/MLProduction_project.mlflow/#/experiments/0/runs/87d1386aca8646dfa20df8a648a988c5
🧪 View experiment at: https://dagshub.com/thuylinh.co/MLProduction_project.mlflow/#/experiments/0
---- Juice Model Evaluation ----
Accuracy: 0.7241
Precision: 0.8392
Recall: 0.7241
F1 Score: 0.7742

Classification Report:
                   precision    recall  f1-score   support

Carbonated drinks       0.08      0.20      0.11         5
      Fresh Juice       0.91      0.77      0.84        53

         accuracy                           0.72        58
        macro avg       0.49      0.49      0.47        58
     weighted avg       0.84      0.72      0.77        58



Registered model 'Dessert_model' already exists. Creating a new version of this model...
2025/01/03 16:46:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Dessert_model, version 8
Created version '8' of model 'Dessert_model'.
  client.transition_model_version_stage(


Dessert_model saved in Model Registry
🏃 View run honorable-mule-588 at: https://dagshub.com/thuylinh.co/MLProduction_project.mlflow/#/experiments/0/runs/af6c28186c21457ba5bdbc8b2a78a0f7
🧪 View experiment at: https://dagshub.com/thuylinh.co/MLProduction_project.mlflow/#/experiments/0
---- Dessert Model Evaluation ----
Accuracy: 0.4138
Precision: 0.4538
Recall: 0.4138
F1 Score: 0.4260

Classification Report:
              precision    recall  f1-score   support

       Maybe       0.55      0.44      0.49        25
          No       0.22      0.40      0.29        10
         Yes       0.45      0.39      0.42        23

    accuracy                           0.41        58
   macro avg       0.41      0.41      0.40        58
weighted avg       0.45      0.41      0.43        58



In [37]:
from dotenv import load_dotenv
import os

# Charger les variables d'environnement
load_dotenv()

aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

# Exemple d'utilisation pour charger un fichier S3
df = pd.read_csv(
    "s3://mlprodprojectbucket/files/Food_Preference.csv",
    storage_options={
        "key": aws_access_key_id,
        "secret": aws_secret_access_key,
    }
)


In [78]:
food_model = mlflow.sklearn.load_model("models:/Food_model/8")
juice_model = mlflow.sklearn.load_model("models:/Juice_model/8")
dessert_model = mlflow.sklearn.load_model("models:/Dessert_model/8")


df_input = pd.DataFrame([
  {"Gender": "Female", "Nationality": "Canadian", "Age": 25}
])
print(df_input)

test_cases = [
    {"Gender": "Female", "Nationality": "Canadian", "Age": 25},
    {"Gender": "Male",   "Nationality": "Canadian", "Age": 25},
    {"Gender": "Male",   "Nationality": "Indian",   "Age": 40},
    {"Gender": "Male",   "Nationality": "Yemen",    "Age": 50},
    {"Gender": "Female", "Nationality": "Pakistani","Age": 20},
    {"Gender": "Female",   "Nationality": "Canadian","Age": 16},
    {"Gender": "Male",   "Nationality": "Malaysian",    "Age": 35},
    {"Gender": "Female", "Nationality": "Pakistani","Age": 15}
]

for case in test_cases:
    df_test = pd.DataFrame([case])
    food_pred = food_model.predict(df_test)
    juice_pred = juice_model.predict(df_test)
    dessert_pred = dessert_model.predict(df_test)
    print(case, "=>", food_pred, juice_pred,dessert_pred)


Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00,  6.15it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:03<00:00,  1.66it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:05<00:00,  1.02s/it]


   Gender Nationality  Age
0  Female    Canadian   25
{'Gender': 'Female', 'Nationality': 'Canadian', 'Age': 25} => ['Traditional food'] ['Fresh Juice'] ['Yes']
{'Gender': 'Male', 'Nationality': 'Canadian', 'Age': 25} => ['Traditional food'] ['Fresh Juice'] ['Yes']
{'Gender': 'Male', 'Nationality': 'Indian', 'Age': 40} => ['Traditional food'] ['Fresh Juice'] ['Yes']
{'Gender': 'Male', 'Nationality': 'Yemen', 'Age': 50} => ['Traditional food'] ['Fresh Juice'] ['Yes']
{'Gender': 'Female', 'Nationality': 'Pakistani', 'Age': 20} => ['Traditional food'] ['Fresh Juice'] ['Maybe']
{'Gender': 'Female', 'Nationality': 'Canadian', 'Age': 16} => ['Traditional food'] ['Fresh Juice'] ['Yes']
{'Gender': 'Male', 'Nationality': 'Malaysian', 'Age': 35} => ['Traditional food'] ['Fresh Juice'] ['Yes']
{'Gender': 'Female', 'Nationality': 'Pakistani', 'Age': 15} => ['Traditional food'] ['Fresh Juice'] ['Maybe']


In [64]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

X = df[['Gender', 'Nationality', 'Age']]
y = df['Food']

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

print(sorted(Counter(y_resampled).items()))


[('Traditional food', 238), ('Western Food', 238)]
