In [1]:
import pandas as pd
import os
import sys
import yaml
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
import mlflow
import pickle
from mlflow.models import infer_signature
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score,accuracy_score,precision_score,recall_score,f1_score


In [2]:
df=pd.read_csv("/Users/pratik.kujur/Desktop/Projects/Mlops-end-to-end/data/raw/winequality.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,is_red
0,0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1
3,3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
4,4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,1


### Data Pre-Processing

In [3]:
%pwd

'/Users/pratik.kujur/Desktop/Projects/Mlops-end-to-end/experiments'

In [4]:
os.chdir("../")
%pwd

'/Users/pratik.kujur/Desktop/Projects/Mlops-end-to-end'

In [5]:
params=yaml.safe_load(open("/Users/pratik.kujur/Desktop/Projects/Mlops-end-to-end/params.yaml"))['preprocess']

In [6]:
def DataPreprocessing(input_path,output_path):
    df=pd.read_csv(input_path)
    print("Raw Data \n",df.head())
    # add preprocessing step here

    unecessary_col=['Unnamed: 0']
    df.drop(labels=unecessary_col,axis='columns',inplace=True)
    
    os.makedirs(os.path.dirname(output_path),exist_ok=True)
    df.to_csv(output_path,index=False,header=True)
    print("Preprocessed Data \n",df.head())
    print(f"Preprocessed data is saved in this location -> {output_path}")

In [7]:
params_trainer=yaml.safe_load(open("/Users/pratik.kujur/Desktop/Projects/Mlops-end-to-end/params.yaml"))['train']

params_evalution=yaml.safe_load(open("/Users/pratik.kujur/Desktop/Projects/Mlops-end-to-end/params.yaml"))["evalution"]

params_track=yaml.safe_load(open("/Users/pratik.kujur/Desktop/Projects/Mlops-end-to-end/params.yaml"))["mlflow"]

### DataSplit

In [8]:
def DataSplit(params_trainer,params_evalution,processed_path,
              X_train_reg_path,y_train_reg_path,X_train_clf_path,y_train_clf_path,
              y_test_reg_path,y_test_clf_path,X_test_reg_path,X_test_clf_path):

    df=pd.read_csv(processed_path)
    if 'Unnamed: 0' in df.columns:
        df.drop(columns=['Unnamed: 0'], inplace=True)

    y_reg=df['quality']
    y_clf=df['is_red']

    X_clf=df.iloc[:,1:12]
    X_reg=df.iloc[:,1:11]

    X_train_reg,X_test_reg,y_train_reg,y_test_reg=train_test_split(X_reg,y_reg,test_size=0.2,random_state=42)
    X_train_clf,X_test_clf,y_train_clf,y_test_clf=train_test_split(X_clf,y_clf,test_size=0.2,random_state=42)

    os.makedirs(os.path.dirname(X_train_reg_path),exist_ok=True)
    os.makedirs(os.path.dirname(X_test_reg_path),exist_ok=True)
    os.makedirs(os.path.dirname(y_train_reg_path),exist_ok=True)
    os.makedirs(os.path.dirname(y_test_reg_path),exist_ok=True)
    os.makedirs(os.path.dirname(X_train_clf_path),exist_ok=True)
    os.makedirs(os.path.dirname(X_test_clf_path),exist_ok=True)
    os.makedirs(os.path.dirname(y_train_clf_path),exist_ok=True)
    os.makedirs(os.path.dirname(y_test_clf_path),exist_ok=True)
    
    X_train_reg.to_csv(X_train_reg_path,index=False,header=True)
    X_test_reg.to_csv(X_test_reg_path,index=False,header=True)
    y_train_reg.to_csv(y_train_reg_path,index=False,header=True)
    y_test_reg.to_csv(y_test_reg_path,index=False,header=True)
    X_train_clf.to_csv(X_train_clf_path,index=False,header=True)
    X_test_clf.to_csv(X_test_clf_path,index=False,header=True)
    y_train_clf.to_csv(y_train_clf_path,index=False,header=True)
    y_test_clf.to_csv(y_test_clf_path,index=False,header=True)
    
    print("Training Data Shape:", X_train_clf.shape, "Columns:", X_train_clf.columns)
    print("Testing Data Shape:", X_test_clf.shape, "Columns:", X_test_clf.columns)

    print(f"Data for Taining are saved at ->{params_trainer}")
    print(f"Data for evalution are saved at ->{params_evalution}")

In [9]:
def ModelTrainer(reg,clf,model_reg_path,model_clf_path,
                 X_train_reg,y_train_reg,X_train_clf,y_train_clf):
    X_train_clf=pd.read_csv(X_train_clf)
    y_train_clf=pd.read_csv(y_train_clf)
    X_train_reg=pd.read_csv(X_train_reg)
    y_train_reg=pd.read_csv(y_train_reg)
    
    reg.fit(X_train_reg,y_train_reg)
    os.makedirs(os.path.dirname(model_reg_path),exist_ok=True)
    pickle.dump(reg,open(model_reg_path,'wb'))

    clf.fit(X_train_clf,y_train_clf)
    os.makedirs(os.path.dirname(model_clf_path),exist_ok=True)
    pickle.dump(clf,open(model_clf_path,'wb'))

    print(f"Regression model is saved at -> {model_reg_path}",f"Classification model is saved at -> {model_clf_path}")


In [10]:
def ModelEvalution(uri,model_reg,model_clf,
                   X_test_reg,y_test_reg,X_test_clf,y_test_clf):
    

    X_test_reg=pd.read_csv(X_test_reg)
    y_test_reg=pd.read_csv(y_test_reg)
    X_test_clf=pd.read_csv(X_test_clf)
    y_test_clf=pd.read_csv(y_test_clf)

    model_reg=pickle.load(open(model_reg,'rb'))
    model_clf=pickle.load(open(model_clf,'rb'))

    # Parameters to be track
    reg_params=model_reg.get_params()
    clf_params=model_clf.get_params()

    #Metrics to be track
    y_pred_reg=model_reg.predict(X_test_reg)
    y_pred_clf=model_clf.predict(X_test_clf)

    #for regression
    mse=mean_squared_error(y_pred_reg,y_test_reg)
    mae=mean_absolute_error(y_pred_reg,y_test_reg)
    r2_scr=r2_score(y_pred_reg,y_test_reg)

    #for classification
    acc=accuracy_score(y_pred_clf,y_test_clf)
    precision_scr=precision_score(y_pred_clf,y_test_clf)
    recall_scr=recall_score(y_pred_clf,y_test_clf)
    f1_scr=f1_score(y_pred_clf,y_test_clf)

    reg_metrics={
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
        'r2_score':r2_scr
    }

    clf_metrics={
        'accuracy_score': acc,
        'precision_score': precision_scr,
        'recall_score':recall_scr,
        'f1_score':f1_scr
    }



    # Mlflow Tracking
    mlflow.set_registry_uri(uri=uri)
    mlflow.set_experiment(experiment_name="Multioutput_Model")

    with mlflow.start_run():

        mlflow.log_params(reg_params)
        mlflow.log_metrics(reg_metrics)
        
        signature_reg=infer_signature(X_test_reg,model_reg.predict(X_test_reg))
        
        model_reg_info=mlflow.sklearn.log_model(
            sk_model=model_reg,
            artifact_path="artifacts/model_reg",
            signature=signature_reg,
            input_example=X_test_reg,
            registered_model_name="tracking-regressor"
        )

      
    with mlflow.start_run():

            mlflow.log_params(clf_params)

            mlflow.log_metrics(clf_metrics)

            signature_clf=infer_signature(X_test_clf,model_clf.predict(X_test_clf))

            model_clf_info=mlflow.sklearn.log_model(
                sk_model=model_clf,
                artifact_path="artifacts/model_clf",
                signature=signature_clf,
                input_example=X_test_reg,
                registered_model_name="tracking-classifier"
            )
    
    
    print("artifacts are save in-> artifacts/")


In [11]:
def ModelPredict(model_reg,model_clf,X_values):

    #load models
    model_reg=pickle.load(open(model_reg,'rb'))
    model_clf=pickle.load(open(model_clf,'rb'))

    model_reg_output=model_reg.predict(X_values)

    model_clf_input=np.append(X_values,model_reg_output)

    model_clf_input=model_clf_input.reshape(1,-1)

    model_clf_output=model_clf.predict(model_clf_input)

    print(f"Wine Quality is -> {model_reg_output}",f"Wine is red if 1 else white if 0 {model_clf_output}")




In [12]:
if __name__=="__main__":
    DataPreprocessing(params['input'],params['output'])

    DataSplit(params_trainer,params_evalution,params['output'],
              params_trainer['X_train_reg'],params_trainer['y_train_reg'],params_trainer['X_train_clf'],params_trainer['y_train_clf'],
              params_evalution['y_test_reg'],params_evalution['y_test_clf'],params_evalution['X_test_reg'],params_evalution['X_test_clf'])
    
    reg=RandomForestRegressor()
    clf=RandomForestClassifier()

    ModelTrainer(reg,clf,params_trainer['model_reg'],params_trainer['model_clf'],
                params_trainer['X_train_reg'],params_trainer['y_train_reg'],params_trainer['X_train_clf'],params_trainer['y_train_clf'])
    ModelEvalution(params_track["uri"],params_trainer['model_reg'],params_trainer['model_clf'],
                   params_evalution['X_test_reg'],params_evalution['y_test_reg'],params_evalution['X_test_clf'],params_evalution['y_test_clf'])
    
    X_test_reg=pd.read_csv(params_evalution['X_test_reg'])
    X_values=np.array(X_test_reg.iloc[1:2,:])
    print("X_Value shapeeeee",X_values.shape)
    ModelPredict(params_trainer['model_reg'],params_trainer['model_clf'],X_values)

Raw Data 
    Unnamed: 0  fixed acidity  volatile acidity  citric acid  residual sugar  \
0           0            7.4              0.70         0.00             1.9   
1           1            7.8              0.88         0.00             2.6   
2           2            7.8              0.76         0.04             2.3   
3           3           11.2              0.28         0.56             1.9   
4           4            7.4              0.66         0.00             1.8   

   chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  \
0      0.076                 11.0                  34.0   0.9978  3.51   
1      0.098                 25.0                  67.0   0.9968  3.20   
2      0.092                 15.0                  54.0   0.9970  3.26   
3      0.075                 17.0                  60.0   0.9980  3.16   
4      0.075                 13.0                  40.0   0.9978  3.51   

   sulphates  alcohol  quality  is_red  
0       0.56      9.4       

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
2025/01/16 11:50:58 INFO mlflow.tracking.fluent: Experiment with name 'Multioutput_Model' does not exist. Creating a new experiment.


Regression model is saved at -> models/model_1_reg.pkl Classification model is saved at -> models/model_1_clf.pkl


Successfully registered model 'tracking-regressor'.
2025/01/16 11:51:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-regressor, version 1
Created version '1' of model 'tracking-regressor'.
  "dataframe_split": {
    "columns": [
      "volatile acidity",
      "citric acid",
      "residual sugar",
      "chlorides",
      "free sulfur dioxide",
      "total sulfur dioxide",
      "density",
      "pH",
      "sulphates",
      "alcohol"
    ],
    "data": [
      [
        0.32,
        0.12,
        6.6,
        0.043,
        22.0,
        141.0,
        0.9937,
        3.36,
        0.6,
        10.4
      ],
      [
        0.3,
        0.27,
        4.4,
        0.055,
        17.0,
        135.0,
        0.9925,
        3.23,
        0.44,
        12.2
      ],
      [
        0.22,
        0.32,
        2.2,
        0.028,
        36.0,
        92.0,
        0.99076,
        3.27,
        0

artifacts are save in-> artifacts/
X_Value shapeeeee (1, 10)
Wine Quality is -> [6.13] Wine is red if 1 else white if 0 [0]


Created version '1' of model 'tracking-classifier'.
