# Importing necessary libraries

In [1]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset

In [2]:
import seaborn as sns
tips = sns.load_dataset('tips')

In [3]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


# Identify column types

In [4]:
cat_cols = tips.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = tips.select_dtypes(include=['int64', 'float64']).columns.tolist()
num_cols.remove('tip')  # target column
print("Categorical Columns:", cat_cols)
print("Numeric Columns:", num_cols)

Categorical Columns: ['sex', 'smoker', 'day', 'time']
Numeric Columns: ['total_bill', 'size']


# Split data

In [5]:
X = tips.drop('tip', axis=1)
y = tips['tip']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Preprocessing

In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first'), cat_cols)
    ]
)

# MLFlow Tracking

**Run the following command in terminal**

> * mlflow ui 

In [7]:
import mlflow
import mlflow.sklearn


mlflow.set_tracking_uri("http://127.0.0.1:5000")

mlflow.set_experiment("Tips Regression")

2025/10/21 16:04:27 INFO mlflow.tracking.fluent: Experiment with name 'Tips Regression' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/814188437445801376', creation_time=1761042867445, experiment_id='814188437445801376', last_update_time=1761042867445, lifecycle_stage='active', name='Tips Regression', tags={}>

# Mutlimodel comparision

In [10]:
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor()
}

for name, model in models.items():
        
        with mlflow.start_run(run_name=name) as run:

            # Get the run id
            run_id = run.info.run_id


            # Pipeline with model
            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('regressor', model)
            ])


            # Train model
            pipeline.fit(X_train, y_train)
            preds = pipeline.predict(X_test)


            # Fit and predict already done above
            mse = mean_squared_error(y_test, preds)
            r2 = r2_score(y_test,preds)
            
            # Log model, metrics
            mlflow.log_metric("mse", mse)
            mlflow.log_metric("r2_score",r2)


            mlflow.sklearn.log_model(pipeline, name= name)  # logs both preprocessing + model
            
            print(f"MSE: {mse:.3f}, r2_score: {r2}")
            print(f"{name} saved at run id {run_id}")




MSE: 0.932, r2_score: 0.29309667441267395
LinearRegression saved at run id be200f50ebb741ab8232e63a59575b5d
🏃 View run LinearRegression at: http://127.0.0.1:5000/#/experiments/814188437445801376/runs/be200f50ebb741ab8232e63a59575b5d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/814188437445801376




MSE: 1.261, r2_score: 0.04342054005289808
DecisionTree saved at run id fcd717e527f44895a24b644a832f48e4
🏃 View run DecisionTree at: http://127.0.0.1:5000/#/experiments/814188437445801376/runs/fcd717e527f44895a24b644a832f48e4
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/814188437445801376




MSE: 0.931, r2_score: 0.29359805363623204
RandomForest saved at run id 59495a3cdad34823876f102a8b0e60c6
🏃 View run RandomForest at: http://127.0.0.1:5000/#/experiments/814188437445801376/runs/59495a3cdad34823876f102a8b0e60c6
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/814188437445801376


In [10]:
pipeline

In [11]:
import pickle

with open("best_model.pkl",'wb') as f:
    pickle.dump(pipeline,f)

# Register the best model

In [None]:
model_uri = "runs:/9638c0bd02c84f97a022cb75a8dc368d/RandomForest"  # runs:/run_id/run name
mlflow.register_model(model_uri, "Regressormodel")

Successfully registered model 'Regressormodel'.
2025/10/21 16:08:16 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Regressormodel, version 1
Created version '1' of model 'Regressormodel'.


<ModelVersion: aliases=[], creation_timestamp=1761043096319, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1761043096319, metrics=None, model_id=None, name='Regressormodel', params=None, run_id='9638c0bd02c84f97a022cb75a8dc368d', run_link='', source='models:/m-61cd22c8519443e4a14c346af8e629fa', status='READY', status_message=None, tags={}, user_id='', version='1'>