<a href="https://colab.research.google.com/github/RohitMakaniProfile/Answers/blob/main/ML_FLOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-3.6.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-skinny==3.6.0 (from mlflow)
  Downloading mlflow_skinny-3.6.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.6.0 (from mlflow)
  Downloading mlflow_tracing-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting huey<3,>=2.5.0 (from mlflow)
  Downloading huey-2.5.4-py3-none-any.whl.metadata (4.6 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.6.0->mlflow)
  Downloading databricks_sdk-0.73.0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install feature_engine

Collecting feature_engine
  Downloading feature_engine-1.9.3-py3-none-any.whl.metadata (10 kB)
Downloading feature_engine-1.9.3-py3-none-any.whl (229 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.0/230.0 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: feature_engine
Successfully installed feature_engine-1.9.3


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn

In [4]:
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    OneHotEncoder
)
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    ConfusionMatrixDisplay
)


In [5]:
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.outliers.winsorizer import Winsorizer

In [6]:
import warnings

# Suppress every warning for the rest of the session to keep cell output short.
warnings.filterwarnings(action='ignore')

# Have scikit-learn transformers emit pandas DataFrames (column names are kept
# through the preprocessing steps defined further down).
set_config(transform_output='pandas')

In [8]:
# Load the raw Titanic data from the notebook's working directory.
try:
    df = pd.read_csv("titanic.csv")
except FileNotFoundError as exc:
    # Stop here with an actionable message. Every later cell needs `df`, so
    # merely printing and continuing would either fail later with a NameError
    # or silently reuse a stale `df` left over from a previous run.
    raise FileNotFoundError(
        "Error: titanic.csv not found. Please upload the file."
    ) from exc
else:
    print("Data Loaded Successfully.")

Data Loaded Successfully.


In [9]:
def clean_data(df):
    """Return a cleaned copy of the passenger table; the input is not modified.

    Steps, in order:
      1. lower-case every column name;
      2. drop the 'passengerid', 'name', 'ticket' and 'cabin' columns;
      3. add a 'family' column equal to 'sibsp' + 'parch';
      4. drop 'sibsp' and 'parch'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. After lower-casing it must contain every column named above.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining columns followed by 'family'.

    Raises
    ------
    KeyError
        If any of the columns to drop or sum is missing.
    """
    lowered = df.rename(columns=str.lower)
    trimmed = lowered.drop(columns=['passengerid', 'name', 'ticket', 'cabin'])
    # Total relatives aboard: siblings/spouses plus parents/children.
    with_family = trimmed.assign(family=trimmed['sibsp'] + trimmed['parch'])
    return with_family.drop(columns=['sibsp', 'parch'])

In [10]:
# Apply the cleaning step to the raw frame and report the resulting size.
final_df = clean_data(df)
n_rows, n_cols = final_df.shape
print(f'The cleaned data has {n_rows} rows and {n_cols} columns')

The cleaned data has 891 rows and 7 columns


In [11]:
# Separate the label ('survived') from the predictor columns.
y = final_df['survived']
X = final_df.drop('survived', axis=1)


In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [13]:
# Report the (rows, columns) of each feature split.
for split_label, split_frame in (('Training', X_train), ('Testing', X_test)):
    print(f'{split_label} shape:', split_frame.shape)

Training shape: (712, 6)
Testing shape: (179, 6)


In [14]:
# Preprocessing for 'age', applied in the order listed.
age_pipe = Pipeline([
    # Fill missing values with the median.
    ('impute', SimpleImputer(strategy='median')),
    # Cap outliers with the Gaussian rule (fold=3).
    ('outliers', Winsorizer(capping_method='gaussian', fold=3)),
    # Standardise the capped values.
    ('scale', StandardScaler()),
])


In [15]:
# Preprocessing for 'fare', applied in the order listed (no imputation step).
fare_pipe = Pipeline([
    # Cap outliers with the IQR rule (fold=1.5).
    ('outliers', Winsorizer(capping_method='iqr', fold=1.5)),
    # Standardise the capped values.
    ('scale', StandardScaler()),
])


In [16]:
# Preprocessing for 'embarked', applied in the order listed.
embarked_pipe = Pipeline([
    # Fill missing values with the most frequent category.
    ('impute', SimpleImputer(strategy='most_frequent')),
    # Replace each category with its count.
    ('count_encode', CountFrequencyEncoder(encoding_method='count')),
    # Rescale the counts with min-max scaling.
    ('scale', MinMaxScaler()),
])

In [17]:
# Route each feature to its own transformer: (name, transformer, columns).
column_transformers = [
    ('age', age_pipe, ['age']),
    ('fare', fare_pipe, ['fare']),
    ('embarked', embarked_pipe, ['embarked']),
    ('sex', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['sex']),
    ('family', MinMaxScaler(), ['family']),
]

# Columns not listed above are passed through unchanged; n_jobs=-1 lets the
# transformers run in parallel.
preprocessor = ColumnTransformer(
    column_transformers,
    remainder='passthrough',
    n_jobs=-1,
)

In [18]:
# Hyper-parameters for the random forest; also logged to MLflow as run parameters.
model_params = dict(
    bootstrap=True,
    criterion='gini',
    max_depth=6,
    max_features='sqrt',
    n_estimators=300,
    n_jobs=-1,
    random_state=30,
    warm_start=False,
)

In [19]:
# End-to-end model: column preprocessing followed by the random-forest classifier.
rf_classifier = RandomForestClassifier(**model_params)
model_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', rf_classifier),
])

In [20]:
# Fit the preprocessing steps and the classifier on the training split only.
print("Training Model...")
model_pipe.fit(X_train, y=y_train)

Training Model...


In [21]:
# Predict labels for the held-out rows using the fitted pipeline.
y_pred = model_pipe.predict(X=X_test)

In [22]:
# Metric name -> scikit-learn scoring function, each called as (y_true, y_pred).
scorers = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1_score': f1_score,
}

# Score the test-set predictions with every metric; these values are logged to MLflow later.
metrics = {
    metric_name: scorer(y_test, y_pred)
    for metric_name, scorer in scorers.items()
}

In [23]:
# Show the test-set scores.
print(f"Metrics: {metrics}")

Metrics: {'accuracy': 0.8100558659217877, 'precision': 0.8125, 'recall': 0.7027027027027027, 'f1_score': 0.7536231884057971}


In [24]:
# Select the MLflow experiment that the run below will be recorded under.
experiment_name = "Titanic_Prediction_Experiment"
mlflow.set_experiment(experiment_name=experiment_name)
print("Starting MLflow Run...")

2025/11/22 08:37:33 INFO mlflow.tracking.fluent: Experiment with name 'Titanic_Prediction_Experiment' does not exist. Creating a new experiment.


Starting MLflow Run...


In [25]:
# Record the hyper-parameters, test-set metrics, fitted pipeline and a
# confusion-matrix image in one MLflow run.
with mlflow.start_run() as run:
    mlflow.log_params(model_params)
    mlflow.log_metrics(metrics)

    # Infer the model's input/output schema from the training features and the
    # pipeline's predictions on them.
    signature = mlflow.models.infer_signature(
        model_input=X_train,
        model_output=model_pipe.predict(X_train)
    )
    # MLflow 3.x deprecates `artifact_path` for log_model; `name` replaces it.
    mlflow.sklearn.log_model(
        sk_model=model_pipe,
        name="titanic_model",
        signature=signature,
        input_example=X_train.iloc[:5]
    )

    # Work on the display's own figure and axes rather than pyplot's implicit
    # "current figure", so the title, file and close always target this plot.
    cm = ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
    cm.ax_.set_title("Confusion Matrix")
    cm.figure_.savefig("confusion_matrix.png")
    mlflow.log_artifact("confusion_matrix.png")
    # Close the figure once it is saved and logged.
    plt.close(cm.figure_)

    print(f"Run Complete. Experiment ID: {run.info.experiment_id}")
    print(f"Run ID: {run.info.run_id}")
    print("Results saved locally in './mlruns'")



Run Complete. Experiment ID: 338678160041224627
Run ID: 7ffaf9a749c44bdd9bdeeff416cf2462
Results saved locally in './mlruns'
