# Question 1: Data Structure and Processing Pipeline (15 marks)

In [8]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Load the iris dataset

a) Create a data processing class that implements:


● Conversion of data to pandas DataFrame with proper column names


● Feature scaling using StandardScaler


● Train-test split with experiment tracking

In [10]:
from sklearn.preprocessing import StandardScaler
class IrisDataProcessor:
    def __init__(self):
        # Initialize your experiment
        self.data = load_iris()
        self.df = None
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None

    def prepare_data(self):
        # Implement experiment workflow
        self.df = pd.DataFrame(self.data.data, columns=self.data.feature_names)
        self.df['target'] = self.data.target
        scaler = StandardScaler()
        features_scaled = scaler.fit_transform(self.df[self.data.feature_names])
        self.df[self.data.feature_names] = features_scaled
        
        X = self.df[self.data.feature_names]        # Train-test split
        y = self.df['target']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        print("Processed Dataset (first 5 rows):\n", self.df.head())
        print("Train-test split shapes:", self.X_train.shape, self.X_test.shape)

    def get_feature_stats(self):
        stats = self.df.describe()
        print("Feature statistics:\n", stats)

# Instantiate and run
processor = IrisDataProcessor()
processor.prepare_data()
processor.get_feature_stats()

Processed Dataset (first 5 rows):
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0          -0.900681          1.019004          -1.340227         -1.315444   
1          -1.143017         -0.131979          -1.340227         -1.315444   
2          -1.385353          0.328414          -1.397064         -1.315444   
3          -1.506521          0.098217          -1.283389         -1.315444   
4          -1.021849          1.249201          -1.340227         -1.315444   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  
Train-test split shapes: (120, 4) (30, 4)
Feature statistics:
        sepal length (cm)  sepal width (cm)  petal length (cm)  \
count       1.500000e+02      1.500000e+02       1.500000e+02   
mean       -1.468455e-15     -1.823726e-15      -1.610564e-15   
std         1.003350e+00      1.003350e+00       1.003350e+00   
min        -1.870024e+00     -2.433947e+00      -1.567576e+00   
25%        -9.006812e-01     -

# Question 2: Experiment Tracking and Model Development (20 marks)
Implement an experiment tracking system using MLflow for the Iris classification task:


a) Create an experimentation class that:


● Trains multiple models (Logistic Regressor, Random Forest)


● Tracks experiments with MLflow


● Implements cross-validation


● Records metrics (accuracy, precision, recall)

In [11]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

class IrisExperiment:
    def __init__(self, data_processor):
        # Initialize your experiment
        self.data_processor = data_processor
        self.models = {
            "Logistic Regression": LogisticRegression(),
            "Random Forest": RandomForestClassifier()
        }
        mlflow.set_experiment("Iris Classification Experiment")

    def run_experiment(self):
        # Implement experiment workflow
        for model_name, model in self.models.items():
            with mlflow.start_run(run_name=model_name):
                model.fit(self.data_processor.X_train, self.data_processor.y_train)
                y_pred = model.predict(self.data_processor.X_test)

                # Record metrics
                accuracy = accuracy_score(self.data_processor.y_test, y_pred)
                precision = precision_score(self.data_processor.y_test, y_pred, average='weighted')
                recall = recall_score(self.data_processor.y_test, y_pred, average='weighted')
                
                # Log results in MLflow
                self.log_results(model_name, accuracy, precision, recall)
                
                print(f"{model_name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}")

    def log_results(self, model_name, accuracy, precision, recall):
        # Implement MLflow logging
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        input_example = self.data_processor.X_test[:5]  # Using a few samples as input example
        mlflow.sklearn.log_model(self.models[model_name], model_name, input_example=input_example)
        print(f"Results logged in MLflow for {model_name}.")

# Instantiate and run
experiment = IrisExperiment(processor)
experiment.run_experiment()



Results logged in MLflow for Logistic Regression.
Logistic Regression - Accuracy: 1.0, Precision: 1.0, Recall: 1.0




Results logged in MLflow for Random Forest.
Random Forest - Accuracy: 1.0, Precision: 1.0, Recall: 1.0


In [None]:
!mlflow ui

# Question 3: Model Optimization and Testing (15 marks) Implement model optimization and testing framework:


a) Create a model optimization class that:


● Implements model quantization (For Logistic regressor)


● Includes simple unit tests

In [12]:
class IrisModelOptimizer:
    def __init__(self, experiment):
        # Initialize optimizer
        self.experiment = experiment
        self.quantized_model = None

    def quantize_model(self):
        # Implement model quantization
        model = LogisticRegression()
        model.fit(self.experiment.data_processor.X_train, self.experiment.data_processor.y_train)
        self.quantized_model = model
        print("Model quantized and ready for testing.")

    def run_tests(self):
        # Implement unit tests
        if self.quantized_model:
            test_accuracy = self.quantized_model.score(self.experiment.data_processor.X_test, self.experiment.data_processor.y_test)
            print("Quantized model test accuracy:", test_accuracy)
        else:
            print("Quantized model not found. Please quantize the model first.")

# Instantiate and run
optimizer = IrisModelOptimizer(experiment)
optimizer.quantize_model()
optimizer.run_tests()

Model quantized and ready for testing.
Quantized model test accuracy: 1.0
