### Question 1: Data Structure and Processing Pipeline (15 marks)

a) Create a data processing class that implements:
    
    ● Conversion of data to pandas DataFrame with proper column names
    
    ● Feature scaling using `StandardScaler`
    
    ● Train-test split with experiment tracking

### Question 2: Experiment Tracking and Model Development (20 marks)

Implement an experiment tracking system using MLflow for the Iris classification task:
a) Create an experimentation class that:
    ● Trains multiple models (Logistic Regressor, Random Forest)
    ● Tracks experiments with MLflow
    ● Implements cross-validation
    ● Records metrics (accuracy, precision, recall)

### Question 3: Model Optimization and Testing (15 marks)

Implement model optimization and testing framework:

a) Create a model optimization class that: 
    ● Implements model quantization (For Logistic regressor)
    ● Includes simple unit tests

### The main function that runs in the  { docker } image

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import unittest

class IrisDataProcessor:
    def __init__(self):
        self.data = load_iris()
        self.df = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def prepare_data(self):
        self.df = pd.DataFrame(
            data=np.c_[self.data['data'], self.data['target']],
            columns=self.data['feature_names'] + ['target']
        )
        scaler = StandardScaler()
        features = self.df[self.data['feature_names']]
        self.df[self.data['feature_names']] = scaler.fit_transform(features)
        X = self.df[self.data['feature_names']]
        y = self.df['target']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )
        return self.X_train, self.X_test, self.y_train, self.y_test

class IrisExperiment:
    def __init__(self, data_processor):
        self.data_processor = data_processor
        self.models = {
            'Logistic Regression': LogisticRegression(),
            'Random Forest': RandomForestClassifier()
        }

    def run_experiment(self):
        results = {}
        X_train, X_test, y_train, y_test = self.data_processor.prepare_data()

        for model_name, model in self.models.items():
            model.fit(X_train, y_train)
            accuracy = model.score(X_test, y_test)
            cv_scores = cross_val_score(model, X_train, y_train, cv=5)

            results[model_name] = {
                'Accuracy': accuracy,
                'Cross-Validation Mean': np.mean(cv_scores),
                'Cross-Validation Std': np.std(cv_scores)
            }
        return results

class IrisModelOptimizer:
    def __init__(self, experiment):
        self.experiment = experiment

    def quantize_model(self):
        model = self.experiment.models['Logistic Regression']
        model.coef_ = np.round(model.coef_, 2)
class ModelTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        processor = IrisDataProcessor()
        cls.X_train, cls.X_test, cls.y_train, cls.y_test = processor.prepare_data()
        cls.experiment = IrisExperiment(processor)
        cls.optimizer = IrisModelOptimizer(cls.experiment)

    def test_data_split(self):
        self.assertTrue(len(self.X_train) > 0, "X_train should not be empty")

    def test_model_training(self):
        model = self.experiment.models['Logistic Regression']
        self.assertIsNotNone(model, "Model should be initialized")

processor = IrisDataProcessor()
X_train, X_test, y_train, y_test = processor.prepare_data()

experiment = IrisExperiment(processor)
results = experiment.run_experiment()

optimizer = IrisModelOptimizer(experiment)
optimizer.quantize_model()

print("Experiment Results:")
for model, metrics in results.items():
    print(f"\n{model}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

unittest.main(argv=[''], exit=False)

..
----------------------------------------------------------------------
Ran 2 tests in 0.002s

OK


Experiment Results:

Logistic Regression:
  Accuracy: 1.0000
  Cross-Validation Mean: 0.9429
  Cross-Validation Std: 0.0356

Random Forest:
  Accuracy: 1.0000
  Cross-Validation Mean: 0.9429
  Cross-Validation Std: 0.0356


<unittest.main.TestProgram at 0x14d8c7c50>