In [1]:
import os

import joblib
import pandas as pd
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Configuration: every value a reader might want to tune, in one place.
DATA_PATH = 'iris.csv'
MODEL_PATH = '../app/iris.mdl'
TARGET_COLUMN = 'species'
TEST_SIZE = 0.3
RANDOM_STATE = 23  # shared by the split and the classifier so a re-run is reproducible

# Load the data
data = pd.read_csv(DATA_PATH)

# Features and target: the target column is the label, every other column is a feature
y = data[TARGET_COLUMN]
X = data.drop(columns=[TARGET_COLUMN])

# Train-test split (TEST_SIZE held out for evaluation, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Create a median imputer
imputer = SimpleImputer(strategy='median')

# ColumnTransformer that applies the imputer to all feature columns
preprocessor = ColumnTransformer(
    transformers=[
        ('imputer', imputer, X.columns)
    ]
)

# Random Forest Classifier seeded with RANDOM_STATE
clf = RandomForestClassifier(random_state=RANDOM_STATE)

# Pipeline with preprocessor and classifier, so the imputer is fitted on
# the training split only and is saved together with the model
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', clf)
])

# Train the pipeline
pipe.fit(X_train, y_train)

# Predict on test set
y_pred = pipe.predict(X_test)

# Print classification report
print(metrics.classification_report(y_test, y_pred))

# Make sure the destination directory exists before saving; otherwise
# joblib.dump raises FileNotFoundError after training has already finished.
# The guard skips makedirs when MODEL_PATH has no directory component,
# because os.makedirs('') would itself raise.
model_dir = os.path.dirname(MODEL_PATH)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# Save pipeline to file (kept as the last expression so the cell still
# displays the list of written file names)
joblib.dump(pipe, MODEL_PATH)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        18
Iris-versicolor       0.93      1.00      0.97        14
 Iris-virginica       1.00      0.92      0.96        13

       accuracy                           0.98        45
      macro avg       0.98      0.97      0.98        45
   weighted avg       0.98      0.98      0.98        45



['../app/iris.mdl']