# Model Training Notebook (Iris)
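This notebook mirrors `train.py`: it trains and compares two classifiers on the Iris dataset, then writes `model.pkl`, `metrics.json`, and `model_info.json` to the working directory, plus `dataset.csv`, `test.csv`, and `confusion_matrix.png` under `data/`.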

In [None]:
from pathlib import Path
import json, numpy as np, pandas as pd
from sklearn.datasets import load_iris
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# Directory for generated data artifacts; created up front so later writes
# into it cannot fail on a missing folder.
DATA_DIR = Path('data')
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Load Iris as a DataFrame. The label column is already named 'target', so
# the former self-rename ('target' -> 'target') was a no-op and is dropped.
iris = load_iris(as_frame=True)
df = iris.frame.copy()
X = df.drop(columns=['target'])
y = df['target']

# Stratified 80/20 split with a fixed seed so reruns give the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Every column of X is passed through the scaler.
numeric_features = X.columns.tolist()


def _make_preprocessor(columns):
    """Return a fresh, unfitted preprocessor that standard-scales `columns`.

    A new instance is built per pipeline because a Pipeline without `memory`
    fits its steps in place: sharing one transformer object between two
    pipelines would let fitting one silently refit the other's preprocessor.
    """
    return ColumnTransformer([('num', StandardScaler(), columns)], remainder='drop')


# `pre` is kept as a module-level name; it now belongs to `logreg` only.
pre = _make_preprocessor(numeric_features)
logreg = Pipeline([('prep', pre), ('clf', LogisticRegression(max_iter=1000))])
rf = Pipeline([
    ('prep', _make_preprocessor(numeric_features)),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Score each candidate with shuffled, seeded 5-fold stratified CV on the
# training split, recording the mean and standard deviation of fold accuracy.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
compare = {}
for name, pipe in (('LogReg', logreg), ('RandomForest', rf)):
    cv = cross_val_score(pipe, X_train, y_train, cv=skf, scoring='accuracy')
    compare[name] = dict(cv_mean=float(cv.mean()), cv_std=float(cv.std()))

# Fit every candidate on the training split and keep the one with the highest
# accuracy on the held-out split. The strict '>' means the first candidate
# wins ties.
# NOTE(review): the winner is chosen on the same split that `best_acc` is
# measured on, so that figure is optimistically biased; selecting on the
# cross-validation scores instead may be preferable -- confirm against train.py.
best, best_acc = None, -1
for name, pipe in (('LogReg', logreg), ('RandomForest', rf)):
    acc = accuracy_score(y_test, pipe.fit(X_train, y_train).predict(X_test))
    if acc > best_acc:
        best = (name, pipe)
        best_acc = acc

# Serialize the winning fitted pipeline (second element of `best`).
with Path('model.pkl').open('wb') as f:
    pickle.dump(best[1], f)

# Export the full dataset, then the held-out rows with their labels re-attached.
df.to_csv(DATA_DIR / 'dataset.csv', index=False)
test_df = X_test.assign(target=y_test.values)
test_df.to_csv(DATA_DIR / 'test.csv', index=False)
# Confusion matrix of the winning pipeline on the held-out split, rendered
# as a heatmap with a colorbar and saved as a PNG.
cm = confusion_matrix(y_test, best[1].predict(X_test))
fig, ax = plt.subplots()
im = ax.imshow(cm)
ax.set_title('Confusion Matrix')
fig.colorbar(im)
fig.savefig(DATA_DIR / 'confusion_matrix.png')
# Close the figure explicitly so it is released once saved.
plt.close(fig)
# Summary of the selection: winning model name, its held-out accuracy, and
# the per-model cross-validation statistics.
metrics = {
    'best_model_name': best[0],
    'best_test_accuracy': float(best_acc),
    'comparison': compare,
}
# Explicit UTF-8 keeps the output independent of the platform's locale encoding.
with open('metrics.json', 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)

# Metadata describing the feature columns, label column, and class labels.
model_info = {
    'feature_names': numeric_features,
    'target_name': 'target',
    'class_names': iris.target_names.tolist(),
}
with open('model_info.json', 'w', encoding='utf-8') as f:
    json.dump(model_info, f, indent=2)

print('Notebook training complete.')
