# In [None]:
# -----------------------------
# 1. Imports
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib
import json
import os

# -----------------------------
# 2. Load dataset
# -----------------------------
# The CSV is expected at 'data/breast_cancer.csv', relative to the working directory
data_path = os.path.join("data", "breast_cancer.csv")
data = pd.read_csv(filepath_or_buffer=data_path)

# Sanity check: print the dimensions and the first five rows
print("Dataset shape:", data.shape)
print(data.head(n=5))

# -----------------------------
# 3. Preprocessing
# -----------------------------
# Example preprocessing steps
# Discard columns that hold no data at all before dropping incomplete rows:
# a single all-NaN column (for instance one produced by a trailing comma in
# the CSV header) would otherwise make the row-wise dropna() below remove
# every sample.
data = data.dropna(axis=1, how='all')

# Drop rows that still contain missing values
data = data.dropna()
if data.empty:
    # Fail here with a readable message instead of deep inside the split.
    raise ValueError(
        "No samples left after dropping rows with missing values; "
        "check the dataset for sparsely populated columns."
    )

# Assume target column is 'diagnosis' (adjust if needed)
# Encode the target as integer category codes whenever it is not already
# numeric. Testing for "not numeric" rather than dtype == 'object' also
# covers the dedicated string dtypes pandas can infer for text columns.
if not pd.api.types.is_numeric_dtype(data['diagnosis']):
    data['diagnosis'] = data['diagnosis'].astype('category').cat.codes

# Split features and target
# NOTE(review): every remaining column is used as a feature; if the CSV
# carries a row identifier (e.g. an 'id' column), confirm and drop it here.
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

# Train/test split (80/20); stratify=y keeps the class proportions the same
# in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])

# -----------------------------
# 4. Model Training
# -----------------------------
# Build a 100-tree random forest (fixed seed, n_jobs=-1) and fit it on the
# training split in one expression; fit() hands back the estimator itself.
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(
    X_train, y_train
)

# -----------------------------
# 5. Evaluation
# -----------------------------
# Predict on the held-out split, then score it with accuracy and weighted F1
y_pred = clf.predict(X_test)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score (weighted): {f1:.4f}")

# -----------------------------
# 6. Save metrics
# -----------------------------
# Write the evaluation scores to artifacts/metrics.json, creating the
# directory on first use. Scores are cast to plain floats before serializing.
metrics_file = os.path.join("artifacts", "metrics.json")
metrics = {
    'accuracy': float(accuracy),
    'f1_score': float(f1),
}
os.makedirs("artifacts", exist_ok=True)
with open(metrics_file, 'w') as f:
    f.write(json.dumps(metrics))
print(f"Metrics saved to {metrics_file}")

# -----------------------------
# 7. Save model
# -----------------------------
# Serialize the fitted classifier to models/model.pkl with joblib,
# creating the target directory if it does not exist yet.
model_file = os.path.join("models", "model.pkl")
os.makedirs("models", exist_ok=True)
joblib.dump(value=clf, filename=model_file)
print(f"Model saved to {model_file}")

# -----------------------------
# 8. Optional: Feature importance
# -----------------------------
import matplotlib.pyplot as plt

# Pair each importance score with its feature name and rank them from most
# to least important.
feat_importances = pd.Series(
    clf.feature_importances_, index=X.columns
).sort_values(ascending=False)

# Draw the ranking as a bar chart
plt.figure(figsize=(10, 6))
feat_importances.plot.bar()
plt.title("Feature Importances")
plt.tight_layout()
plt.show()
