In [3]:
import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# End-to-end training script: load the CSV, build a preprocessing + random
# forest pipeline, evaluate it on a stratified hold-out split, and persist the
# fitted pipeline with joblib.

# --- Load ------------------------------------------------------------------
file_path = "breast-cancer.csv"
df = pd.read_csv(file_path)

# 'id' is a row identifier, not a predictive feature.
if 'id' in df.columns:
    df = df.drop(columns=['id'])

# Columns that are empty in every row carry no information and cannot be
# imputed from their own values, so remove them before selecting features.
df = df.dropna(axis=1, how='all')

# --- Target encoding -------------------------------------------------------
# LabelEncoder assigns integer codes in sorted label order; `le` is kept so
# the codes can be mapped back to the original labels via le.classes_.
le = LabelEncoder()
df['diagnosis'] = le.fit_transform(df['diagnosis'])

X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# --- Feature groups --------------------------------------------------------
# Select every numeric dtype, not only int64/float64: ColumnTransformer drops
# unlisted columns by default, so a narrower filter would silently discard
# e.g. int32/float32 features.
num_features = X.select_dtypes(include='number').columns.tolist()
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# --- Preprocessing ---------------------------------------------------------
# Imputation lives inside the pipeline so that it is fitted on the training
# split only and is applied again, unchanged, when the saved pipeline is used
# for prediction.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ]
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# --- Train / evaluate ------------------------------------------------------
# Stratify so both splits keep the class proportions of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# Precision, recall and F1 use scikit-learn's default positive label (1).
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\n Evaluation Metrics:")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1-score: {f1:.4f}")

# --- Persist ---------------------------------------------------------------
# The dumped object contains both preprocessing and the classifier, so it
# accepts raw feature columns at prediction time.
joblib.dump(pipeline, "breast_cancer_ml_pipeline.pkl")
print("\n Pipeline saved as 'breast_cancer_ml_pipeline.pkl'")


 Evaluation Metrics:
Accuracy: 0.9737
Precision: 1.0000
Recall: 0.9286
F1-score: 0.9630

 Pipeline saved as 'breast_cancer_ml_pipeline.pkl'


# Task 15: End-to-End ML Pipeline (Breast Cancer)

**Tools:** Python, Pandas, NumPy, Scikit-learn, Joblib  

## Steps:
1. **Load Dataset:** Import CSV file and inspect data.  
2. **Preprocessing:**  
   - Drop irrelevant columns (`id`)  
   - Encode target variable (`diagnosis`: M=1, B=0)  
   - Fill missing values (if any)  
   - Scale numerical features using `StandardScaler`  
   - Encode categorical features using `OneHotEncoder` (if present)  
3. **Train-Test Split:** 80-20 split, stratified on target.  
4. **Pipeline Creation:**  
   - Use `ColumnTransformer` for preprocessing  
   - Combine with `RandomForestClassifier` in a `Pipeline`  
5. **Train Pipeline:** Fit on training data.  
6. **Predictions & Evaluation:** Predict on the held-out test set and report Accuracy, Precision, Recall, and F1-score (positive class: malignant, encoded as 1)  
7. **Save Pipeline:** Save the complete fitted pipeline (preprocessing + classifier) with `joblib.dump` as a `.pkl` file for deployment; reload it with `joblib.load`  

## Deliverables:
- Trained ML pipeline  
- Evaluation metrics  
- Saved pipeline model (`breast_cancer_ml_pipeline.pkl`)
