
# Predictive Analysis Using Machine Learning — Classification

**Goal:** Build and evaluate a machine learning model to predict outcomes from a dataset, demonstrating **feature selection**, **model training**, and **evaluation**.

**Dataset:** Breast Cancer Wisconsin (Diagnostic) — available via scikit-learn.

**Outline:**
1. Load and explore data
2. Split data; create preprocessing & baseline model
3. Perform feature selection
4. Train tuned models
5. Evaluate with metrics & visualizations
6. Inspect feature importance and conclude


In [None]:

# Imports
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, RocCurveDisplay
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Ensure reproducibility: a single seed constant for the notebook, also used
# here to seed NumPy's global random number generator.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


In [None]:

# Load the Breast Cancer Wisconsin data bundled with scikit-learn and wrap it
# in pandas containers: X holds the feature columns, y the target labels.
data = load_breast_cancer()
y = pd.Series(data.target, name="target")
X = pd.DataFrame(data.data, columns=data.feature_names)

# Preview the first rows of both (rendered as the cell output).
(X.head(), y.head())


In [None]:

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in the train and test partitions.
split_options = {"test_size": 0.2, "stratify": y, "random_state": RANDOM_STATE}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the resulting partition sizes (rendered as the cell output).
(X_train.shape, X_test.shape)



## Baseline Model (No Feature Selection)

We'll start with a logistic regression model on all features to establish a baseline.


In [None]:

# Baseline: standardize every column, then fit logistic regression on all of them.
numeric_features = list(X.columns)

scale_all_columns = ("num", StandardScaler(), numeric_features)
preprocess = ColumnTransformer(transformers=[scale_all_columns], remainder="drop")

baseline_clf = Pipeline(
    steps=[
        ("prep", preprocess),
        ("clf", LogisticRegression(max_iter=500, random_state=RANDOM_STATE)),
    ]
)
baseline_clf.fit(X_train, y_train)

# Hard labels feed the threshold-based metrics; the class-1 probability column
# feeds ROC AUC.
y_pred_base = baseline_clf.predict(X_test)
y_prob_base = baseline_clf.predict_proba(X_test)[:, 1]

# Metrics computed from predicted labels, in the order they are reported.
label_scorers = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1": f1_score,
}
baseline_metrics = {
    metric_name: scorer(y_test, y_pred_base)
    for metric_name, scorer in label_scorers.items()
}
baseline_metrics["ROC_AUC"] = roc_auc_score(y_test, y_prob_base)
baseline_metrics



## Feature Selection

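We'll use **mutual information** with `SelectKBest` to score features and keep the most informative ones. The number of features to keep (`k`) is chosen by 5-fold cross-validated ROC AUC on the training set.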


In [None]:

# Try several k values and pick the one with the best cross-validated ROC AUC.
from functools import partial

k_values = [5, 8, 10, 12, 15, 20, X_train.shape[1]]

# mutual_info_classif adds small random noise to continuous features. Fixing
# its random_state keeps the selected features (and therefore best_k) the same
# from run to run, including inside the parallel cross-validation workers.
seeded_mutual_info = partial(mutual_info_classif, random_state=RANDOM_STATE)

pipe_fs = Pipeline(steps=[
    ("prep", preprocess),
    ("select", SelectKBest(score_func=seeded_mutual_info)),
    ("clf", LogisticRegression(max_iter=500, random_state=RANDOM_STATE))
])

# Only the number of kept features is searched; scaling and selection are
# refit inside each fold because they are pipeline steps.
param_grid = {"select__k": k_values}
grid_fs = GridSearchCV(pipe_fs, param_grid=param_grid, cv=5, scoring="roc_auc", n_jobs=-1)
grid_fs.fit(X_train, y_train)

best_k = grid_fs.best_params_["select__k"]
best_k, grid_fs.best_score_


In [None]:

# The grid search exposes its winning pipeline as best_estimator_ (available
# because GridSearchCV refits it on the data passed to fit), so no extra fit
# is needed here: just score it on the held-out test split.
best_fs_model = grid_fs.best_estimator_
y_prob_fs = best_fs_model.predict_proba(X_test)[:, 1]
y_pred_fs = best_fs_model.predict(X_test)

# Metrics computed from predicted labels, in the order they are reported.
label_scorers = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1": f1_score,
}
fs_metrics = {
    metric_name: scorer(y_test, y_pred_fs)
    for metric_name, scorer in label_scorers.items()
}
fs_metrics["ROC_AUC"] = roc_auc_score(y_test, y_prob_fs)
fs_metrics



## Model Training with an Alternative Classifier

We'll also train a **RandomForestClassifier** behind the same preprocessing + feature selection block and tune three key hyperparameters (`n_estimators`, `max_depth`, and `min_samples_split`).


In [None]:

# Random forest behind the same scaling + SelectKBest(k=best_k) front end,
# tuned with a small grid search scored by cross-validated ROC AUC.
from functools import partial

# mutual_info_classif adds small random noise to continuous features. Fixing
# its random_state keeps the set of selected features stable across folds,
# parallel workers, and notebook reruns.
seeded_mutual_info = partial(mutual_info_classif, random_state=RANDOM_STATE)

pipe_rf = Pipeline(steps=[
    ("prep", preprocess),
    ("select", SelectKBest(score_func=seeded_mutual_info, k=best_k)),
    ("rf", RandomForestClassifier(random_state=RANDOM_STATE))
])

param_grid_rf = {
    "rf__n_estimators": [100, 300],
    "rf__max_depth": [None, 5, 10],
    "rf__min_samples_split": [2, 5]
}

grid_rf = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, scoring="roc_auc", n_jobs=-1)
grid_rf.fit(X_train, y_train)

# Evaluate the best pipeline found by the search on the held-out test split.
y_pred_rf = grid_rf.best_estimator_.predict(X_test)
y_prob_rf = grid_rf.best_estimator_.predict_proba(X_test)[:, 1]

rf_metrics = {
    "Accuracy": accuracy_score(y_test, y_pred_rf),
    "Precision": precision_score(y_test, y_pred_rf),
    "Recall": recall_score(y_test, y_pred_rf),
    "F1": f1_score(y_test, y_pred_rf),
    "ROC_AUC": roc_auc_score(y_test, y_prob_rf),
}

grid_rf.best_params_, rf_metrics



## Evaluation: Confusion Matrix & ROC Curves


In [None]:

# Confusion matrices for the three models, one figure each.
from sklearn.metrics import ConfusionMatrixDisplay

confusion_panels = [
    ("Confusion Matrix — Baseline Logistic Regression", y_pred_base),
    ("Confusion Matrix — Logistic Regression with Feature Selection", y_pred_fs),
    ("Confusion Matrix — Random Forest with Feature Selection", y_pred_rf),
]

for panel_title, predicted_labels in confusion_panels:
    # Create the axes up front and pass them in: from_predictions opens its
    # own figure when ax is omitted, which would leave a blank extra figure
    # behind for every plot.
    fig, ax = plt.subplots()
    ConfusionMatrixDisplay.from_predictions(y_test, predicted_labels, ax=ax)
    ax.set_title(panel_title)
    plt.show()


In [None]:

# ROC curves for the three models, one figure each.
roc_panels = [
    ("ROC Curve — Baseline Logistic Regression", y_prob_base),
    ("ROC Curve — Logistic Regression with Feature Selection", y_prob_fs),
    ("ROC Curve — Random Forest with Feature Selection", y_prob_rf),
]

for panel_title, class1_probabilities in roc_panels:
    # Create the axes up front and pass them in: from_predictions opens its
    # own figure when ax is omitted, which would leave a blank extra figure
    # behind for every plot.
    fig, ax = plt.subplots()
    RocCurveDisplay.from_predictions(y_test, class1_probabilities, ax=ax)
    ax.set_title(panel_title)
    plt.show()



## Feature Scores and Importances

We'll inspect which features were selected and their mutual information scores. For tree-based models, we'll also inspect feature importances.


In [None]:

# Read the scores from the SelectKBest step that was fitted inside the best
# logistic regression pipeline. Refitting a separate selector here could pick
# different features, because mutual_info_classif is randomized unless seeded,
# so the table would no longer describe the model that was evaluated.
selector = grid_fs.best_estimator_.named_steps["select"]

# The single "num" transformer scales the columns listed in numeric_features
# in order, so the selector's boolean mask lines up with that list.
selected_mask = selector.get_support()
selected_features = np.asarray(numeric_features)[selected_mask]
scores = selector.scores_[selected_mask]

feat_scores = pd.DataFrame(
    {"feature": selected_features, "mutual_info": scores}
).sort_values("mutual_info", ascending=False)
feat_scores.head(best_k)


In [None]:

# Horizontal bar chart of the mutual information scores in feat_scores.
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(feat_scores["feature"], feat_scores["mutual_info"])
# Flip the y axis so the first row of feat_scores is drawn at the top.
ax.invert_yaxis()
ax.set_xlabel("Mutual Information Score")
ax.set_title("Top Features by Mutual Information")
fig.tight_layout()
plt.show()


In [None]:

# Feature importances from the tuned RandomForest pipeline.
rf_pipeline = grid_rf.best_estimator_
rf_model = rf_pipeline.named_steps["rf"]

# Take the feature names from the selector fitted inside this same pipeline.
# The forest only saw the columns that selector kept, so its importances must
# be labelled with that mask; a selector fitted elsewhere may have kept a
# different set of columns.
rf_selected_mask = rf_pipeline.named_steps["select"].get_support()
selected_feature_names = np.asarray(numeric_features)[rf_selected_mask]

importances = rf_model.feature_importances_
imp_df = pd.DataFrame(
    {"feature": selected_feature_names, "importance": importances}
).sort_values("importance", ascending=False)
imp_df.head(best_k)


In [None]:

# Horizontal bar chart of the random forest importances in imp_df.
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(imp_df["feature"], imp_df["importance"])
# Flip the y axis so the first row of imp_df is drawn at the top.
ax.invert_yaxis()
ax.set_xlabel("Importance")
ax.set_title("Random Forest Feature Importances")
fig.tight_layout()
plt.show()



## Model Comparison


In [None]:

# One row per model: a display label followed by that model's test metrics.
labelled_runs = [
    ("Baseline LogisticRegression (all features)", baseline_metrics),
    (f"LogisticRegression + SelectKBest(k={best_k})", fs_metrics),
    ("RandomForest + SelectKBest(best k)", rf_metrics),
]
results = pd.DataFrame(
    [{"Model": run_label, **run_metrics} for run_label, run_metrics in labelled_runs]
)

# Show the models ranked from highest to lowest test ROC AUC.
results.sort_values(by="ROC_AUC", ascending=False)



## Conclusion

- We established a strong baseline with Logistic Regression.
- Using **feature selection (SelectKBest with mutual information)** helped identify the most informative features and can slightly improve performance and interpretability.
- An alternative **Random Forest** model provided a useful comparison; depending on hyperparameters and selected features, it can achieve similar or better ROC AUC.
- This notebook demonstrates a full workflow: preprocessing, feature selection, model training, hyperparameter tuning, and evaluation with confusion matrices and ROC curves.
