In [None]:
# Import libraries
import pprint
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
warnings.filterwarnings("ignore")


In [None]:
# Quick look at the raw file: parse only the first rows instead of the whole
# CSV, since the full dataset is loaded into `df` in the following cell.
pd.read_csv("breast-cancer.csv", nrows=5)

In [None]:
# Load the dataset into `df`, then report its dimensions and leading rows.
df = pd.read_csv("breast-cancer.csv")
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)
first_rows = df.head()
print(first_rows)

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) per numeric column.
summary_stats = df.describe()
print("\nSummary Statistics:\n", summary_stats)

In [None]:
# Number of null entries in each column.
missing_per_column = df.isna().sum()
print("\nMissing Values:\n", missing_per_column)

# Visualization

In [None]:
# Bar chart of how many rows fall into each diagnosis class.
count_ax = sns.countplot(data=df, x='diagnosis', palette="Set1")
count_ax.set_title("Class Distribution")
plt.show()

# Same information as relative frequencies.
class_share = df['diagnosis'].value_counts(normalize=True)
print(class_share)

In [None]:
# Interactive pie chart of the diagnosis classes (returned figure is the cell output).
px.pie(
    df,
    names='diagnosis',
    color='diagnosis',
    color_discrete_sequence=['#007500', '#5CFF5C'],
    title='Data Distribution',
)

In [None]:
# Box plots of the first five feature columns, split by diagnosis.
# The identifier column is excluded (when present) so that only real
# measurements are plotted, matching how later cells drop 'id'.
feature_cols = df.drop(['id', 'diagnosis'], axis=1, errors='ignore').columns[:5]
for column in feature_cols:
    fig = px.box(data_frame=df,x='diagnosis',color='diagnosis',y=column,color_discrete_sequence=['#007500','#5CFF5C'],orientation='v')
    fig.show()

In [None]:
# Encode the label into 1/0 (1 = 'M', 0 = anything else).
# Guarded so the cell is idempotent: once the column is numeric, comparing it
# with 'M' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = (df['diagnosis'] == 'M').astype(int)

# Pairwise correlations over numeric columns only, so a non-numeric column
# present in `df` cannot make corr() fail.
corr = df.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(20,20))
sns.heatmap(corr, cmap='viridis_r',annot=True)
plt.show()


In [None]:
# Rank every column by its correlation with the target and keep the first 11
# entries (the target itself plus the 10 columns that follow it).
diagnosis_corr = df.corr()['diagnosis']
corr = diagnosis_corr.sort_values(ascending=False).iloc[:11]
top_columns = corr.index

# Correlation matrix restricted to those columns.
top_corr_matrix = df[top_columns].corr()
plt.figure(figsize=(8,6))
sns.heatmap(top_corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Top Correlated Features with Diagnosis")
plt.show()


In [None]:
# Violin plots of five features, one panel per feature on a 2x3 grid
# (the sixth slot is left unused).
features = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean']
plt.figure(figsize=(12,6))
for i, feat in enumerate(features, 1):
    plt.subplot(2, 3, i)
    # `split` is intentionally not passed: it is meant for placing hue levels
    # side by side, and no hue variable is used here, so its effect would
    # depend on the installed seaborn version.
    sns.violinplot(x="diagnosis", y=feat, data=df, palette="muted")
    plt.title(f"{feat} by Diagnosis")
plt.tight_layout()
plt.show()

In [None]:
# One marker per row: radius_mean for each diagnosis class.
plt.figure(figsize=(10,6))
swarm_ax = sns.swarmplot(data=df, x="diagnosis", y="radius_mean", palette="Set1", alpha=0.7)
swarm_ax.set_title("Swarm Plot: Radius Mean by Diagnosis")
plt.show()

In [None]:
# Letter-value ("boxen") plot of area_mean for each diagnosis class.
plt.figure(figsize=(10,6))
boxen_ax = sns.boxenplot(data=df, x="diagnosis", y="area_mean", palette="Set2")
boxen_ax.set_title("Boxen Plot: Area Mean by Diagnosis")
plt.show()


In [None]:
# Histogram grid for every column except the identifier (ignored if absent).
columns_to_plot = df.drop(columns=['id'], errors='ignore')
columns_to_plot.hist(bins=30, figsize=(15,10), color="skyblue")
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()


In [None]:
# Divide radius_mean into 5 quantile bins. The bins are kept in a standalone
# Series rather than written back into `df`, so the shared frame does not gain
# a categorical column that numeric-only operations (e.g. df.corr()) cannot handle.
radius_bin = pd.qcut(df['radius_mean'], q=5).rename('radius_bin')

# Rows: radius bin, columns: diagnosis class, values: row counts.
ct = pd.crosstab(radius_bin, df['diagnosis'])

ct.plot(kind="bar", stacked=True, figsize=(10,6), colormap="coolwarm")
plt.title("Diagnosis Distribution across Radius Mean Bins")
plt.ylabel("Count")
plt.show()

In [None]:
# Overlaid density estimates of each feature for the two encoded classes
# (0 is labelled "Benign", 1 "Malignant").
features = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean']
is_malignant = df['diagnosis'] == 1
for feat in features:
    plt.figure(figsize=(6,4))
    # `fill=` is the supported spelling; `shade=` is deprecated in seaborn.
    sns.kdeplot(df.loc[df['diagnosis'] == 0, feat], label="Benign", fill=True)
    sns.kdeplot(df.loc[is_malignant, feat], label="Malignant", fill=True)
    plt.title(f"Distribution of {feat} by Diagnosis")
    plt.legend()
    plt.show()

In [None]:
# Box plot of radius_mean for each diagnosis class.
plt.figure(figsize=(10,6))
box_ax = sns.boxplot(data=df, x="diagnosis", y="radius_mean", palette="Set2")
box_ax.set_title("Radius Mean by Diagnosis")
plt.show()

In [None]:
# Row count per encoded target value.
target_ax = sns.countplot(data=df, x='diagnosis', palette='Set2')
target_ax.set_title("Target Variable Distribution (0 = Benign, 1 = Malignant)")
plt.show()

In [None]:
# Pairwise scatter plots for five features, coloured by the target column;
# the diagonal shows per-class density estimates.
pairplot_columns = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'diagnosis',
]
sns.pairplot(df[pairplot_columns], hue="diagnosis", diag_kind="kde")
plt.show()

In [None]:
#  Preprocessing

# Feature matrix: numeric columns only, without the identifier and the target.
X = df.drop(['id','diagnosis'], axis=1, errors='ignore')  # drop id if present
X = X.select_dtypes(include=[np.number])
y = df['diagnosis']

# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance (fitting the scaler on all rows leaks test-set statistics
# into the training data).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)        # same transformation applied to the test rows

# Standardised copy of the complete feature matrix, used by the cells that
# work on all rows at once (PCA, KMeans, feature scoring).
X_scaled = StandardScaler().fit_transform(X)

In [None]:
from sklearn.decomposition import PCA

# Project the standardised features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter of the projection, coloured by the target.
fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap="coolwarm", edgecolor="k", s=40)
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.set_title("PCA Visualization of Breast Cancer Data")
plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA

# Three-component projection; note this rebinds `pca` and `X_pca`.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(1, 1, 1, projection='3d')
# Columns of X_pca are the x, y and z coordinates respectively.
ax.scatter(*X_pca.T, c=y, cmap="coolwarm", s=50)
for set_label, text in ((ax.set_xlabel, "PC1"), (ax.set_ylabel, "PC2"), (ax.set_zlabel, "PC3")):
    set_label(text)
ax.set_title("3D PCA Visualization")
plt.show()


In [None]:
# Train SVM (Linear & RBF)

# Linear-kernel SVM: fit on the training split, predict the test split.
svm_linear = SVC(kernel="linear").fit(X_train, y_train)
y_pred_linear = svm_linear.predict(X_test)

linear_report = classification_report(y_test, y_pred_linear)
print("\n--- Linear SVM Performance ---")
print(linear_report)

In [None]:
# RBF-kernel SVM with default hyperparameters, same train/test protocol.
svm_rbf = SVC(kernel="rbf").fit(X_train, y_train)
y_pred_rbf = svm_rbf.predict(X_test)

rbf_report = classification_report(y_test, y_pred_rbf)
print("\n--- RBF SVM Performance ---")
print(rbf_report)

In [None]:
# Decision Boundary (2D Visualization)

# Use only two features (e.g., radius_mean, texture_mean) so the classifier's
# regions can be drawn on a plane.
X2 = df[['radius_mean', 'texture_mean']]
y2 = df['diagnosis']

# A dedicated scaler is used here: refitting the shared `scaler` on two
# columns would overwrite the one fitted for the full feature set.
scaler2 = StandardScaler()
X2_scaled = scaler2.fit_transform(X2)

# Stratified like the main train/test split so class proportions are kept.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2_scaled, y2, test_size=0.2, random_state=42, stratify=y2
)

clf2 = SVC(kernel='rbf', C=1, gamma=0.5)
clf2.fit(X2_train, y2_train)

In [None]:
# Meshgrid for decision boundary: a 200x200 lattice covering the scaled data
# range extended by one unit on every side.
lower_bounds = X2_scaled.min(axis=0) - 1
upper_bounds = X2_scaled.max(axis=0) + 1
x_min, y_min = lower_bounds
x_max, y_max = upper_bounds
x_ticks = np.linspace(x_min, x_max, 200)
y_ticks = np.linspace(y_min, y_max, 200)
xx, yy = np.meshgrid(x_ticks, y_ticks)

# Classify every lattice point, then fold the labels back into the grid shape.
mesh_points = np.column_stack([xx.ravel(), yy.ravel()])
Z = clf2.predict(mesh_points).reshape(xx.shape)

# Shaded class regions with the (scaled) samples drawn on top.
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.coolwarm)
plt.scatter(X2_scaled[:, 0], X2_scaled[:, 1], c=y2, edgecolors='k', cmap=plt.cm.coolwarm)
plt.xlabel("Radius Mean")
plt.ylabel("Texture Mean")
plt.title("SVM Decision Boundary (RBF Kernel)")
plt.show()

In [None]:
#  Hyperparameter Tuning

# Candidate values explored for the RBF kernel (4 x 4 combinations).
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf'],
)

# 5-fold grid search on the training split; the best setting is refitted on
# the whole training split afterwards.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5, refit=True, verbose=1)
grid.fit(X_train, y_train)

best_params, best_cv_score = grid.best_params_, grid.best_score_
print("\nBest Hyperparameters:", best_params)
print("Best Cross-Validation Score:", best_cv_score)

In [None]:
# Evaluate best model on the held-out test split.
tuned_model = grid.best_estimator_
y_pred_best = tuned_model.predict(X_test)

tuned_report = classification_report(y_test, y_pred_best)
print("\n--- Tuned RBF SVM Performance ---")
print(tuned_report)

In [None]:
#  Cross-validation with Linear SVM

# Scaling is placed inside the pipeline so that, within each fold, the scaler
# is fitted on that fold's training rows only. Cross-validating on the
# pre-scaled X_scaled would standardise with statistics computed from every
# row, including each fold's held-out rows.
linear_cv_model = make_pipeline(StandardScaler(), svm_linear)
cv_scores = cross_val_score(linear_cv_model, X, y, cv=5)
print("\nCross-validation Accuracy (Linear SVM):", cv_scores.mean())

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Univariate ANOVA F-scores of every feature against the target.
selector = SelectKBest(score_func=f_classif, k=10)
fit = selector.fit(X_scaled, y)

# Pair each feature name with its score and order from highest to lowest.
score_table = pd.DataFrame({'Feature': X.columns, 'Score': fit.scores_})
feature_scores = score_table.sort_values(by="Score", ascending=False)

top_ten = feature_scores.head(10)
print(top_ten)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc

# Confusion Matrix of the tuned model's test-set predictions
cm = confusion_matrix(y_test, y_pred_best)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Benign", "Malignant"])
disp.plot(cmap="Blues")
plt.show()

# ROC Curve
# A ROC curve needs a continuous score per sample so the threshold can be
# swept; hard 0/1 predictions only yield a single operating point. The signed
# distance to the SVM decision boundary is used as that score.
y_score = grid.best_estimator_.decision_function(X_test)
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, label=f"ROC curve (area = {roc_auc:.2f})")
plt.plot([0,1],[0,1],'--', color="gray")  # chance-level diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()


In [None]:
from sklearn.cluster import KMeans

# Unsupervised check: partition the standardised features into two clusters.
# n_init is set explicitly because its default differs between scikit-learn
# releases, which would otherwise make the clustering version-dependent.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Cluster assignments shown on the first two columns of the existing PCA
# projection (`X_pca` from the PCA cells above).
plt.figure(figsize=(8,6))
plt.scatter(X_pca[:,0], X_pca[:,1], c=clusters, cmap="coolwarm", s=40, edgecolor="k")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("KMeans Clustering (2 clusters)")
plt.show()


In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ============================
# Hyperparameter Tuning (RBF SVM)
# ============================
# Same search space as before; the estimator is built with probability=True
# so the refitted model exposes predict_proba.
regularisation_values = [0.1, 1, 10, 100]   # C: regularization parameter
gamma_values = [1, 0.1, 0.01, 0.001]        # gamma: kernel coefficient
param_grid = {'C': regularisation_values, 'gamma': gamma_values, 'kernel': ['rbf']}

grid = GridSearchCV(
    estimator=SVC(probability=True),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    refit=True,
    verbose=1,
)
grid.fit(X_train, y_train)

print("✅ Best Parameters:", grid.best_params_)
print("✅ Best CV Score:", grid.best_score_)

In [None]:
# Evaluate on test set: predictions from the refitted best estimator.
y_pred_best = grid.best_estimator_.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred_best)
test_report = classification_report(y_test, y_pred_best)
test_confusion = confusion_matrix(y_test, y_pred_best)

print("\n--- Tuned RBF SVM Performance ---")
print("Accuracy:", test_accuracy)
print(test_report)
print("Confusion Matrix:\n", test_confusion)

In [None]:
# Plot Confusion Matrix for the tuned model's test-set predictions.
tuned_cm = confusion_matrix(y_test, y_pred_best)
class_names = ["Benign","Malignant"]
disp = ConfusionMatrixDisplay(confusion_matrix=tuned_cm, display_labels=class_names)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Tuned RBF SVM")
plt.show()

In [None]:
# Cross-validation (Linear SVM)

# Linear SVM
svm_linear = SVC(kernel="linear", C=1)

# The scaler is part of the cross-validated pipeline so it is refitted on each
# fold's training rows; using the pre-scaled X_scaled would let every fold's
# held-out rows influence the scaling statistics.
linear_cv_model = make_pipeline(StandardScaler(), svm_linear)
cv_scores_linear = cross_val_score(linear_cv_model, X, y, cv=5, scoring='accuracy')
print("\nCross-validation Accuracy (Linear SVM):", cv_scores_linear.mean())

In [None]:
# Best RBF SVM (from GridSearch)
best_rbf = grid.best_estimator_

# Scale inside the pipeline so each fold standardises with its own training
# rows only (pre-scaled X_scaled leaks held-out rows into the scaler).
# NOTE: the hyperparameters were chosen by `grid` on X_train, which is a
# subset of these rows, so this score is likely somewhat optimistic.
rbf_cv_model = make_pipeline(StandardScaler(), best_rbf)
cv_scores_rbf = cross_val_score(rbf_cv_model, X, y, cv=5, scoring='accuracy')
print("Cross-validation Accuracy (Best RBF SVM):", cv_scores_rbf.mean())

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Predict probabilities with best RBF SVM; column 1 holds the score for class 1.
class_probabilities = grid.best_estimator_.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

# Compute ROC curve and AUC
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Plot ROC Curve together with the chance-level diagonal.
fig, ax = plt.subplots(figsize=(7,5))
ax.plot(fpr, tpr, color="blue", lw=2, label=f"RBF SVM (AUC = {roc_auc:.2f})")
ax.plot([0,1], [0,1], color="gray", linestyle="--", lw=1)
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Tuned RBF SVM")
ax.legend(loc="lower right")
ax.grid(True, linestyle="--", alpha=0.7)
plt.show()


In [None]:
# Cross-validation accuracy results (mean over folds) for both models.
linear_acc = cv_scores_linear.mean()
rbf_acc = cv_scores_rbf.mean()

# Bar chart: one bar per model, each annotated with its score.
models = ['Linear SVM', 'Tuned RBF SVM']
scores = [linear_acc, rbf_acc]

fig, ax = plt.subplots(figsize=(6,5))
ax.bar(models, scores, color=['skyblue','salmon'])
ax.set_ylabel("Mean CV Accuracy")
ax.set_ylim(0,1)
for position, value in enumerate(scores):
    # Label sits just above the top of its bar.
    ax.text(position, value+0.01, f"{value:.3f}", ha='center', fontsize=11, fontweight='bold')
ax.set_title("Cross-Validation Accuracy Comparison")
plt.show()
