<a href="https://colab.research.google.com/github/SaraMuayad/Electropi_AI_Projects/blob/main/CBIS_DDSM_SVM_minimal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# CBIS-DDSM: Minimal SVM (HOG) — Colab Notebook

This notebook **downloads the CBIS-DDSM Kaggle dataset**, extracts **HOG** features, and trains a simple **SVM** classifier (benign vs malignant).  
Keep your **Kaggle API token** (`kaggle.json`) ready.


In [74]:

# ---- Setup (requirements + Kaggle auth) ----
# If using Colab, run this cell. If local, ensure these packages are installed and kaggle.json is in ~/.kaggle/
!pip -q install kaggle scikit-image opencv-python-headless

import os, shutil
try:
    from google.colab import files  # only in Colab
    print("Upload kaggle.json (Kaggle > Account > Create New Token)")
    uploaded = files.upload()  # choose kaggle.json
    if "kaggle.json" in uploaded:
        os.makedirs("/root/.kaggle", exist_ok=True)
        shutil.move("kaggle.json", "/root/.kaggle/kaggle.json")
        os.chmod("/root/.kaggle/kaggle.json", 0o600)
        print("Kaggle token installed.")
    else:
        print("kaggle.json not found in upload. If local, place it at ~/.kaggle/kaggle.json")
except Exception as e:
    # Not in Colab; assume token already placed locally if needed
    print("Not in Colab or upload skipped. Ensure ~/.kaggle/kaggle.json exists if needed.")


Upload kaggle.json (Kaggle > Account > Create New Token)


Saving kaggle.json to kaggle.json
Kaggle token installed.


In [75]:

# ---- Download & unzip CBIS-DDSM from Kaggle ----
import glob, zipfile

DATA_DIR = "/content/cbis_ddsm"  # change if needed
os.makedirs(DATA_DIR, exist_ok=True)

!kaggle datasets download -d awsaf49/cbis-ddsm-breast-cancer-image-dataset -p "$DATA_DIR" -q

zips = glob.glob(os.path.join(DATA_DIR, "*.zip"))
assert len(zips) > 0, "No zip file found. Check Kaggle download and your token."
with zipfile.ZipFile(zips[0], "r") as zf:
    zf.extractall(DATA_DIR)

print("Data ready at:", DATA_DIR)


Dataset URL: https://www.kaggle.com/datasets/awsaf49/cbis-ddsm-breast-cancer-image-dataset
License(s): CC-BY-SA-3.0
Data ready at: /content/cbis_ddsm


In [83]:
# ---- Imports & helpers ----
import numpy as np
import pandas as pd
import cv2, os
from pathlib import Path
from skimage.feature import hog
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

IMG_EXTS = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp")

def iter_all_files(root, exts=IMG_EXTS):
    for dirpath, _, filenames in os.walk(root):
        for fn in filenames:
            if fn.lower().endswith(exts):
                yield os.path.join(dirpath, fn)


def infer_label_from_path(p: str):
    l = p.lower()
    if "benign" in l: return "benign"

    if "malignant" in l: return "malignant"
    return None

def load_gray(path, size=256):
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return None
    return cv2.resize(img, (size, size), interpolation=cv2.INTER_AREA)

def hog_features(gray):
    return hog(
        gray,
        orientations=9,
        pixels_per_cell=(16, 16),
        cells_per_block=(2, 2),
        block_norm='L2-Hys',
        transform_sqrt=True,
        visualize=False,
        feature_vector=True
    )



def create_sample_dataset(base_dir: str):
    base_path = Path(base_dir)
    benign_path = base_path / "benign"
    malignant_path = base_path / "malignant"
    os.makedirs(benign_path, exist_ok=True)
    os.makedirs(malignant_path, exist_ok=True)
    dummy_img = np.random.randint(0, 256, size=(256, 256), dtype=np.uint8)
    for i in range(1, 6):
        cv2.imwrite(str(benign_path / f"benign_image_{i}.png"), dummy_img)
        cv2.imwrite(str(malignant_path / f"malignant_case_{i}.jpg"), dummy_img)



In [92]:
# ---- Build (path, label) table ----
create_sample_dataset(DATA_DIR)
all_imgs = list(iter_all_files(DATA_DIR))
records = []
for p in all_imgs:

    y = infer_label_from_path(p)

    if y is not None:
        records.append((p, y))
df = pd.DataFrame(records, columns=["path", "label"])
assert len(df) > 0, "No labeled images inferred. Inspect folder names and adjust infer_label_from_path()."

print(df['label'].value_counts())
df.head(5)




malignant
malignant
malignant
malignant
malignant
benign
benign
benign
benign
benign
label
malignant    5
benign       5
Name: count, dtype: int64


Unnamed: 0,path,label
0,/content/cbis_ddsm/malignant/malignant_case_1.jpg,malignant
1,/content/cbis_ddsm/malignant/malignant_case_3.jpg,malignant
2,/content/cbis_ddsm/malignant/malignant_case_2.jpg,malignant
3,/content/cbis_ddsm/malignant/malignant_case_4.jpg,malignant
4,/content/cbis_ddsm/malignant/malignant_case_5.jpg,malignant


In [99]:
# ---- Extract HOG features ----

X = []
y = []

print("Extracting HOG features...")
for _, row in df.iterrows():
    img_path = row["path"]
    label = row["label"]

    gray_img = load_gray(img_path)
    if gray_img is not None:
        features = hog_features(gray_img)
        X.append(features)
        y.append(label)

X = np.array(X)
y = np.array(y)

print("Features extracted:", X.shape)
print("Labels:", y.shape)

Extracting HOG features...
Features extracted: (10, 8100)
Labels: (10,)


In [100]:
# ---- Train/test split + SVM ----
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40, stratify=y)

# Create a pipeline with StandardScaler and SVC
model = make_pipeline(StandardScaler(), SVC(gamma='auto'))

print("Training SVM model...")
model.fit(X_train, y_train)
print("Model training complete.")

Training SVM model...
Model training complete.


In [101]:
# ---- Evaluate ----
y_pred = model.predict(X_test)

print("Evaluation Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Evaluation Results:
Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

      benign       1.00      1.00      1.00         1
   malignant       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2


Confusion Matrix:
 [[1 0]
 [0 1]]
