In [None]:
!pip install requests pandas tensorflow scikit-learn opencv-python tqdm pydicom

In [8]:

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tqdm.auto import tqdm
import cv2
from PIL import Image
from pydicom import dcmread
import requests
import json


In [2]:

# API URL
url = "https://api.gdc.cancer.gov/cases"

# API Parameters
params = {
    "filters": json.dumps({
        "op": "in",
        "content": {
            "field": "project.project_id",
            "value": ["TCGA-BRCA"]
        }
    }),
    "fields": "case_id,demographic.gender,demographic.race,diagnoses.age_at_diagnosis",
    "size": "2000"
}

# Fetch data from the API
response = requests.get(url, params=params)
if response.status_code == 200:
    clinical_data = pd.json_normalize(response.json()["data"]["hits"])
    print(f"Retrieved {len(clinical_data)} clinical records.")
    print(clinical_data.head())  # Display the first few rows
else:
    print(f"Failed to fetch clinical data: {response.status_code}")


Retrieved 1098 clinical records.
                                     id                               case_id  \
0  3c612e12-6de8-44fa-a095-805c45474821  3c612e12-6de8-44fa-a095-805c45474821   
1  3cb06c7a-f2a8-448b-91a8-dd201bbf2ddd  3cb06c7a-f2a8-448b-91a8-dd201bbf2ddd   
2  3d676bba-154b-4d22-ab59-d4d4da051b94  3d676bba-154b-4d22-ab59-d4d4da051b94   
3  dfaabd03-2d40-4422-b210-caf112ff4229  dfaabd03-2d40-4422-b210-caf112ff4229   
4  dfd0b7ba-c7d3-498e-b455-346301865452  dfd0b7ba-c7d3-498e-b455-346301865452   

                       diagnoses           demographic.race demographic.gender  
0  [{'age_at_diagnosis': 21369}]                      white             female  
1  [{'age_at_diagnosis': 19027}]                      white             female  
2  [{'age_at_diagnosis': 10564}]                      white             female  
3  [{'age_at_diagnosis': 26535}]  black or african american             female  
4  [{'age_at_diagnosis': 22751}]                      white             fem

In [3]:

# Extract 'age_at_diagnosis' from the nested 'diagnoses' field
clinical_data['age_at_diagnosis'] = clinical_data['diagnoses'].apply(
    lambda x: x[0]['age_at_diagnosis'] if isinstance(x, list) and len(x) > 0 else None
)

# Verify the extraction
print(clinical_data[['age_at_diagnosis']].head())

# Preprocess the clinical data
clinical_data['gender'] = LabelEncoder().fit_transform(clinical_data['demographic.gender'])
clinical_data['race'] = LabelEncoder().fit_transform(clinical_data['demographic.race'])

# Standardize numerical data
scaler = StandardScaler()
clinical_features = clinical_data[['age_at_diagnosis', 'gender', 'race']].dropna()

# Ensure no missing values in features before standardization
X_structured = scaler.fit_transform(clinical_features)

# Generate simulated labels for demonstration purposes
y_structured = np.random.randint(0, 2, len(X_structured))

# Output the processed features and labels
print("Processed features shape:", X_structured.shape)
print("Generated labels shape:", y_structured.shape)


   age_at_diagnosis
0           21369.0
1           19027.0
2           10564.0
3           26535.0
4           22751.0
Processed features shape: (1082, 3)
Generated labels shape: (1082,)


In [4]:
metadata_file_path = r"D:\PROJECTS_FINAL\Cancer Treatment Prediction\final stuff\manifest-1732338211342\metadata.csv"
base_dir = r"D:\PROJECTS_FINAL\Cancer Treatment Prediction\final stuff\manifest-1732338211342"
processed_images_dir = os.path.join(base_dir, "breast_cancer_images_png")

# Ensure output directory exists
os.makedirs(processed_images_dir, exist_ok=True)

# Load metadata
metadata = pd.read_csv(metadata_file_path)

# Update paths in the metadata to absolute paths
metadata['Absolute Path'] = metadata['File Location'].apply(lambda x: os.path.join(base_dir, x.lstrip(".\\")))

IMG_SIZE = (128, 128)
images, labels = [], []

# Process each folder listed in the metadata
for folder_path in tqdm(metadata['Absolute Path'], desc="Processing DICOM folders"):
    if not os.path.exists(folder_path):
        print(f"Folder not found: {folder_path}, skipping.")
        continue

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if not file_name.endswith(".dcm"):
            continue

        try:
            dicom = dcmread(file_path)
            if 'PixelData' not in dicom:
                continue

            pixel_array = dicom.pixel_array
            img = Image.fromarray(pixel_array)
            img = img.resize(IMG_SIZE)
            img.save(os.path.join(processed_images_dir, f"{os.path.basename(folder_path)}_{file_name.replace('.dcm', '.png')}"))

            images.append(np.array(img) / 255.0)
            labels.append(0)  # Adjust labeling logic as needed

        except Exception as e:
            print(f"Error processing file {file_path}: {e}")

X_images = np.array(images).reshape(-1, IMG_SIZE[0], IMG_SIZE[1], 1)
y_images = np.array(labels)
print(f"Processed {len(X_images)} images.")
print(f"Label distribution: {np.unique(y_images, return_counts=True)}")


Processing DICOM folders:   0%|          | 0/23 [00:00<?, ?it/s]

Processed 923 images.
Label distribution: (array([0]), array([923], dtype=int64))


In [5]:
X_images = np.array(images).reshape(-1, IMG_SIZE[0], IMG_SIZE[1], 1)
y_images = np.array(labels)

print(f"Processed images shape: {X_images.shape}")
print(f"Processed labels shape: {y_images.shape}")


Processed images shape: (923, 128, 128, 1)
Processed labels shape: (923,)


In [6]:
X_train_img, X_test_img, y_train_img, y_test_img = train_test_split(
    X_images, y_images, test_size=0.3, random_state=42
)

print(f"Training set shape: {X_train_img.shape}, {y_train_img.shape}")
print(f"Testing set shape: {X_test_img.shape}, {y_test_img.shape}")


Training set shape: (646, 128, 128, 1), (646,)
Testing set shape: (277, 128, 128, 1), (277,)


In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator  # Add this import


cnn_model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Add data augmentation
datagen = ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

datagen.fit(X_train_img)

cnn_model.fit(
    datagen.flow(X_train_img, y_train_img, batch_size=32),
    validation_data=(X_test_img, y_test_img),
    epochs=10
)


Epoch 1/10


  self._warn_if_super_not_called()


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 267ms/step - accuracy: 0.9440 - loss: 0.1111 - val_accuracy: 1.0000 - val_loss: 5.4883e-11
Epoch 2/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 251ms/step - accuracy: 1.0000 - loss: 1.2603e-11 - val_accuracy: 1.0000 - val_loss: 6.1317e-13
Epoch 3/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 270ms/step - accuracy: 1.0000 - loss: 1.9849e-15 - val_accuracy: 1.0000 - val_loss: 3.1367e-13
Epoch 4/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 264ms/step - accuracy: 1.0000 - loss: 3.4418e-12 - val_accuracy: 1.0000 - val_loss: 2.8711e-13
Epoch 5/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 254ms/step - accuracy: 1.0000 - loss: 6.4748e-13 - val_accuracy: 1.0000 - val_loss: 2.8394e-13
Epoch 6/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 254ms/step - accuracy: 1.0000 - loss: 7.3937e-14 - val_accuracy: 1.0000 - val_loss: 2.8356

<keras.src.callbacks.history.History at 0x1db903676e0>

In [10]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = (cnn_model.predict(X_test_img) > 0.5).astype("int32")
print(classification_report(y_test_img, y_pred))
cm = confusion_matrix(y_test_img, y_pred)
print(f"Confusion Matrix:\n{cm}")


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       277

    accuracy                           1.00       277
   macro avg       1.00      1.00      1.00       277
weighted avg       1.00      1.00      1.00       277

Confusion Matrix:
[[277]]


