# Step 1: Install Required Libraries

In [None]:
!pip install requests pandas tensorflow scikit-learn opencv-python tqdm pydicom


# Step 2: Import Necessary Libraries

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tqdm.auto import tqdm
import cv2
from PIL import Image
from pydicom import dcmread
import requests
import json


# Step 3: Clinical Data Collection

In [None]:
# API URL
url = "https://api.gdc.cancer.gov/cases"


# API Parameters
params = {
    "filters": json.dumps({
        "op": "in",
        "content": {
            "field": "project.project_id",
            "value": ["TCGA-BRCA"]
        }
    }),
    "fields": "case_id,demographic.gender,demographic.race,diagnoses.age_at_diagnosis",
    "size": "2000"
}

# Fetch data from the API
response = requests.get(url, params=params)
if response.status_code == 200:
    clinical_data = pd.json_normalize(response.json()["data"]["hits"])
    print(f"Retrieved {len(clinical_data)} clinical records.")
    print(clinical_data.head())  # Display the first few rows
else:
    print(f"Failed to fetch clinical data: {response.status_code}")


# Step 4: Preprocess Clinical Data

In [None]:
# Extract 'age_at_diagnosis' from the nested 'diagnoses' field
clinical_data['age_at_diagnosis'] = clinical_data['diagnoses'].apply(
    lambda x: x[0]['age_at_diagnosis'] if isinstance(x, list) and len(x) > 0 else None
)

# Verify the extraction
print(clinical_data[['age_at_diagnosis']].head())

# Preprocess the clinical data
clinical_data['gender'] = LabelEncoder().fit_transform(clinical_data['demographic.gender'])
clinical_data['race'] = LabelEncoder().fit_transform(clinical_data['demographic.race'])

# Standardize numerical data
scaler = StandardScaler()
clinical_features = clinical_data[['age_at_diagnosis', 'gender', 'race']].dropna()

# Ensure no missing values in features before standardization
X_structured = scaler.fit_transform(clinical_features)

# Generate simulated labels for demonstration purposes
y_structured = np.random.randint(0, 2, len(X_structured))

# Output the processed features and labels
print(f"Processed features shape: {X_structured.shape}")
print(f"Generated labels shape: {y_structured.shape}")
print("Clinical Data Preview:\n", clinical_data.head())


# Step 5: Organize and Preprocess Imaging Data

In [62]:
import os
import pandas as pd
from pydicom import dcmread
from PIL import Image
from tqdm.auto import tqdm
import numpy as np

# Define paths
metadata_file_path = r"D:\\PROJECTS_FINAL\\Cancer Treatment Prediction\\final stuff\\manifest-1732777365016\\metadata.csv"
base_dir = r"D:\\PROJECTS_FINAL\\Cancer Treatment Prediction\\final stuff\\manifest-1732777365016"
processed_images_dir = os.path.join(base_dir, "breast_cancer_images_png")

# Ensure output directory exists
os.makedirs(processed_images_dir, exist_ok=True)

# Load metadata
metadata = pd.read_csv(metadata_file_path)

# Update paths in the metadata to absolute paths
metadata['Absolute Path'] = metadata['File Location'].apply(lambda x: os.path.join(base_dir, x.lstrip(".\\")))

images, labels = [], []

# Process each folder listed in the metadata
for folder_path in tqdm(metadata['Absolute Path'], desc="Processing DICOM folders"):
    try:
        if not os.path.exists(folder_path):
            print(f"Folder not found: {folder_path}, skipping.")
            continue

        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)

            if not file_name.endswith(".dcm"):
                continue

            try:
                dicom = dcmread(file_path)
                if 'PixelData' not in dicom:
                    print(f"No PixelData in {file_path}, skipping.")
                    continue

                pixel_array = dicom.pixel_array
                img = Image.fromarray(pixel_array)
                img.save(os.path.join(processed_images_dir, f"{os.path.basename(folder_path)}_{file_name.replace('.dcm', '.png')}"))

                images.append(np.array(img))
                labels.append(0)

            except Exception as e:
                print(f"Error processing file {file_path}: {e}")

    except Exception as e:
        print(f"Error processing folder {folder_path}: {e}")

# Convert to numpy arrays
X_images = np.array(images).reshape(-1, images[0].shape[0], images[0].shape[1], 1)
y_images = np.array(labels)

print(f"Processed {len(X_images)} images.")
print(f"Processed {len(y_images)} labels.")


Processing DICOM folders:   0%|          | 0/20 [00:00<?, ?it/s]

Processed 6852 images.
Processed 6852 labels.
Label distribution in y_images: {0: 6852}


In [None]:
import cv2
import matplotlib.pyplot as plt
from skimage import measure, color

# Dummy tumor segmentation logic
def segment_tumor(image):
    _, binary_image = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
    labeled_image = measure.label(binary_image, connectivity=2)
    regions = measure.regionprops(labeled_image)
    if regions:
        tumor_region = max(regions, key=lambda r: r.area)
        minr, minc, maxr, maxc = tumor_region.bbox
        image_with_box = cv2.rectangle(image.copy(), (minc, minr), (maxc, maxr), (255, 0, 0), 3)
        return image_with_box, tumor_region.bbox
    return image, None

# Apply tumor segmentation
segmented_images = []
for img in X_images:
    img = img.squeeze()  # Remove single channel for processing
    segmented_image, bbox = segment_tumor(img)
    segmented_images.append(segmented_image)

# Display an example
plt.imshow(segmented_images[0], cmap='gray')
plt.title("Tumor Segmentation Example")
plt.show()

# Step 6: Convert Data to Numpy Arrays

In [54]:
X_images = np.array(images).reshape(-1, IMG_SIZE[0], IMG_SIZE[1], 1)
y_images = np.array(labels)
print(f"Processed {len(X_images)} images.")
print(f"Processed images shape: {X_images.shape}")
print(f"Processed labels shape: {y_images.shape}")

Processed 6852 images.
Processed images shape: (6852, 128, 128, 1)
Processed labels shape: (6852,)


# Step 7: Split Data into Training and Testing Sets

In [55]:
from sklearn.model_selection import train_test_split

X_train_img, X_test_img, y_train_img, y_test_img = train_test_split(
    X_images, y_images, test_size=0.3, random_state=42
)

print(f"Training set shape: {X_train_img.shape}, {y_train_img.shape}")
print(f"Testing set shape: {X_test_img.shape}, {y_test_img.shape}")


Training set shape: (4796, 128, 128, 1), (4796,)
Testing set shape: (2056, 128, 128, 1), (2056,)


# Step 8: Build and Train CNN Model

In [56]:
cnn_model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_train_img, y_train_img, epochs=10, batch_size=32, validation_split=0.2)


  super().__init__(


Epoch 1/10
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 257ms/step - accuracy: 0.9597 - loss: 0.0372 - val_accuracy: 1.0000 - val_loss: 1.2406e-25
Epoch 2/10
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 244ms/step - accuracy: 1.0000 - loss: 5.6670e-21 - val_accuracy: 1.0000 - val_loss: 1.2394e-25
Epoch 3/10
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 254ms/step - accuracy: 1.0000 - loss: 8.7984e-19 - val_accuracy: 1.0000 - val_loss: 1.2394e-25
Epoch 4/10
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 261ms/step - accuracy: 1.0000 - loss: 1.0177e-18 - val_accuracy: 1.0000 - val_loss: 1.2394e-25
Epoch 5/10
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 273ms/step - accuracy: 1.0000 - loss: 7.7089e-18 - val_accuracy: 1.0000 - val_loss: 1.2394e-25
Epoch 6/10
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 255ms/step - accuracy: 1.0000 - loss: 3.9685e-19 - val_accura

KeyboardInterrupt: 

# Step 9: Evaluate the Model

In [44]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = (cnn_model.predict(X_test_img) > 0.5).astype("int32")
print(classification_report(y_test_img, y_pred))
cm = confusion_matrix(y_test_img, y_pred)
print(f"Confusion Matrix:\n{cm}")


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       277

    accuracy                           1.00       277
   macro avg       1.00      1.00      1.00       277
weighted avg       1.00      1.00      1.00       277

Confusion Matrix:
[[277]]


