In [5]:
pip install kagglehub

Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
import kagglehub

# Download the IQ-OTH/NCCD dataset; kagglehub hands back the local directory
# that holds the downloaded files.
path = kagglehub.dataset_download("adityamahimkar/iqothnccd-lung-cancer-dataset")
print("Path to dataset files:", path)

# Show what sits at the top level of the downloaded directory
top_level_entries = os.listdir(path)
print("Contents of the directory:", top_level_entries)

Path to dataset files: C:\Users\HP\.cache\kagglehub\datasets\adityamahimkar\iqothnccd-lung-cancer-dataset\versions\2
Contents of the directory: ['Test cases', 'The IQ-OTHNCCD lung cancer dataset']


In [7]:
import os

# Reuse the directory returned by kagglehub in the download cell instead of a
# machine-specific absolute path, so the notebook also runs on other machines.
# NOTE: requires the download cell (which defines `path`) to have been run.
DATASET_PATH = path
subfolders = os.listdir(os.path.join(DATASET_PATH, "The IQ-OTHNCCD lung cancer dataset"))

print("Contents of 'The IQ-OTHNCCD lung cancer dataset':", subfolders)

Contents of 'The IQ-OTHNCCD lung cancer dataset': ['The IQ-OTHNCCD lung cancer dataset']


In [8]:
import os

# Define base dataset path. It is taken from the kagglehub download cell
# (`path`) rather than hard-coded, so it is valid on any machine.
BASE_PATH = path
# The archive nests a folder of the same name inside the top-level folder.
NESTED_PATH = os.path.join(BASE_PATH, "The IQ-OTHNCCD lung cancer dataset", "The IQ-OTHNCCD lung cancer dataset")

# List contents of the nested folder
print("Final dataset folder contents:", os.listdir(NESTED_PATH))

Final dataset folder contents: ['Bengin cases', 'IQ-OTH_NCCD lung cancer dataset.txt', 'Malignant cases', 'Normal cases']


In [13]:
import os
import cv2
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from tensorflow.keras.utils import to_categorical

# Dataset Paths
# The class folders live two levels down, inside a folder that repeats the
# dataset name. The root comes from the kagglehub download cell (`path`)
# instead of a hard-coded, user-specific absolute path.
RAW_DATASET_PATH = os.path.join(
    path,
    "The IQ-OTHNCCD lung cancer dataset",
    "The IQ-OTHNCCD lung cancer dataset",
)
CLEANED_DATASET_PATH = "cleaned_dataset"
IMG_SIZE = 224  # images are resized to IMG_SIZE x IMG_SIZE pixels

# Class categories. The list position is used as the integer label.
# "Bengin" is the folder name as shipped with the dataset; do not correct it.
CATEGORIES = ["Bengin cases", "Malignant cases", "Normal cases"]

# Ensure output directory exists
os.makedirs(CLEANED_DATASET_PATH, exist_ok=True)

def process_and_save_images():
    """Resize every readable image, save a JPEG copy, and collect the arrays.

    For each entry of ``CATEGORIES`` the files found in
    ``RAW_DATASET_PATH/<category>`` are read with OpenCV, resized to
    ``IMG_SIZE`` x ``IMG_SIZE`` and written to
    ``CLEANED_DATASET_PATH/<category>/<original stem>.jpg``.

    Files that OpenCV cannot decode are skipped. Any other per-file error is
    printed and processing continues with the next file.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(data, labels)``. ``data`` holds the
        resized images in RGB channel order as float32 scaled to [0, 1];
        ``labels`` holds, for each image, the index of its category within
        ``CATEGORIES``.
    """
    data, labels = [], []
    for label, category in enumerate(CATEGORIES):
        input_folder = os.path.join(RAW_DATASET_PATH, category)
        output_folder = os.path.join(CLEANED_DATASET_PATH, category)
        os.makedirs(output_folder, exist_ok=True)

        print(f"Processing {category}...")

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # sample order (and so any seeded split of the result) reproducible.
        for img_name in tqdm(sorted(os.listdir(input_folder))):
            img_path = os.path.join(input_folder, img_name)

            try:
                img_bgr = cv2.imread(img_path)
                if img_bgr is None:
                    # Not a decodable image (e.g. a stray text file): skip it.
                    continue

                img_bgr = cv2.resize(img_bgr, (IMG_SIZE, IMG_SIZE))

                # cv2.imwrite expects BGR data, so the copy on disk must be
                # written BEFORE converting to RGB; otherwise the saved file
                # has its red and blue channels swapped.
                output_img_path = os.path.join(output_folder, os.path.splitext(img_name)[0] + ".jpg")
                if not cv2.imwrite(output_img_path, img_bgr):
                    print(f"Warning: could not write {output_img_path}")

                # The in-memory array returned to the caller is RGB in [0, 1].
                img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
                data.append(img_rgb.astype("float32") / 255.0)
                labels.append(label)

            except Exception as e:
                # Best effort: report the failing file and keep going.
                print(f"Error processing {img_name}: {e}")

    return np.array(data), np.array(labels)

# Process images
data, labels = process_and_save_images()

# Ensure images were processed
if len(data) == 0:
    raise RuntimeError("No images processed! Check dataset path.")

# Split dataset (70% train, 10% validation, 20% test):
# 20% is held out for testing first, then 12.5% of the remaining 80%
# (= 10% of the full dataset) becomes the validation set.
# Both splits are stratified so every subset keeps the class proportions.
x_temp, x_test, y_temp, y_test = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)
x_train, x_valid, y_train, y_valid = train_test_split(x_temp, y_temp, test_size=0.125, stratify=y_temp, random_state=42)

# SMOTE resampling for balancing dataset. It is applied to the training split
# only, so validation and test data stay untouched. The images are flattened
# to one row per sample for SMOTE and reshaped back to images afterwards.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train.reshape(-1, IMG_SIZE * IMG_SIZE * 3), y_train)
x_train_resampled = x_train_resampled.reshape(-1, IMG_SIZE, IMG_SIZE, 3)

# Convert integer labels to one-hot vectors (3 classes)
y_train_resampled = to_categorical(y_train_resampled, num_classes=3)
y_valid = to_categorical(y_valid, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)

# Save preprocessed data (written to the current working directory)
np.save("x_train.npy", x_train_resampled)
np.save("y_train.npy", y_train_resampled)
np.save("x_valid.npy", x_valid)
np.save("y_valid.npy", y_valid)
np.save("x_test.npy", x_test)
np.save("y_test.npy", y_test)

# The cleaned JPEGs go to CLEANED_DATASET_PATH; the .npy arrays do not.
print(f"Preprocessing complete! Images saved in {CLEANED_DATASET_PATH}/, arrays saved as .npy files in the working directory.")


Processing Bengin cases...


100%|██████████| 120/120 [00:01<00:00, 74.09it/s]


Processing Malignant cases...


100%|██████████| 561/561 [00:07<00:00, 70.65it/s]


Processing Normal cases...


100%|██████████| 416/416 [00:05<00:00, 73.33it/s]


Preprocessing complete! Data saved in cleaned_dataset/


In [15]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Restore the saved Keras model from disk
from tensorflow.keras.models import load_model
model = load_model("C:\\Users\\HP\\Desktop\\Workspace\\lung_cancer_densenet169.h5")

# Collapse the per-class scores and the one-hot targets to integer class ids.
# x_test / y_test come from the preprocessing cell.
test_scores = model.predict(x_test)
y_pred = np.argmax(test_scores, axis=1)
y_true = np.argmax(y_test, axis=1)

# Per-class precision / recall / F1
class_names = ["Benign", "Malignant", "Normal"]
report_text = classification_report(y_true, y_pred, target_names=class_names)
print(report_text)

# Confusion matrix as returned by sklearn for (y_true, y_pred)
conf_mat = confusion_matrix(y_true, y_pred)
print(conf_mat)




[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2s/step
              precision    recall  f1-score   support

      Benign       0.95      0.79      0.86        24
   Malignant       1.00      1.00      1.00       113
      Normal       0.94      0.99      0.96        83

    accuracy                           0.97       220
   macro avg       0.96      0.93      0.94       220
weighted avg       0.97      0.97      0.97       220

[[ 19   0   5]
 [  0 113   0]
 [  1   0  82]]


In [16]:

# Score the model on the held-out test split; the result unpacks into the
# loss and the accuracy metric.
test_metrics = model.evaluate(x_test, y_test)
test_loss, test_acc = test_metrics
print(f"Test Accuracy: {test_acc:.2f}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1s/step - accuracy: 0.9613 - loss: 0.1060
Test Accuracy: 0.97


In [17]:

# Score the model on the validation split. Dedicated names keep the test-set
# results (test_loss / test_acc) from being overwritten, and the printed label
# now matches the data that was evaluated.
valid_loss, valid_acc = model.evaluate(x_valid, y_valid)
print(f"Validation Accuracy: {valid_acc:.2f}")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step - accuracy: 0.9854 - loss: 0.0820
Test Accuracy: 0.98
