<a href="https://colab.research.google.com/github/Sameer438/AI-USING-EARLY-DETECTION/blob/main/Tubercolis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Importing the necessary libraries:
import cv2 as cv
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import os

In [None]:
import kagglehub

# Fetch the latest published version of the TB chest X-ray dataset through
# kagglehub and keep the directory it reports so later cells can locate the files.
path = kagglehub.dataset_download(
    "tawsifurrahman/tuberculosis-tb-chest-xray-dataset"
)

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/tawsifurrahman/tuberculosis-tb-chest-xray-dataset/versions/3


In [None]:
#Initializing the values needed for all the image files.
#The class directories are built from `path` (the folder returned by kagglehub.dataset_download)
#rather than a hard-coded '/kaggle/input/...' location, which does not match the download path printed above.
normaldir = os.path.join(path, 'TB_Chest_Radiography_Database', 'Normal')
tbdir = os.path.join(path, 'TB_Chest_Radiography_Database', 'Tuberculosis')
images = []      #image arrays, filled in by the loading cell
labels = []      #one class label per image: 0 = normal, 1 = TB
imagesize = 256  #side length in pixels that every image is resized to

In [None]:
import cv2 as cv
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import os
import kagglehub

# Download latest version
path = kagglehub.dataset_download("tawsifurrahman/tuberculosis-tb-chest-xray-dataset")

print("Path to dataset files:", path)

#Initializing the values needed for all the image files
#The class directories are derived from the download location reported by kagglehub
normaldir = os.path.join(path, 'TB_Chest_Radiography_Database', 'Normal')
tbdir = os.path.join(path, 'TB_Chest_Radiography_Database', 'Tuberculosis')
imagesize = 256


def _load_class_images(directory, label, size):
    """Read every image in `directory` as grayscale and resize it to size x size.

    Parameters:
        directory: folder whose entries are read with cv.imread.
        label: class label recorded once for every image that loads.
        size: side length in pixels of the resized square image.

    Returns:
        (loaded, targets): two parallel lists, the resized image arrays and
        one copy of `label` per loaded image.

    Entries that cv.imread cannot decode (it returns None for them) are
    reported and skipped instead of crashing inside cv.resize.
    """
    loaded, targets = [], []
    #os.listdir gives no ordering guarantee; sorting keeps the sample order (and therefore the seeded train/test split) the same on every machine
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        image = cv.imread(filepath, cv.IMREAD_GRAYSCALE)
        if image is None:
            print("Skipping unreadable file:", filepath)
            continue
        loaded.append(cv.resize(image, (size, size)))
        targets.append(label)
    return loaded, targets


#Storing all the images in the 'images' list and corresponding them to either 1 for TB images or 0 for normal images.
#Both lists are rebuilt from scratch so re-running this cell never duplicates samples.
images = []
labels = []
for directory, label in ((normaldir, 0), (tbdir, 1)):
    class_images, class_labels = _load_class_images(directory, label, imagesize)
    images.extend(class_images)
    labels.extend(class_labels)

Path to dataset files: /root/.cache/kagglehub/datasets/tawsifurrahman/tuberculosis-tb-chest-xray-dataset/versions/3


In [None]:
#Turning the Python lists into NumPy arrays so they can be split, rescaled and reshaped as whole arrays
images = np.array(images)
labels = np.array(labels)

#Holding out 30% of the samples for testing; the fixed random_state makes the shuffle repeatable
imagetrain, imagetest, labeltrain, labeltest = train_test_split(
    images,
    labels,
    test_size=0.3,
    random_state=42,
)

#Normalizing both image sets from the 0-255 scale to 0-1 float32 values
imagetrain, imagetest = (
    split.astype('float32') / 255 for split in (imagetrain, imagetest)
)

In [None]:
#Flattening the image array into 2D (making it [number of training images] x [all the pixels of the image in just one 1D array]) to be suitable for SMOTE oversampling.
#The row count comes from the array itself instead of a hard-coded 2940, so this still works if the dataset, the loader or test_size changes.
imagetrain = imagetrain.reshape(len(imagetrain), -1)

#Performing oversampling (the fixed random_state keeps the synthetic samples repeatable)
smote = SMOTE(random_state=42)
imagetrain, labeltrain = smote.fit_resample(imagetrain, labeltrain)

#Unflattening the images now to use them for convolutional neural network (imagesize x imagesize images, with 1 color channel (grayscale, as compared to RGB with 3 color channels))
imagetrain = imagetrain.reshape(-1, imagesize, imagesize, 1)
print(imagetrain.shape)

(4914, 256, 256, 1)


In [None]:
#Sanity check after oversampling: list each label value next to how many training samples carry it
label_values, label_counts = np.unique(labeltrain, return_counts=True)
print((label_values, label_counts))

(array([0, 1]), array([2457, 2457]))


In [None]:
#Importing the necessary libraries
import tensorflow as tf
import keras
from keras import layers
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

In [None]:
#CNN architecture: three convolution stages (16, 32 and 64 filters of size 3x3 with ReLU), each followed by 2x2 max pooling to summarize the features found by that stage
feature_extractor = [
    layer
    for filters in (16, 32, 64)
    for layer in (
        Conv2D(filters, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
    )
]

#Classification head: flatten the feature maps, one 64-unit ReLU dense layer, 50% dropout against overfitting, then a single sigmoid unit that outputs a value between 0 and 1
classifier_head = [
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
]

cnn = keras.Sequential(
    [
        #Input layer, same shape as all the images (imagesize x imagesize x 1 grayscale channel)
        keras.Input(shape=(imagesize, imagesize, 1)),
        *feature_extractor,
        *classifier_head,
    ]
)

In [None]:
#Compiling the model: binary cross-entropy loss for the two-class problem, Adam starting at a learning rate of 0.001, and accuracy reported as the metric
adam = keras.optimizers.Adam(learning_rate=0.001)
cnn.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
#Learning-rate schedule: ReduceLROnPlateau monitors the training accuracy and lowers the learning rate when that accuracy stops improving
#patience=1 means a single epoch without improvement already triggers the reduction, since only 10 epochs are used
#factor=0.1 means each reduction multiplies the learning rate by 0.1, and min_lr keeps it from going below 0.00001
from keras.callbacks import ReduceLROnPlateau
reduce_lr = ReduceLROnPlateau(
    monitor='accuracy',
    factor=0.1,
    patience=1,
    min_lr=0.00001,
    verbose=1,
)

#Fitting the model w/ the callback: 10 epochs, batch size of 16, one summary line per epoch (verbose=2). How long an epoch takes depends on the hardware available.
cnn.fit(
    imagetrain,
    labeltrain,
    batch_size=16,
    epochs=10,
    verbose=2,
    callbacks=[reduce_lr],
)

Epoch 1/10
308/308 - 307s - 998ms/step - accuracy: 0.8555 - loss: 0.3403 - learning_rate: 0.0010
Epoch 2/10
308/308 - 304s - 987ms/step - accuracy: 0.9501 - loss: 0.1397 - learning_rate: 0.0010
Epoch 3/10
308/308 - 323s - 1s/step - accuracy: 0.9742 - loss: 0.0805 - learning_rate: 0.0010
Epoch 4/10
308/308 - 289s - 939ms/step - accuracy: 0.9803 - loss: 0.0650 - learning_rate: 0.0010
Epoch 5/10
308/308 - 285s - 924ms/step - accuracy: 0.9847 - loss: 0.0437 - learning_rate: 0.0010
Epoch 6/10

Epoch 6: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
308/308 - 328s - 1s/step - accuracy: 0.9823 - loss: 0.0531 - learning_rate: 0.0010
Epoch 7/10
308/308 - 316s - 1s/step - accuracy: 0.9912 - loss: 0.0275 - learning_rate: 1.0000e-04
Epoch 8/10
308/308 - 327s - 1s/step - accuracy: 0.9955 - loss: 0.0156 - learning_rate: 1.0000e-04
Epoch 9/10

Epoch 9: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
308/308 - 317s - 1s/step - accuracy: 0.9949 - loss: 0.0136 - 

<keras.src.callbacks.history.History at 0x7ea2604abdc0>

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

#Loss and accuracy of the trained model on the held-out test images
print('TESTING DATA:')
cnn.evaluate(imagetest, labeltest, batch_size=32, verbose=2)

#Per-class metrics: sigmoid outputs above 0.5 are counted as class 1 (TB), everything else as class 0 (normal)
print('ADVANCED TESTING METRICS:')
predictions = cnn.predict(imagetest, batch_size=32)
predicted_labels = (predictions > 0.5).astype('int32')
for metric in (classification_report, confusion_matrix):
    print(metric(labeltest, predicted_labels))

TESTING DATA:
40/40 - 23s - 563ms/step - accuracy: 0.9881 - loss: 0.1021
ADVANCED TESTING METRICS:
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 583ms/step
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1043
           1       0.96      0.97      0.97       217

    accuracy                           0.99      1260
   macro avg       0.98      0.98      0.98      1260
weighted avg       0.99      0.99      0.99      1260

[[1035    8]
 [   7  210]]
