<a href="https://colab.research.google.com/github/Robin39-AFS/Automated-Pneumonia-Detection-in-Chest-X-rays-Using-Machine-Learning/blob/main/Automated_Pneumonia_Detection_in_Chest_X_rays_Using_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Checking Python version
# Runs the `python` executable found on the shell PATH (via the `!` shell escape) and prints its version.
# NOTE(review): presumably this is the same interpreter the notebook kernel uses — confirm if it matters.
!python --version

Python 3.11.12


In [None]:
# Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Model selection and evaluation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [None]:
# Mount Google Drive at /content/drive so files stored under MyDrive can be read by later cells.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime — this cell
# will need replacing when the notebook is run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Unzip dataset
# Extracts the chest X-ray archive from the mounted Drive into /content.
import zipfile

# NOTE(review): the Drive folder name in this path ends with a space before the final '/'
# ("...Chest X-rays /chest_xray.zip") — presumably it matches the real folder name; verify.
zip_path = '/content/drive/MyDrive/Automated Pneumonia Detection in Chest X-rays /chest_xray.zip'
# Destination directory; later cells read from /content/chest_xray/...
unzip_path = '/content'

# The context manager closes the archive once every member has been extracted.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(unzip_path)

print("Unzipped successfully.")


Unzipped successfully.


In [None]:
import numpy as np
import os
from PIL import Image

# Define image size for feature extraction (128x128 pixels)
image_size = (128, 128)

def load_images_from_folder(folder, size=None):
    """Load the grayscale images stored under ``folder`` together with their labels.

    ``folder`` must contain the sub-directories ``NORMAL`` and ``PNEUMONIA``.
    Every regular, non-hidden file inside them is opened with PIL, converted
    to single-channel grayscale (mode ``'L'``), resized, and divided by 255 so
    pixel values lie in [0, 1].

    Parameters
    ----------
    folder : str
        Directory holding the ``NORMAL`` and ``PNEUMONIA`` sub-directories.
    size : tuple of int, optional
        Target size passed to ``Image.resize``. Defaults to the module-level
        ``image_size``.

    Returns
    -------
    images : numpy.ndarray
        One pixel array per loaded image, stacked along the first axis.
    labels : numpy.ndarray
        ``0`` for images from ``NORMAL`` and ``1`` for images from
        ``PNEUMONIA``, in the same order as ``images``.

    Raises
    ------
    FileNotFoundError
        If either class sub-directory does not exist (raised by ``os.listdir``).
    """
    target_size = image_size if size is None else size
    images = []
    labels = []
    for class_index, label in enumerate(['NORMAL', 'PNEUMONIA']):
        path = os.path.join(folder, label)
        # os.listdir returns entries in arbitrary order; sorting makes the sample
        # order (and therefore any unshuffled fold split) reproducible.
        for filename in sorted(os.listdir(path)):
            img_path = os.path.join(path, filename)
            # Skip hidden files (e.g. .DS_Store) and nested directories, which
            # are not images and would make Image.open fail.
            if filename.startswith('.') or not os.path.isfile(img_path):
                continue
            # The context manager releases the file handle as soon as the pixel
            # data has been copied into the converted image.
            with Image.open(img_path) as img:
                gray = img.convert('L').resize(target_size)  # Grayscale
            images.append(np.array(gray) / 255.0)  # Normalize [0,1]
            labels.append(class_index)
    return np.array(images), np.array(labels)

# Location of the training split inside the extracted archive
train_folder = '/content/chest_xray/train'

# Read every training image into memory together with its 0/1 class label
X_train, y_train = load_images_from_folder(folder=train_folder)

# Sanity check: report the dimensions of the image array and the label vector
print("Training data shape:", np.shape(X_train))
print("Training labels shape:", np.shape(y_train))


Training data shape: (5216, 128, 128)
Training labels shape: (5216,)


In [None]:
# Turn each 2-D image into one row vector: (n_samples, height, width) -> (n_samples, height * width)
n_train_samples = len(X_train)
X_train_flat = X_train.reshape((n_train_samples, -1))
print("Flattened feature shape:", X_train_flat.shape)


Flattened feature shape: (5216, 16384)


In [None]:
from sklearn.decomposition import PCA

# Number of principal components kept as features for the downstream models.
N_PCA_COMPONENTS = 50
# Fixed seed: with this many samples/features and so few components, PCA's default
# 'auto' solver picks the randomized SVD, whose output otherwise varies between runs.
PCA_RANDOM_STATE = 42

# Reduce dimensionality: fit the projection on the flattened training pixels only,
# then project the training set onto the top components.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=PCA_RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train_flat)

print("Shape after PCA:", X_train_pca.shape)


Shape after PCA: (5216, 50)


# **Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

# Least-squares fit against the 0/1 labels (demonstration only; a regressor is
# not a natural choice for a classification task)
lin_reg = LinearRegression().fit(X_train_pca, y_train)

# Continuous outputs on the training set, thresholded at 0.5 to obtain class labels
y_pred_lin = lin_reg.predict(X_train_pca)
y_pred_lin_class = np.where(y_pred_lin > 0.5, 1, 0).tolist()

# Fraction of training samples whose thresholded prediction matches the label
acc_lin = accuracy_score(y_true=y_train, y_pred=y_pred_lin_class)
print("Linear Regression accuracy (train):", acc_lin)


Linear Regression accuracy (train): 0.9478527607361963


# **K-Nearest Neighbors Classifier**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbour classifier fitted on the PCA features
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_pca, y_train)

# Training-set accuracy: predictions are made on the same samples the model was fitted on
y_pred_knn = knn.predict(X_train_pca)
acc_knn = accuracy_score(y_true=y_train, y_pred=y_pred_knn)
print("KNN accuracy (train):", acc_knn)


KNN accuracy (train): 0.9649156441717791


# **Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth is capped at 5 to limit overfitting; the seed is fixed for repeatable fits
tree = DecisionTreeClassifier(random_state=42, max_depth=5).fit(X_train_pca, y_train)

# Training-set accuracy of the depth-limited tree
y_pred_tree = tree.predict(X_train_pca)
acc_tree = accuracy_score(y_true=y_train, y_pred=y_pred_tree)
print("Decision Tree accuracy with limited depth (train):", acc_tree)


Decision Tree accuracy with limited depth (train): 0.9189033742331288


# **Cross-validation**

In [None]:
from sklearn.base import clone
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# The PCA projection is part of the model, so it must be re-fitted inside every
# fold. Scoring on `X_train_pca` would reuse a projection that was fitted on all
# training samples, including each fold's held-out part (preprocessing leakage).
# `clone` yields unfitted copies with the same hyper-parameters, leaving the
# already-fitted `pca`, `knn` and `tree` objects untouched for the final evaluation.

# KNN cross-validation
knn_cv = cross_val_score(make_pipeline(clone(pca), clone(knn)), X_train_flat, y_train, cv=5)
print("KNN Cross-validation accuracy:", knn_cv.mean())

# Decision Tree cross-validation
tree_cv = cross_val_score(make_pipeline(clone(pca), clone(tree)), X_train_flat, y_train, cv=5)
print("Decision Tree Cross-validation accuracy:", tree_cv.mean())


KNN Cross-validation accuracy: 0.9440183232129542
Decision Tree Cross-validation accuracy: 0.8964699896775805


# **Final Evaluation**

In [None]:
# Read the held-out test split and push it through the same preprocessing as the
# training data: flatten, then project with the PCA fitted on the training set
test_folder = '/content/chest_xray/test'
X_test, y_test = load_images_from_folder(test_folder)
X_test_flat = X_test.reshape(len(X_test), -1)
X_test_pca = pca.transform(X_test_flat)

# Predictions of the two fitted classifiers on the test features
y_test_pred_tree = tree.predict(X_test_pca)
y_test_pred_knn = knn.predict(X_test_pca)

# Fraction of test samples each model labels correctly
test_acc_tree = accuracy_score(y_test, y_test_pred_tree)
test_acc_knn = accuracy_score(y_test, y_test_pred_knn)

print("Decision Tree accuracy (test):", test_acc_tree)
print("KNN accuracy (test):", test_acc_knn)


Decision Tree accuracy (test): 0.7387820512820513
KNN accuracy (test): 0.7772435897435898
