In [41]:
import cv2
import os
import numpy as np
import matplotlib.pyplot as plt
import random

# Candidate locations of the chest X-ray training set. Only `local` is used
# by the cells below; the two Kaggle paths are kept for reference.
kaggle_directory1 = '/kaggle/input/datascienceproject4/chest_xray/train'
kaggle_directory2 = '/kaggle/input/chest-xray-pneumonia/chest_xray/train'
local = './chest_xray/train'

# One sub-folder per class inside the training set.
normal_directory, pneumonia_directory = (
    os.path.join(local, class_folder) for class_folder in ('NORMAL', 'PNEUMONIA')
)

# Output folders written by the cleaning, edge-detection and normalization steps.
cleaned_data_directory, edgedImages_data_directory, normalizedImages_data_directory = (
    './cleaned_data',
    './edge_images',
    './normalized_images',
)

In [None]:
# Function to check for missing values in an image
def check_missing_values(image_path):
    """Report whether the image at `image_path` is unreadable or contains NaN pixels.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True when the file could not be decoded or the pixel array contains
        NaN values, False otherwise. A one-line report is printed either way.
    """
    # cv2.imread signals failure by returning None instead of raising, so an
    # unreadable file must be handled before any array operation.
    image = cv2.imread(image_path)
    if image is None:
        print(f"Could not read image (missing or corrupt file): {image_path}")
        return True

    # Count NaN entries in the decoded pixel array.
    missing_values = np.isnan(image).sum()
    if missing_values > 0:
        print(f"Missing values found in image: {image_path}")
        return True

    print(f"No missing values found in image: {image_path}")
    return False

# Run the missing-value check on every file found anywhere below `local`.
for image_path in (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(local)
    for filename in filenames
):
    check_missing_values(image_path)

In [None]:
# Function to check for missing values in an image and plot histogram
def check_missing_values_and_plot_histogram(image_path):
    """Check one grayscale image for NaN pixels and plot its intensity histogram.

    Args:
        image_path: Path of the image file to load in grayscale mode.

    Returns:
        None. Prints a one-line report and, when the image could be read,
        shows a 256-bin histogram of its pixel intensities.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # cv2.imread returns None (no exception) for a directory, a missing file
    # or an undecodable file; nothing can be checked or plotted in that case.
    if image is None:
        print(f"Could not read image (missing, unreadable, or not an image file): {image_path}")
        return

    # Check for missing values (NaN) in the decoded pixel array
    missing_values = np.isnan(image).sum()
    if missing_values > 0:
        print(f"Missing values found in image: {image_path}")
    else:
        print(f"No missing values found in image: {image_path}")

    # Plot histogram of pixel intensities (one bin per value in 0..255)
    hist = cv2.calcHist([image], [0], None, [256], [0, 256])
    plt.plot(hist, color='black')
    plt.xlabel('Pixel Intensity')
    plt.ylabel('Frequency')
    plt.title('Histogram of Pixel Intensities')
    plt.show()


In [None]:
# `local` is a directory, which cv2.imread cannot decode, so run the check on
# the first image file (by name) of the NORMAL training folder instead.
sample_image_path = next(
    os.path.join(normal_directory, name)
    for name in sorted(os.listdir(normal_directory))
    if name.lower().endswith(('.jpeg', '.jpg', '.png'))
)
check_missing_values_and_plot_histogram(sample_image_path)

In [None]:
# An image histogram is a type of histogram that acts as a
# graphical representation of the tonal distribution in a
# digital image. It plots the number of pixels for each tonal value.
# By looking at the histogram for a specific image, a viewer can
# judge the entire tonal distribution at a glance.
# The horizontal axis of the graph represents the tonal variations,
# while the vertical axis represents the total number of pixels
# in that particular tone. (Source: Wikipedia, "Image histogram".)

**Combined Histogram**

In [None]:
# Function to accumulate histograms of all images in a directory
def accumulate_histograms(directory):
    """Sum the 256-bin grayscale histograms of every readable image under `directory`.

    Args:
        directory: Root folder; all files in it and its sub-folders are tried.

    Returns:
        A (256, 1) float64 array of summed pixel counts per intensity value.
        The array is all zeros when no readable image is found.
    """
    # A float64 running total keeps large pixel counts exact and gives an
    # empty directory a well-formed (256, 1) result instead of a scalar.
    combined_hist = np.zeros((256, 1), dtype=np.float64)

    for dirname, _, filenames in os.walk(directory):
        for filename in filenames:
            image_path = os.path.join(dirname, filename)
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

            # cv2.imread returns None for files it cannot decode; skip them
            # rather than letting calcHist fail on a None input.
            if image is None:
                continue

            combined_hist += cv2.calcHist([image], [0], None, [256], [0, 256])

    return combined_hist

# Function to plot a histogram
def plot_histogram(hist, data_name):
    """Draw `hist` as a black line plot titled with the data set name.

    Args:
        hist: Histogram values to plot, one entry per pixel intensity.
        data_name: Data set label (a string) inserted into the plot title.
    """
    heading = 'Histogram of Pixel Intensities' + ' of ' + data_name + ' data set'

    plt.plot(hist, color='black')
    plt.title(heading)
    plt.ylabel('Frequency')
    plt.xlabel('Pixel Intensity')
    plt.show()

# Build and plot one combined histogram per class folder, NORMAL first.
for class_directory, class_label in (
    (normal_directory, 'NORMAL'),
    (pneumonia_directory, 'PNEUMONIA'),
):
    # Sum the histograms of every image of this class, then plot the total.
    combined_hist = accumulate_histograms(class_directory)
    plot_histogram(combined_hist, class_label)


In [None]:
# Define the subdirectories for normal and pneumonia-infected individuals
# normal_directory = os.path.join(data_directory, 'NORMAL')
# pneumonia_directory = os.path.join(data_directory, 'PNEUMONIA')

# Define the default number of sample images to display from each category
num_samples = 5

# Function to display sample images
def display_sample_images(directory, label, count=None):
    """Show the first readable images of `directory` side by side.

    Args:
        directory: Folder whose files are listed (not recursive).
        label: Category name used in the printed heading.
        count: Number of panels to draw; defaults to the module-level
            `num_samples` when omitted.
    """
    if count is None:
        count = num_samples

    print(f"Sample images for {label} category:")

    # squeeze=False keeps `axes` two-dimensional even for a single panel.
    fig, axes = plt.subplots(1, count, figsize=(15, 3), squeeze=False)
    panels = axes[0]

    # Hide the frame of every panel up front so that panels left empty
    # (fewer readable images than requested) do not show bare ticks.
    for panel in panels:
        panel.axis('off')

    shown = 0
    # Sorted listing makes the selection reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if shown == count:
            break
        image = cv2.imread(os.path.join(directory, filename))
        if image is None:
            # cv2.imread returns None for files it cannot decode; skip them.
            continue
        # OpenCV loads colour images as BGR; matplotlib expects RGB.
        panels[shown].imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        shown += 1

    plt.show()

# Show a row of example images for each class, NORMAL first.
for sample_directory, sample_label in (
    (normal_directory, 'NORMAL'),
    (pneumonia_directory, 'PNEUMONIA'),
):
    display_sample_images(sample_directory, sample_label)


**Data Cleaning**

In [None]:
import os
from PIL import Image
import imagehash
import shutil

# Directory that receives a working copy of the training data; all cleaning
# steps operate on this copy so the original data set is never modified.
cleaned_data_directory = './cleaned_data'

# shutil.copytree raises FileExistsError when the destination already exists,
# which made this cell fail on every re-run. Copy only when the working copy
# is not there yet; delete the folder manually to start from a fresh copy.
if not os.path.exists(cleaned_data_directory):
    shutil.copytree(local, cleaned_data_directory)

In [None]:
def count_files_in_directories(directory):
    """Print the number of files in `directory` and in each folder below it.

    Args:
        directory: Root folder to walk.

    Prints "Directory not found." when `directory` is not an existing folder.
    """
    # os.walk silently yields nothing for a missing directory (its errors are
    # ignored by default), so the check has to be explicit.
    if not os.path.isdir(directory):
        print("Directory not found.")
        return

    for subdir, _, files in os.walk(directory):
        print(f"Directory: {subdir}, Number of files: {len(files)}")

# Function to remove duplicate images
def remove_duplicates(directory):
    """Delete images whose average hash matches an earlier image under `directory`.

    Files are visited in sorted order, so the first file (by folder, then by
    name) of each group of equal hashes is kept and the rest are deleted.
    Hashes are compared across all sub-folders, not per folder.

    Args:
        directory: Root folder to scan recursively.

    Returns:
        List of the file paths that were deleted.
    """
    hash_dict = {}
    duplicates = []

    for root, dirnames, filenames in os.walk(directory):
        # Sorting in place fixes the traversal order, which makes the choice
        # of which duplicate survives reproducible.
        dirnames.sort()
        for filename in sorted(filenames):
            image_path = os.path.join(root, filename)
            try:
                with Image.open(image_path) as img:
                    hash_value = str(imagehash.average_hash(img))
            except OSError:
                # Not a decodable image (e.g. a stray metadata file): it
                # cannot be a duplicate image, so leave it untouched.
                continue
            if hash_value in hash_dict:
                duplicates.append(image_path)
            else:
                hash_dict[hash_value] = image_path

    # Remove duplicate images only after the scan is complete
    for duplicate in duplicates:
        os.remove(duplicate)

    return duplicates

# Function to remove samples with missing images
def remove_missing_samples(directory):
    """Delete directory entries under `directory` that hold no image data.

    An entry counts as missing when it no longer resolves to a file (for
    example a dangling symbolic link) or when the file is empty (0 bytes).

    Args:
        directory: Root folder to scan recursively.

    Returns:
        List of the paths that were deleted.
    """
    removed = []

    for root, _, filenames in os.walk(directory):
        for filename in filenames:
            image_path = os.path.join(root, filename)
            # os.walk only lists entries that exist, so the existence test
            # alone can match nothing but dangling links; the size test is
            # what catches regular files that contain no data.
            if not os.path.exists(image_path) or os.path.getsize(image_path) == 0:
                os.remove(image_path)
                removed.append(image_path)

    return removed

In [52]:
# Baseline file counts of the working copy before any cleaning.
print("Count of files BEFORE removing missing sample and duplicates")
count_files_in_directories(cleaned_data_directory)

# Apply each cleaning step to the working copy and report the counts after it.
for cleaning_step, summary in (
    (remove_duplicates, "Count of files after removing duplicates"),
    (remove_missing_samples, "Count of files after removing missing sample"),
):
    cleaning_step(cleaned_data_directory)
    print(summary)
    count_files_in_directories(cleaned_data_directory)


Count of files BEFORE removing missing sample and duplicates
Directory: ./cleaned_data, Number of files: 0
Directory: ./cleaned_data\NORMAL, Number of files: 1260
Directory: ./cleaned_data\PNEUMONIA, Number of files: 3702
Count of files after removing duplicates
Directory: ./cleaned_data, Number of files: 0
Directory: ./cleaned_data\NORMAL, Number of files: 1260
Directory: ./cleaned_data\PNEUMONIA, Number of files: 3702
Count of files after removing missing sample
Directory: ./cleaned_data, Number of files: 0
Directory: ./cleaned_data\NORMAL, Number of files: 1260
Directory: ./cleaned_data\PNEUMONIA, Number of files: 3702


In [None]:
import random

# Function to apply Adaptive Histogram Equalization (AHE) to an image
def apply_ahe(image):
    """Equalize `image` tile by tile without contrast limiting (plain AHE).

    Args:
        image: Image array; arrays with more than two dimensions are treated
            as BGR and converted to grayscale first.

    Returns:
        The equalized single-channel image.
    """
    # Convert image to grayscale if not already
    if len(image.shape) > 2:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # OpenCV exposes adaptive equalization only through CLAHE. With the
    # clipLimit of 2.0 used before, this function was identical to
    # apply_clahe, so the AHE/CLAHE comparison showed the same image twice.
    # NOTE(review): assumes 8-bit input, where OpenCV scales the limit to
    # clipLimit * tile_pixels / 256; 256.0 then equals the tile's pixel count,
    # which no histogram bin can exceed, so clipping never triggers — confirm
    # against the installed OpenCV version.
    no_clipping_limit = 256.0
    ahe = cv2.createCLAHE(clipLimit=no_clipping_limit, tileGridSize=(8, 8))
    enhanced_image = ahe.apply(image)

    return enhanced_image

# Function to apply Contrast-Limited Adaptive Histogram Equalization (CLAHE) to an image
def apply_clahe(image):
    """Return `image` after CLAHE with clip limit 2.0 on an 8x8 tile grid.

    Arrays with more than two dimensions are converted from BGR to grayscale
    before equalization; two-dimensional arrays are used as they are.
    """
    is_multichannel = len(image.shape) > 2
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if is_multichannel else image

    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    return equalizer.apply(gray)

# # List all files under the input directory
# for dirname, _, filenames in os.walk('/kaggle/working/cleaned_data'):
#     for filename in filenames:
#         image_path = os.path.join(dirname, filename)
        
#         # Load image
#         image = cv2.imread(image_path)
        
#         # Apply AHE and CLAHE
#         ahe_image = apply_ahe(image)
#         clahe_image = apply_clahe(image)
        
#         # Plot original and enhanced images
#         plt.figure(figsize=(10, 4))
#         plt.subplot(1, 3, 1)
#         plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
#         plt.title('Original Image')
#         plt.axis('off')
        
#         plt.subplot(1, 3, 2)
#         plt.imshow(ahe_image, cmap='gray')
#         plt.title('AHE')
#         plt.axis('off')
        
#         plt.subplot(1, 3, 3)
#         plt.imshow(clahe_image, cmap='gray')
#         plt.title('CLAHE')
#         plt.axis('off')
        
#         plt.show()


# Preview AHE vs. CLAHE on a few randomly chosen images of each class.
# The class folders are named in upper case (NORMAL / PNEUMONIA), so the
# names must match exactly to work on case-sensitive file systems.
for subdir in ['NORMAL', 'PNEUMONIA']:
    print(f"Random files from {subdir}:")

    # Get all file names from the current subdirectory
    files = os.listdir(os.path.join(cleaned_data_directory, subdir))

    # Randomly select up to 5 files (fewer when the folder is smaller)
    random_files = random.sample(files, min(5, len(files)))

    for filename in random_files:
        # Construct the full path to the image
        image_path = os.path.join(cleaned_data_directory, subdir, filename)

        # Load image; cv2.imread returns None for files it cannot decode
        image = cv2.imread(image_path)
        if image is None:
            print(f"Skipping unreadable file: {image_path}")
            continue

        # Apply AHE and CLAHE
        ahe_image = apply_ahe(image)
        clahe_image = apply_clahe(image)

        # Plot original and enhanced images side by side
        plt.figure(figsize=(10, 4))

        plt.subplot(1, 3, 1)
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.title('Original Image')
        plt.axis('off')

        plt.subplot(1, 3, 2)
        plt.imshow(ahe_image, cmap='gray')
        plt.title('AHE')
        plt.axis('off')

        plt.subplot(1, 3, 3)
        plt.imshow(clahe_image, cmap='gray')
        plt.title('CLAHE')
        plt.axis('off')

        plt.show()


**Edge Detection Techniques**

In [None]:
# Function to apply Canny edge detection to an image
def apply_canny_edge_detection(image, threshold1=30, threshold2=100):
    """Return the Canny edge map of `image`.

    Args:
        image: Image array; arrays with more than two dimensions are treated
            as BGR and converted to grayscale first.
        threshold1: First hysteresis threshold passed to cv2.Canny.
        threshold2: Second hysteresis threshold passed to cv2.Canny.

    Returns:
        The edge image produced by cv2.Canny.
    """
    # Convert image to grayscale if not already
    if len(image.shape) > 2:
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray_image = image

    # The defaults reproduce the previously hard-coded thresholds; callers
    # can now tune them without editing this function.
    edges = cv2.Canny(gray_image, threshold1=threshold1, threshold2=threshold2)

    return edges

# Function to list all files under a directory
def list_files(directory):
    """Return the paths of all files in `directory` and its sub-folders.

    Paths are built by joining each visited folder with the file name, in the
    order in which os.walk reports them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
    ]

# Every file of the cleaned working copy is a candidate input image.
data_files = list_files(cleaned_data_directory)

# Edge images are written below this folder, mirroring the class sub-folders.
output_dir = edgedImages_data_directory

os.makedirs(output_dir, exist_ok=True)

# Iterate over each image in the data directory
for image_path in data_files:
    # Load the image; cv2.imread returns None for files it cannot decode, and
    # edge detection on None would stop the whole run, so skip such files.
    image = cv2.imread(image_path)
    if image is None:
        print(f"Skipping unreadable file: {image_path}")
        continue

    # Apply Canny edge detection
    edges = apply_canny_edge_detection(image)

    # The parent folder name is the class label (NORMAL or PNEUMONIA)
    parent_folder = os.path.basename(os.path.dirname(image_path))

    # Create the corresponding output directory if it doesn't exist
    output_subdir = os.path.join(output_dir, parent_folder)
    os.makedirs(output_subdir, exist_ok=True)

    # Save the edge-detected image under the same file name
    filename = os.path.basename(image_path)
    output_path = os.path.join(output_subdir, filename)
    cv2.imwrite(output_path, edges)


**Image Pre-Processing - Normalization**

In [48]:
# Normalization reads the edge-detected images and writes its results to the
# normalized-images folder.
input_directory, output_directory = (
    edgedImages_data_directory,
    normalizedImages_data_directory,
)

# Make sure the destination exists before any image is written.
os.makedirs(output_directory, exist_ok=True)

# Function to resize and normalize images
def preprocess_image(image_path, output_directory, target_size=(224, 224)):
    """Resize one image, scale it to [0, 1], and save the result.

    The output is written to `output_directory/<parent folder>/<file name>`,
    where the parent folder is the folder that directly contains `image_path`.

    Args:
        image_path: Path of the image to load in grayscale mode.
        output_directory: Root folder for the preprocessed images.
        target_size: Size passed to cv2.resize.

    Returns:
        The normalized image as a float array with values in [0, 1], or None
        when the file could not be read.
    """
    # cv2.imread returns None for files it cannot decode; skip those.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print(f"Skipping unreadable file: {image_path}")
        return None

    # Resize image
    resized = cv2.resize(image, target_size)

    # Normalize pixel values to the range [0, 1]
    normalized = resized / 255.0

    # The parent folder name is the class label (NORMAL or PNEUMONIA)
    parent_folder = os.path.basename(os.path.dirname(image_path))

    # Create the corresponding output directory if it doesn't exist
    output_subdir = os.path.join(output_directory, parent_folder)
    os.makedirs(output_subdir, exist_ok=True)

    # Save THIS image. The previous code wrote the global `edges` left over
    # from the edge-detection cell, so every output file held the same
    # unrelated picture. The file is stored as 8-bit samples, so the
    # normalized values are scaled back to 0..255.
    filename = os.path.basename(image_path)
    output_path = os.path.join(output_subdir, filename)
    cv2.imwrite(output_path, np.rint(normalized * 255.0).astype(np.uint8))

    return normalized
    
# Preprocess every file found anywhere below the input directory.
for image_path in (
    os.path.join(root, filename)
    for root, _, filenames in os.walk(input_directory)
    for filename in filenames
):
    preprocess_image(image_path, output_directory)

print("Image Normalization completed.")


Image preprocessing completed.


In [None]:
# Function to display normalized images
# NOTE: a later cell defines another function with this same name, which
# replaces this one once that cell has run.
def display_normalized_images(directory, num_samples=2):
    """Show up to `num_samples` readable images found under `directory`.

    Args:
        directory: Root folder to walk; the first readable images are used.
        num_samples: Number of panels to draw in a single row.
    """
    images = []
    # Collect the first readable images in walk order
    for root, _, filenames in os.walk(directory):
        for filename in filenames:
            image_path = os.path.join(root, filename)
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread returns None for files it cannot decode; skip them.
            if image is not None:
                images.append(image)
            if len(images) == num_samples:
                break
        if len(images) == num_samples:
            break

    print("Number of images:", len(images))

    # squeeze=False keeps `axes` two-dimensional even for a single panel, and
    # indexing by the images actually found avoids an IndexError when the
    # directory holds fewer readable images than requested.
    fig, axes = plt.subplots(1, num_samples, figsize=(12, 6), squeeze=False)
    for index, ax in enumerate(axes[0]):
        ax.axis('off')
        if index < len(images):
            ax.imshow(images[index], cmap='gray')
            ax.set_title('Normalized Image {}'.format(index + 1))
    plt.show()

# Show two sample images from the normalized-images output folder as a
# quick visual check of the normalization step.
display_normalized_images(normalizedImages_data_directory, num_samples=2)


**----------------------------------------------------------------------------------------------------------------------------------------------------**

**Texture analysis involves extracting features that capture the spatial arrangement of pixel intensities in an image. Haralick texture features and Gabor filters are two common methods for texture analysis. Here is an overview of how to compute these texture features:**

In [None]:
import cv2
import matplotlib.pyplot as plt

# Load the image from the configured NORMAL training folder. The previously
# hard-coded Kaggle path does not exist when the notebook runs on the local
# data set, in which case cv2.imread returns None.
image_path = os.path.join(normal_directory, 'IM-0115-0001.jpeg')
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Display the image
plt.imshow(image, cmap='gray')
plt.axis('off')
plt.show()


In [None]:
import os
import cv2
import matplotlib.pyplot as plt

# Function to display sample images of normal and pneumonia patients
# NOTE: this redefines the function of the same name from an earlier cell.
def display_normalized_images(directory, num_samples=2):
    """Show up to two images per class from the folders under `directory`.

    A folder counts as a class folder when its path contains 'NORMAL' or
    'PNEUMONIA'. The figure is a fixed 2x2 grid (row 1 normal, row 2
    pneumonia), so at most two images per class are drawn even when
    `num_samples` is larger.

    Args:
        directory: Root folder to walk.
        num_samples: Maximum number of images collected per class.
    """
    pneumonia_images = []
    normal_images = []

    def _both_full():
        return len(normal_images) >= num_samples and len(pneumonia_images) >= num_samples

    for root, _, filenames in os.walk(directory):
        for filename in filenames:
            if 'NORMAL' in root and len(normal_images) < num_samples:
                bucket = normal_images
            elif 'PNEUMONIA' in root and len(pneumonia_images) < num_samples:
                bucket = pneumonia_images
            else:
                bucket = None
            if bucket is not None:
                image = cv2.imread(os.path.join(root, filename), cv2.IMREAD_GRAYSCALE)
                # cv2.imread returns None for files it cannot decode; such
                # files must not occupy a sample slot.
                if image is not None:
                    bucket.append(image)
            if _both_full():
                break
        # The inner break only leaves the current folder; stop walking too.
        if _both_full():
            break

    print("Number of normal images:", len(normal_images))
    print("Number of pneumonia images:", len(pneumonia_images))

    # Display the selected images; panels without an image stay blank.
    fig, axes = plt.subplots(2, 2, figsize=(10, 10))
    grid_rows = ((normal_images, 'Normal Patient'), (pneumonia_images, 'Pneumonia Patient'))
    for row, (images, caption) in enumerate(grid_rows):
        for col in range(2):
            ax = axes[row, col]
            ax.axis('off')
            if col < len(images):
                ax.imshow(images[col], cmap='gray')
                ax.set_title(f'{caption} {col + 1}')
    plt.show()

# Display sample edge-detected images of both normal and pneumonia patients.
# Uses the configured edge-image folder instead of a hard-coded Kaggle path,
# so the cell reads from the folder the edge-detection step wrote to.
display_normalized_images(edgedImages_data_directory, num_samples=2)


In [None]:
import cv2
import matplotlib.pyplot as plt

# Load the image from the configured NORMAL training folder. The previously
# hard-coded Kaggle path does not exist when the notebook runs on the local
# data set, in which case cv2.imread returns None.
image_path = os.path.join(normal_directory, 'IM-0115-0001.jpeg')
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Display the image
plt.imshow(image, cmap='gray')
plt.axis('off')
plt.show()


**compute_haralick_features**

In [None]:
import cv2
import os
import mahotas as mh

# Function to compute Haralick texture features
def compute_haralick_features(image):
    """Return the Haralick features of `image`, averaged over axis 0.

    Arrays with more than two dimensions are converted from BGR to grayscale
    first; the result of `mh.features.haralick` is then reduced with
    `.mean(axis=0)`.
    """
    needs_gray = len(image.shape) > 2
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if needs_gray else image

    per_row_features = mh.features.haralick(gray_image)
    return per_row_features.mean(axis=0)

# Select one image from the configured PNEUMONIA training folder. The
# previously hard-coded Kaggle path does not exist when the notebook runs on
# the local data set, in which case cv2.imread returns None.
image_path = os.path.join(pneumonia_directory, 'person1000_bacteria_2931.jpeg')

# Load the image and fail with a clear message when it cannot be read
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Compute Haralick texture features for the selected image
haralick_features = compute_haralick_features(image)

# Print or save the computed features as needed
print(haralick_features)


In [None]:

# Select one image from the configured NORMAL training folder. The
# previously hard-coded Kaggle path does not exist when the notebook runs on
# the local data set, in which case cv2.imread returns None.
image_path = os.path.join(normal_directory, 'IM-0122-0001.jpeg')

# Load the image and fail with a clear message when it cannot be read
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Compute Haralick texture features for the selected image
haralick_features = compute_haralick_features(image)

# Print or save the computed features as needed
print(haralick_features)
