## Import Dependencies

In [None]:
#import libraries
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np



## Load Data

In [None]:
# train.csv is expected to sit in the current working directory
file_path = os.path.join(os.curdir, 'train.csv')
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first five rows to sanity-check the load
data.head(n=5)

## EDA

In [None]:
# Check the shape of the data as a (number of rows, number of columns) tuple
data.shape

In [None]:
# Check the data type of each column (as inferred by pd.read_csv, since no
# explicit dtypes were passed when loading)
data.dtypes

In [None]:
# Count the missing entries in every column
data.isna().sum()

In [None]:
# Check basic statistics of numerical columns (with default arguments,
# describe() summarises only the numeric columns of a mixed-dtype frame)
data.describe()

In [None]:

def plot_categorical_distributions(categorical_variable, data, bar_title=None, pie_title=None, bar_bool=True, pie_bool=True):
    """Print, and optionally plot, how often each value of a column occurs.

    The value counts of ``data[categorical_variable]`` are always printed.
    A horizontal bar chart and/or a pie chart of the same counts is drawn
    depending on ``bar_bool`` / ``pie_bool``.

    Args:
        categorical_variable: Name of the column in ``data`` to summarise.
        data: DataFrame containing ``categorical_variable``. It is not modified.
        bar_title: Title of the bar chart; no title is set when ``None``.
        pie_title: Title of the pie chart; no title is set when ``None``.
        bar_bool: Draw the bar chart when true.
        pie_bool: Draw the pie chart when true.

    Returns:
        None. The function only prints and shows figures.
    """
    # Two-column frame: the category values and their frequency ('counts'),
    # ordered as value_counts() returns them (most frequent first).
    class_counts = (
        data[categorical_variable]
        .value_counts()
        .rename_axis(categorical_variable)
        .reset_index(name='counts')
    )
    print(class_counts)

    ############### Bar Chart ###############
    if bar_bool:
        _, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(
            y=categorical_variable,
            x='counts',
            data=class_counts,
            ax=ax,
            hue=categorical_variable,
            order=class_counts[categorical_variable],
        )
        # Annotate every bar with its count
        for container in ax.containers:
            ax.bar_label(container)
        if bar_title is not None:
            ax.set_title(bar_title)
        plt.tight_layout()
        plt.show()

    ############### Pie Chart ###############
    if pie_bool:
        # Share of each category, in percent of all counted rows
        percentages = class_counts['counts'] / class_counts['counts'].sum() * 100

        # Slices carry only the percentage; category names go in the legend
        _, ax = plt.subplots(figsize=(10, 6))
        ax.pie(class_counts['counts'], autopct='%1.1f%%')

        # Iterate the columns directly instead of iterrows(): iterrows()
        # coerces each row to a single dtype, which would render an integer
        # category such as 3 as "3.0" in the legend.
        labels = [
            f'{name}: {pct:.1f}%'
            for name, pct in zip(class_counts[categorical_variable], percentages)
        ]
        ax.legend(labels, loc="center left", bbox_to_anchor=(1, 0.5))

        # Equal aspect ratio ensures that pie is drawn as a circle.
        ax.axis('equal')
        if pie_title is not None:
            ax.set_title(pie_title)
        plt.tight_layout()
        plt.show()


# Anomaly classes and radiologists: print the counts and draw both charts
plot_categorical_distributions('class_name', data, 'Distribution of Anomalies', 'Distribution of Anomalies by Percentage', True, True)
plot_categorical_distributions('rad_id', data, 'Share of Labels by Radiologists', 'Share of Labels by Radiologists by Percentage', True, True)
# image_id: only print the per-image counts. The titles are never used when
# both charts are disabled, so pass None explicitly rather than `_`, which
# only exists in an interactive IPython session (it holds the last output)
# and raises NameError anywhere else.
plot_categorical_distributions('image_id', data, None, None, bar_bool=False, pie_bool=False)

In [None]:
# Histogram of every remaining column once the identifier/label columns are
# dropped, one subplot per column on a 3x3 grid with independent axes
data.drop(['image_id', 'class_name', 'rad_id'], axis=1).plot.hist(
    subplots=True,
    layout=(3, 3),
    sharex=False,
    sharey=False,
    figsize=(15, 15),
)

In [None]:
# Annotated heatmap of the pairwise correlations between the columns left
# after dropping the identifier/label columns
sns.heatmap(
    data=data.drop(['image_id', 'class_name', 'rad_id'], axis=1).corr(),
    annot=True,
)

## Data Preprocessing

### Image conversion to compatible format

In [None]:
# Convert images to JPEG, a format supported by YOLOv8

# Project-local module providing the convert_dicom_to_* helpers used below
import dicom_convert as dconv

# Folder handed to every converter as the DICOM input.
# NOTE(review): absolute, machine-specific Windows path; adjust before
# running this notebook on another machine.
dicom_folder = r"C:\Users\cassm\OneDrive\Desktop\vinbigdata-chest-xray-abnormalities-detection\train"

In [None]:
# To convert to JPEG
# NOTE(review): presumably writes one JPEG per DICOM file found in
# dicom_folder into output_folder; the helper is not visible here, confirm
# in dicom_convert. output_folder is a machine-specific absolute path.
output_folder = r"C:\Users\cassm\OneDrive\Desktop\vinbigdata-chest-xray-abnormalities-detection\images_jpg"
dconv.convert_dicom_to_jpeg(dicom_folder, output_folder)

In [None]:
# To convert to PNG
# NOTE(review): presumably writes one PNG per DICOM file found in
# dicom_folder into output_folder; the helper is not visible here, confirm
# in dicom_convert. This rebinds the global output_folder.
output_folder = r"C:\Users\cassm\OneDrive\Desktop\vinbigdata-chest-xray-abnormalities-detection\images_png"
dconv.convert_dicom_to_png(dicom_folder, output_folder)

In [None]:
# To convert to TIFF
# NOTE(review): presumably writes one TIFF per DICOM file found in
# dicom_folder into output_folder; the helper is not visible here, confirm
# in dicom_convert. This rebinds the global output_folder.
output_folder = r"C:\Users\cassm\OneDrive\Desktop\vinbigdata-chest-xray-abnormalities-detection\images_tiff"
dconv.convert_dicom_to_tiff(dicom_folder, output_folder)

### Data Sampling: Group classes by number of instances -> Stratify by `class_name` -> Random sampling