## Exploratory Data Analysis

In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [35]:
class CTData:
    """Per-group label summary for one split of a CT label CSV.

    The CSV at ``dir`` is read with pandas. Column 0 is skipped (presumably an
    identifier column -- confirm against the dataset), and the remaining
    columns are consumed in consecutive groups of ``group_size`` columns.
    Each group is summed row-wise and named after the text that precedes the
    first ``*`` in the header of the group's first column.

    Args:
        name: Free-form label for this split (e.g. ``"train"``).
        dir: Path of the label CSV file. The name shadows the ``dir`` builtin
            but is kept so existing keyword callers keep working.
        num_groups: Number of column groups to aggregate. Defaults to 84, the
            value that was previously hard-coded.
        group_size: Number of consecutive columns per group. Defaults to 52,
            the value that was previously hard-coded.

    Attributes set on the instance:
        columns: List of group names, one per group.
        raw_data: The whole CSV as a NumPy array.
        aggregated_data: NumPy array of the row-wise group sums, one row per
            group and one column per CSV row.
        label_count: For each group, how many CSV rows have a sum > 0.
        dir: The ``dir`` argument.
        name: The ``name`` argument.
        summary: One-row DataFrame of ``label_count`` keyed by group name.
        total_columns: The number of groups (``num_groups``).

    Raises:
        IndexError: If the CSV has no column at the start of some group.
    """

    def __init__(self, name: str, dir: str, num_groups: int = 84,
                 group_size: int = 52):
        data_label = pd.read_csv(dir)
        aggregated_data = []
        aggregated_data_header = []

        for group in range(num_groups):
            # +1 skips column 0; this formula already yields 1 for group 0,
            # so no special case is needed for the first group.
            start = group * group_size + 1
            end = start + group_size
            current_header = data_label.columns[start].split('*')[0]
            aggregated_data.append(data_label.iloc[:, start:end].sum(axis=1))
            aggregated_data_header.append(current_header)

        # A row counts towards a group when any column of the group is positive.
        label_count = [np.count_nonzero(data > 0) for data in aggregated_data]
        data_summary = pd.DataFrame(data=[label_count], columns=aggregated_data_header)

        self.columns = aggregated_data_header
        self.raw_data = np.asarray(data_label)
        self.aggregated_data = np.asarray(aggregated_data)
        self.label_count = label_count
        self.dir = dir
        self.name = name
        self.summary = data_summary
        self.total_columns = num_groups


In [36]:
# Build one CTData summary per dataset split from its label CSV.
train_data = CTData(
    name="train",
    dir="Source/dataset/imgtrain_Abnormality_and_Location_Labels.csv",
)
test_data = CTData(
    name="test",
    dir="Source/dataset/imgtest_Abnormality_and_Location_Labels.csv",
)
val_data = CTData(
    name="val",
    dir="Source/dataset/imgvalid_Abnormality_and_Location_Labels.csv",
)

In [48]:
# Single-row frame: one column per split, each cell holding that split's
# full list of per-group label counts.
df = pd.DataFrame(
    data=[
        {
            'train': train_data.label_count,
            'test': test_data.label_count,
            'valid': val_data.label_count,
        },
    ],
)

In [50]:
# Expand every list-valued cell into one row per element, then replace the
# repeated index produced by the explode with a fresh 0..n-1 range.
df = df.apply(pd.Series.explode).reset_index(drop=True)

In [55]:
# After the explode step `df` has one row per label group and one column per
# split (train/test/valid). The group names therefore belong on the row
# index; assigning them to `df.columns` raises a length-mismatch ValueError
# because there are only three columns.
df.index = train_data.columns

In [64]:
# Write the per-split label-count comparison table to an Excel workbook
# in the current working directory.
excel_file_path = 'data_label_comparison.xlsx'
df.to_excel(excel_writer=excel_file_path)

In [71]:
# Collect the names of all entries in the dataset folder.
path = 'Source/dataset'
dir_list = os.listdir(path=path)

In [72]:
# Show every entry name found in the dataset folder.
print(dir_list)

['A systematic literature review of machine learning application in COVID-19 medical image classification.pdf', 'CT_Scan_Metadata_Complete_35747.csv', 'Extrema_35747.csv', 'imgtest_Abnormality_and_Location_Labels.csv', 'imgtrain_Abnormality_and_Location_Labels.csv', 'imgvalid_Abnormality_and_Location_Labels.csv', 'Indications_35747.csv', 'Summary_3630.csv', 'trn00022.npz', 'trn00039.npz', 'trn00042.npz', 'trn00058.npz', 'trn00067.npz', 'trn00093.npz', 'trn00131.npz', 'trn00139.npz', 'trn00141.npz', 'trn00143.npz', 'trn00149.npz', 'trn00167.npz', 'trn00180.npz', 'trn00187.npz', 'trn00193.npz', 'trn00195.npz', 'trn00252.npz', 'trn00268.npz', 'trn00270.npz', 'trn00277.npz', 'trn00301.npz', 'trn00309.npz', 'trn00317.npz', 'trn00318.npz', 'trn00323.npz', 'trn00324.npz', 'trn00335.npz', 'trn00336.npz', 'trn00345.npz', 'trn00347.npz', 'trn00367.npz', 'trn00370.npz', 'trn00384.npz', 'trn00393.npz', 'trn00396.npz', 'trn00420.npz', 'trn00425.npz', 'trn00428.npz', 'trn00437.npz', 'trn00450.npz', 

In [75]:
# Count directory entries per filename prefix: 'trn', 'tst' and 'val'.
# Each startswith() result is a bool, so summing them yields the number of
# matching entries.
train_count = sum(entry.startswith('trn') for entry in dir_list)
test_count = sum(entry.startswith('tst') for entry in dir_list)
valid_count = sum(entry.startswith('val') for entry in dir_list)

In [76]:
# Show the number of entries found for the 'trn', 'tst' and 'val' prefixes.
print(train_count, test_count, valid_count)

2286 360 984


In [77]:
# Total number of prefixed entries across the three splits. Computed from the
# counters instead of hand-copied literals so it cannot go stale when the
# dataset folder changes.
# NOTE(review): presumably expected to equal the 3630 in 'Summary_3630.csv' -- confirm.
train_count + test_count + valid_count

3630