**Quick Exploratory Data Analysis for the RANZCR CLiP - Catheter and Line Position Challenge**

In this competition, you’ll detect the presence and position of catheters and lines on chest x-rays. Use machine learning to train and test your model on 40,000 images to categorize a tube that is poorly placed.

In [None]:
# Upgrade pip itself first, then seaborn; -q keeps the install log quiet, -U upgrades to the newest release.
!pip install -q -U pip
!pip install -q -U seaborn

# **Importing Libraries**

In [None]:
import os
import ast
import random

import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3
import seaborn as sns

In [None]:
# Root folder of the competition data; every later file path is built from it with os.path.join.
BASE_DIR = "../input/ranzcr-clip-catheter-line-classification/"
# List the folder contents to see which files and sub-folders are available.
print(os.listdir(BASE_DIR))

In [None]:
# Load the training labels; the first CSV column becomes the index
# (later cells use the index values as image file names).
df_train = pd.read_csv(os.path.join(BASE_DIR, "train.csv"), index_col=0)
df_train.head()

**StudyInstanceUID** - unique ID for each image <br>
**ETT** - Abnormal - endotracheal tube placement abnormal<br>
**ETT** - Borderline - endotracheal tube placement borderline abnormal<br>
**ETT** - Normal - endotracheal tube placement normal<br>
**NGT** - Abnormal - nasogastric tube placement abnormal<br>
**NGT** - Borderline - nasogastric tube placement borderline abnormal<br>
**NGT** - Incompletely Imaged - nasogastric tube placement inconclusive due to imaging<br>
**NGT** - Normal - nasogastric tube placement normal<br>
**CVC** - Abnormal - central venous catheter placement abnormal<br>
**CVC** - Borderline - central venous catheter placement borderline abnormal<br>
**CVC** - Normal - central venous catheter placement normal<br>
**Swan Ganz Catheter Present** - a Swan-Ganz catheter is present in the image<br>
**PatientID** - unique ID for each patient in the dataset

In [None]:
plt.figure(figsize=(8, 8))
# Column-wise sum of every column except the last one
# (presumably PatientID, the only non-label column — verify against train.csv).
df_tmp = df_train.iloc[:, :-1].sum()
# Horizontal bars: one bar per label column, bar length = column sum.
sns.barplot(x=df_tmp.values, y=df_tmp.index)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlabel("Number of images", fontsize=15)
plt.title("Distribution of labels", fontsize=16);

In [None]:
# Count the distinct values in the PatientID column.
print("Number of unique patients: ", df_train["PatientID"].unique().shape[0])

In [None]:
plt.figure(figsize=(16, 6))
# value_counts() gives the number of rows per PatientID.
df_tmp = df_train["PatientID"].value_counts()
# Counting those per-patient totals again shows how many patients have 1, 2, 3, ... rows.
sns.countplot(x=df_tmp.values)
plt.xticks(fontsize=12, rotation=90)
plt.yticks(fontsize=14)
plt.xlabel("Number of observations", fontsize=15)
plt.ylabel("Number of patients", fontsize=15)
plt.title("Distribution of observations by PatientID", fontsize=16);

# **Annotations**

In [None]:
# Load the annotation table; later cells read its StudyInstanceUID, label and data columns.
df_annot = pd.read_csv(os.path.join(BASE_DIR, "train_annotations.csv"))
df_annot.head()

In [None]:
def plot_image_with_annotations(row_ind):
    """Show one X-ray twice: untouched, and with one annotation drawn on top.

    The left panel displays the image as-is; the right panel displays the same
    image with the points of the selected ``df_annot`` row scattered over it.
    The row's label is used as the figure title.

    Args:
        row_ind: Positional (``iloc``) index of the row in ``df_annot``.

    Raises:
        FileNotFoundError: If the image cannot be read from
            ``BASE_DIR/train/<StudyInstanceUID>.jpg``.
    """
    row = df_annot.iloc[row_ind]
    image_path = os.path.join(BASE_DIR, "train", row["StudyInstanceUID"] + ".jpg")
    label = row["label"]
    # The "data" column holds the text of a Python list of [x, y] pairs.
    data = np.array(ast.literal_eval(row["data"]))

    # cv2.imread does not raise on a missing/unreadable file, it returns None.
    # Check before creating the figure so a failure leaves no empty figure behind
    # and the error names the offending path.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # OpenCV loads channels as BGR; matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.imshow(image)
    plt.subplot(1, 2, 2)
    plt.imshow(image)
    plt.scatter(data[:, 0], data[:, 1])

    plt.suptitle(label, fontsize=15)

In [None]:
# Show the annotation stored at positional row 8 of df_annot.
plot_image_with_annotations(8)

In [None]:

def visualize_annotations(file_id):
    """Display one training image with every annotation recorded for it.

    All ``df_annot`` rows whose ``StudyInstanceUID`` equals ``file_id`` are
    drawn as scatter series on top of the image, one series per row, each
    labelled with the row's ``label``. When the image has no annotation rows
    only the image is shown.

    Args:
        file_id: Image identifier; the file read is
            ``BASE_DIR/train/<file_id>.jpg``.

    Raises:
        FileNotFoundError: If the image file cannot be read.
    """
    image_path = os.path.join(BASE_DIR, "train", file_id + ".jpg")
    # cv2.imread returns None instead of raising when the file cannot be read;
    # fail early with a clear message and before any figure is created.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # OpenCV loads channels as BGR; matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(8, 8))
    plt.imshow(image)

    df_patient = df_annot.loc[df_annot["StudyInstanceUID"] == file_id]

    if df_patient.shape[0]:
        labels = df_patient["label"].values.tolist()
        # Each "data" cell holds the text of a Python list of [x, y] pairs.
        lines = df_patient["data"].apply(ast.literal_eval).values.tolist()

        for line, label in zip(lines, labels):
            line = np.asarray(line)
            plt.scatter(line[:, 0], line[:, 1], s=40, label=label)

        # Anchor the legend outside the axes (to the right) so it does not cover the image.
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0, prop={'size': 20})

    plt.tick_params(axis="x", labelsize=15)
    plt.tick_params(axis="y", labelsize=15)

    plt.show()

In [None]:
# Hand-picked studies to display together with all of their annotation rows.
image_ids = [
    "1.2.826.0.1.3680043.8.498.83331936392921199432218327504041001669",
    "1.2.826.0.1.3680043.8.498.11693509889426445054876979814173446281",
    "1.2.826.0.1.3680043.8.498.15159015355212130418020059688126994534",
    "1.2.826.0.1.3680043.8.498.92067938763801985117661596637576203997",
]

# One figure per study.
for image_id in image_ids:
    visualize_annotations(image_id)

# **ETT Abnormal**

In [None]:
def visualize_batch(image_ids, n_cols=3):
    """Show a batch of training images in a grid, without axes.

    Up to six images are laid out on the original 2x3 grid with a 16x10 inch
    figure. Larger batches get additional rows (and a proportionally taller
    figure) instead of failing on an out-of-range subplot index.

    Args:
        image_ids: Iterable of image identifiers; each image is read from
            ``BASE_DIR/train/<image_id>.jpg``.
        n_cols: Number of grid columns (default 3).

    Raises:
        ValueError: If ``n_cols`` is smaller than 1.
        FileNotFoundError: If one of the image files cannot be read.
    """
    if n_cols < 1:
        raise ValueError(f"n_cols must be at least 1, got {n_cols}")

    # Materialize once so generators work and the batch size is known up front.
    image_ids = list(image_ids)
    # Ceiling division; never fewer than two rows so small batches keep the original layout.
    n_rows = max(2, -(-len(image_ids) // n_cols))

    # 5 inches per row reproduces the original 16x10 figure for a two-row grid.
    plt.figure(figsize=(16, 5 * n_rows))

    for ind, image_id in enumerate(image_ids):
        plt.subplot(n_rows, n_cols, ind + 1)
        image_path = os.path.join(BASE_DIR, "train", f"{image_id}.jpg")
        # cv2.imread returns None instead of raising when the file cannot be read.
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")
        # OpenCV loads channels as BGR; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.imshow(image)
        plt.axis("off")

    plt.show()

    
def plot_statistics(df, col):
    """Draw a horizontal count plot of the values found in ``df[col]``.

    Args:
        df: DataFrame that contains the column to summarise.
        col: Column name; also used as the y-axis label and in the title.
    """
    tick_font = {"fontsize": 12}
    axis_label_font = {"fontsize": 15}

    plt.figure(figsize=(16, 2))
    sns.countplot(y=df[col])

    # Same tick styling on both axes.
    for style_ticks in (plt.xticks, plt.yticks):
        style_ticks(**tick_font)

    # Axis captions: counts along x, the column itself along y.
    for set_label, text in ((plt.xlabel, "Number of observations"), (plt.ylabel, col)):
        set_label(text, **axis_label_font)

    plt.title(f"Distribution of {col}", fontsize=16)

    plt.show()
    
def process_class(col_name, seed=None):
    """Plot a label's distribution and show random example images for it.

    First draws the value distribution of ``col_name`` over ``df_train``, then
    displays up to six randomly chosen images whose label equals 1. Labels
    with fewer than six positive images show all of them instead of raising
    ``ValueError`` from ``random.sample``; labels with no positive image skip
    the image grid entirely.

    Args:
        col_name: Name of a label column in ``df_train``.
        seed: Optional seed for the example selection. ``None`` (default)
            uses the global ``random`` state, as before.
    """
    plot_statistics(df_train, col_name)

    positive_ids = df_train[df_train[col_name] == 1].index.tolist()
    if not positive_ids:
        print(f"No images are labelled '{col_name}'; nothing to display.")
        return

    # A dedicated generator keeps a seeded call from disturbing the global random state.
    rng = random if seed is None else random.Random(seed)
    # Six matches the 2x3 grid drawn by visualize_batch; never ask for more than exist.
    n_examples = min(6, len(positive_ids))
    visualize_batch(rng.sample(positive_ids, n_examples))

In [None]:
# Label distribution plus random example X-rays labelled "ETT - Abnormal".
process_class("ETT - Abnormal")

In [None]:
# Show a single study together with whatever annotation rows df_annot holds for it.
visualize_annotations("1.2.826.0.1.3680043.8.498.93345761486297843389996628528592497280")

In [None]:
# Label distribution plus random example X-rays labelled "ETT - Borderline".
process_class("ETT - Borderline")

In [None]:
# Label distribution plus random example X-rays labelled "ETT - Normal".
process_class("ETT - Normal")

# **NGT Abnormal**

In [None]:
# Label distribution plus random example X-rays labelled "NGT - Abnormal".
process_class("NGT - Abnormal")

# **NGT Borderline**

In [None]:
# Label distribution plus random example X-rays labelled "NGT - Borderline".
process_class("NGT - Borderline")

# **NGT Incompletely Imaged**

In [None]:
# Label distribution plus random example X-rays labelled "NGT - Incompletely Imaged".
process_class("NGT - Incompletely Imaged")

# **Venn Diagrams**

In [None]:
def plot_venn2(col_1, col_2):
    """Draw a two-set Venn diagram of two label columns of ``df_train``.

    ``venn2`` expects the sizes of the three *exclusive* regions
    ``(Ab, aB, AB)``: only in the first set, only in the second set, in both.
    Passing each column's total would count the overlap twice, so the regions
    are computed from boolean membership masks instead.

    Args:
        col_1: Name of the first label column (a row is a member when its value is 1).
        col_2: Name of the second label column.
    """
    plt.figure(figsize=(6, 6))

    in_1 = df_train[col_1] == 1
    in_2 = df_train[col_2] == 1

    only_1 = int((in_1 & ~in_2).sum())  # Ab
    only_2 = int((~in_1 & in_2).sum())  # aB
    both = int((in_1 & in_2).sum())     # AB

    venn2(
        subsets=(only_1, only_2, both),
        set_labels=(col_1, col_2),
        alpha=0.5,
    )

In [None]:
# Overlap between images labelled "ETT - Abnormal" and "NGT - Abnormal".
plot_venn2("ETT - Abnormal", "NGT - Abnormal")

In [None]:
# Overlap between images labelled "ETT - Abnormal" and "CVC - Abnormal".
plot_venn2("ETT - Abnormal", "CVC - Abnormal")

In [None]:
def plot_venn3(col_1, col_2, col_3):
    """Draw a three-set Venn diagram of three label columns of ``df_train``.

    ``venn3`` expects the sizes of the seven *exclusive* regions in the order
    ``(Abc, aBc, ABc, abC, AbC, aBC, ABC)``. Column totals and pairwise
    intersections overlap each other (the pairwise ones also contain the
    triple intersection), so every region is computed from boolean membership
    masks with the other sets explicitly excluded.

    Args:
        col_1: Name of the first label column (a row is a member when its value is 1).
        col_2: Name of the second label column.
        col_3: Name of the third label column.
    """
    plt.figure(figsize=(6, 6))

    in_1 = df_train[col_1] == 1
    in_2 = df_train[col_2] == 1
    in_3 = df_train[col_3] == 1

    def region_size(mask):
        # Number of rows that fall into exactly this region.
        return int(mask.sum())

    subsets = (
        region_size(in_1 & ~in_2 & ~in_3),   # Abc: only the first set
        region_size(~in_1 & in_2 & ~in_3),   # aBc: only the second set
        region_size(in_1 & in_2 & ~in_3),    # ABc: first and second, not third
        region_size(~in_1 & ~in_2 & in_3),   # abC: only the third set
        region_size(in_1 & ~in_2 & in_3),    # AbC: first and third, not second
        region_size(~in_1 & in_2 & in_3),    # aBC: second and third, not first
        region_size(in_1 & in_2 & in_3),     # ABC: all three sets
    )

    venn3(
        subsets=subsets,
        set_labels=(col_1, col_2, col_3),
        alpha=0.5,
    )

In [None]:
# Overlap of the "Abnormal" label across the three tube types.
plot_venn3(
    "ETT - Abnormal",
    "NGT - Abnormal",
    "CVC - Abnormal",
)

In [None]:
# Overlap of the "Normal" label across the three tube types.
plot_venn3(
    "ETT - Normal",
    "NGT - Normal",
    "CVC - Normal",
)

In [None]:
# Overlap of the "Borderline" label across the three tube types.
plot_venn3(
    "ETT - Borderline",
    "NGT - Borderline",
    "CVC - Borderline",
)

In [None]:
# Load the sample submission, using its first column as the index.
df_submission = pd.read_csv(os.path.join(BASE_DIR, "sample_submission.csv"), index_col=0)
df_submission

In [None]:
# Write the sample submission back out unchanged; no predictions are made in this notebook.
df_submission.to_csv("submission.csv")