# 12.2 병든 잎사귀 식별 경진대회 탐색적 데이터 분석
- [병든 잎사귀 식별 경진대회 링크](https://www.kaggle.com/c/plant-pathology-2020-fgvc7)

## 12.2.1 데이터 둘러보기

In [None]:
import pandas as pd
import numpy as np
import random
import os
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Data locations: the CSV tables sit directly under `data_path`, the image
# files in its `images/` subdirectory.
data_path = './data/'
INPUT_IMAGES_DIR = os.path.join(data_path, 'images/')

# Read the training table, the test table and the sample submission, in that order.
train_df, test_df, submission_df = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

((1821, 5), (1821, 1))

### Dataset Size

In [None]:
# Print the shape and show the first rows of the train and test tables,
# each followed by its own divider line.
for frame_name, frame, divider in (
    ("train_df", train_df, "================================================"),
    ("test_df", test_df, "===================="),
):
    print(f"{frame_name}.shape={frame.shape!r}")
    display(frame.head())
    print(divider)

# The submission preview is left as the final bare expression so the notebook
# renders it as the cell's output.
print(f"{submission_df.shape=}")
submission_df.head()

(1821, 9)


Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab,Healthy,Rust,Scab,Multiple diseases
0,Train_0,0,0,0,1,False,False,True,False
1,Train_1,0,1,0,0,False,False,False,True
2,Train_2,1,0,0,0,True,False,False,False
3,Train_3,0,0,1,0,False,True,False,False
4,Train_4,1,0,0,0,True,False,False,False


### Dataset Visualization

#### Label distribution

In [None]:
# Display label -> indicator column of train_df, in the order the bars are drawn.
label_columns = {
    "Healthy": "healthy",
    "Scab": "scab",
    "Rust": "rust",
    "Multiple diseases": "multiple_diseases",
}
categories = list(label_columns)

# Positives are the column sums; negatives are the remaining rows.
n_samples = len(train_df)
true_counts = [train_df[column].sum() for column in label_columns.values()]
false_counts = [n_samples - positives for positives in true_counts]

# Grouped bar chart: one "True" and one "False" bar per condition.
fig = go.Figure()
for series_name, series_counts, palette_index in (
    ("True", true_counts, 0),
    ("False", false_counts, 1),
):
    fig.add_trace(
        go.Bar(
            x=categories,
            y=series_counts,
            name=series_name,
            marker_color=px.colors.qualitative.Plotly[palette_index],
        )
    )

fig.update_layout(
    title="Comparison of Conditions (True vs False)",
    xaxis_title="Conditions",
    yaxis_title="Count",
    barmode="group",
    template="simple_white",
)
fig.show()

#### Pie plot

In [None]:
# Every column after the first is treated as a target label (assumes the first
# column is the image identifier); each pie slice is that column's total.
target_columns = train_df.columns[1:]
target_totals = [np.sum(train_df[column]) for column in target_columns]

fig = go.Figure(data=[go.Pie(labels=target_columns, values=target_totals)])

# Show label and percentage on each slice, with a black outline between slices.
fig.update_traces(
    hoverinfo='label+value+percent',
    textinfo='label+percent',
    textfont_size=20,
    marker={'line': {'color': '#000000', 'width': 2}},
)
fig.update_layout(title_text="Target Distribution of Training-Data ")
fig.show()

[Conclusion]  
We notice that the `multiple_diseases` class has the fewest samples.  
Therefore, we will use augmentation techniques to increase the number of samples for under-represented classes, so that the model treats all classes equally.

#### Image Visualization

In [None]:
# Function to display a grid of images
def show_random_images(image_ids, titles, nrows=2, ncols=2):
    """Plot images from ``INPUT_IMAGES_DIR`` on an ``nrows`` x ``ncols`` grid.

    The function does no sampling itself: it shows exactly the images named
    in ``image_ids``, in order, one per grid cell.

    Args:
        image_ids: Image identifiers; ``<id>.jpg`` is read from
            ``INPUT_IMAGES_DIR`` for each one.
        titles: Title for each image, paired with ``image_ids`` by position.
        nrows: Number of grid rows.
        ncols: Number of grid columns.

    Raises:
        IndexError: If ``titles`` is shorter than the images being drawn.

    Images beyond ``nrows * ncols`` are not drawn; grid cells without an
    image are left blank.
    """
    # squeeze=False always returns a 2-D array of axes, so a 1x1 grid
    # (where plt.subplots would otherwise return a bare Axes) works too.
    fig, axes = plt.subplots(nrows, ncols, figsize=(10, 10), squeeze=False)
    axes = axes.flatten()

    # zip stops at the shorter input, so having fewer images than grid cells
    # no longer raises IndexError.
    for idx, (ax, image_id) in enumerate(zip(axes, image_ids)):
        img = mpimg.imread(os.path.join(INPUT_IMAGES_DIR, image_id + '.jpg'))
        ax.imshow(img)
        ax.set_title(titles[idx])

    # Hide ticks and frames on every cell, including the ones left empty.
    for ax in axes:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Display the first image of each class.
# Titles and images are paired by position, so both lists are derived from a
# single title -> column mapping to keep their order in sync.
class_columns = {
    'Healthy': 'healthy',
    'Rust': 'rust',
    'Scab': 'scab',
    'Multiple Diseases': 'multiple_diseases',
}
disease_titles = list(class_columns)
sample_images = [
    train_df.loc[train_df[column] == 1, 'image_id'].iloc[0]
    for column in class_columns.values()
]
show_random_images(sample_images, disease_titles, nrows=2, ncols=2)

#### Split the image into RGB color channels

In [None]:
# Function to display all channels (RGB)
def show_all_channels(disease_ids, titles, nrows=4, ncols=4):
    """Plot each image next to its three individual colour channels.

    Every image occupies four consecutive panels: the original followed by
    channel 0, 1 and 2 of the array returned by ``mpimg.imread`` (labelled
    'R', 'G' and 'B').

    Args:
        disease_ids: Image identifiers; ``<id>.jpg`` is read from
            ``INPUT_IMAGES_DIR`` for each one.
        titles: Title prefix for each image, paired with ``disease_ids`` by
            position.
        nrows: Number of grid rows.
        ncols: Number of grid columns. With ``ncols >= 4`` every image starts
            on a new row; with fewer columns the four panels of an image wrap
            across consecutive cells.

    Raises:
        IndexError: If the grid has too few cells for the images supplied.
    """
    image_titles = ['Original', 'R', 'G', 'B']

    # squeeze=False always returns a 2-D array of axes, even for a 1x1 grid.
    fig, axes = plt.subplots(nrows, ncols, figsize=(18, 16), squeeze=False)
    axes = axes.flatten()

    # Distance between the first panels of consecutive images. Using ncols
    # (rather than a fixed 4) keeps every image on its own row when the grid
    # is wider than four columns.
    row_stride = max(ncols, len(image_titles))

    for idx, (image_id, title_prefix) in enumerate(zip(disease_ids, titles)):
        img = mpimg.imread(os.path.join(INPUT_IMAGES_DIR, image_id + '.jpg'))

        for channel, panel_title in enumerate(image_titles):
            ax = axes[idx * row_stride + channel]
            ax.set_title(f"{title_prefix} - {panel_title}")
            if channel == 0:
                ax.imshow(img)
            else:
                # Panels 1..3 show channels 0..2 of the image array.
                ax.imshow(img[:, :, channel - 1])

    # Hide ticks and frames on every cell, including the ones left empty.
    for ax in axes:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Display all channels for sample images
show_all_channels(sample_images, disease_titles, nrows=4, ncols=4)

In [None]:
# Draw one random image id per class: among the rows whose indicator column
# equals 1, sample a single row and keep its image_id.
class_images = {
    class_title: train_df.loc[train_df[indicator_column] == 1, 'image_id'].sample().iloc[0]
    for class_title, indicator_column in (
        ('Healthy', 'healthy'),
        ('Rust', 'rust'),
        ('Scab', 'scab'),
        ('Multiple Diseases', 'multiple_diseases'),
    )
}

# Plotly figure setup: one row per class, 5 columns per row:
#   1 image | 2 per-channel histograms | 3 per-channel box plots
#   4 all-channel histogram | 5 all-channel box plot
# The subplot titles are generated in that same column order, so each panel's
# title describes the traces that are actually placed in it.
panel_suffixes = ['', ' Color Histogram', ' Color Box Plot', ' All Colors', ' All Colors Box Plot']
fig = make_subplots(
    rows=len(class_images), cols=len(panel_suffixes),
    subplot_titles=[
        f'{disease}{suffix}' for disease in class_images for suffix in panel_suffixes
    ],
    specs=[
        [{'type': 'image'}] + [{'type': 'xy'} for _ in panel_suffixes[1:]]
        for _ in class_images
    ]
)

# Add the image and its pixel-intensity distributions for each class
for row, (disease, image_id) in enumerate(class_images.items(), start=1):
    image_path = os.path.join(INPUT_IMAGES_DIR, image_id + '.jpg')
    img = cv2.imread(image_path)
    # cv2.imread signals an unreadable file by returning None instead of raising.
    if img is None:
        raise FileNotFoundError(f'Could not read image: {image_path}')
    img = cv2.resize(img, (256, 256))
    # OpenCV loads images as BGR; convert so channel 0/1/2 is red/green/blue.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Original image (first column)
    fig.add_trace(go.Image(z=img), row=row, col=1)

    for channel, color in enumerate(['red', 'green', 'blue']):
        channel_values = img[:, :, channel].ravel()

        # Per-channel histogram (second column)
        fig.add_trace(
            go.Histogram(x=channel_values, opacity=0.5, marker_color=color, name=f'{disease} - {color}'),
            row=row, col=2
        )

        # Per-channel box plot (third column)
        fig.add_trace(
            go.Box(x=channel_values, boxmean=True, name=f'{disease} - {color}', marker_color=color, orientation='h'),
            row=row, col=3
        )

    # Per-pixel mean over the three channels. Added once per image, outside the
    # channel loop, so the gray traces are not drawn three times.
    all_channels = img.mean(axis=2).ravel()

    # All-channel histogram (fourth column)
    fig.add_trace(
        go.Histogram(x=all_channels, opacity=0.5, marker_color='gray', name='all channels'),
        row=row, col=4
    )

    # All-channel box plot (fifth column)
    fig.add_trace(
        go.Box(x=all_channels, boxmean=True, name='all channels', marker_color='gray', orientation='h'),
        row=row, col=5
    )

# Final layout update
fig.update_layout(
    title='Images & Their Channel Distributions (Including All Colors & Box Plot)',
    height=2000,
    width=2000,
    showlegend=False
)

# Display the figure
fig.show()

In [None]:
# Trace colour used for each class in every panel.
disease_colors = {
    'Healthy': 'green',
    'Rust': 'brown',
    'Scab': 'purple',
    'Multiple Diseases': 'orange',
}

# One subplot row per channel view: box plot on the left, histogram on the right.
row_titles = ('All Channels', 'Red Channel', 'Green Channel', 'Blue Channel')
# NOTE(review): shared_yaxes=True links, within each row, the box plot's y axis
# (trace names) with the histogram's y axis (probabilities) — confirm this is intended.
fig = make_subplots(
    rows=4, cols=2,
    subplot_titles=tuple(
        f'{row_title} ({plot_kind})'
        for row_title in row_titles
        for plot_kind in ('Box Plot', 'Histogram')
    ),
    shared_xaxes=True, shared_yaxes=True
)

# For every class, overlay its pixel-intensity distributions on the shared panels.
for disease, image_id in class_images.items():
    pixels = cv2.imread(data_path + 'images/' + image_id + '.jpg')
    pixels = cv2.resize(pixels, (256, 256))
    # OpenCV loads BGR; after conversion planes 0/1/2 are red/green/blue.
    pixels = cv2.cvtColor(pixels, cv2.COLOR_BGR2RGB)

    trace_marker = {'color': disease_colors[disease]}

    # (trace-name suffix, 2-D intensity plane), in subplot-row order; the first
    # entry is the per-pixel mean over the three colour planes.
    channel_planes = (
        ('All Channels', pixels.mean(axis=2)),
        ('Red', pixels[:, :, 0]),
        ('Green', pixels[:, :, 1]),
        ('Blue', pixels[:, :, 2]),
    )

    for subplot_row, (channel_label, plane) in enumerate(channel_planes, start=1):
        intensities = plane.ravel()
        trace_name = f'{disease} - {channel_label}'

        # Left column: horizontal box plot with the mean marked.
        fig.add_trace(
            go.Box(x=intensities, boxmean=True, orientation='h', opacity=0.5,
                   name=trace_name, marker=trace_marker),
            row=subplot_row, col=1
        )

        # Right column: histogram normalised to probabilities.
        fig.add_trace(
            go.Histogram(x=intensities, opacity=0.5, name=trace_name,
                         histnorm='probability', marker=trace_marker),
            row=subplot_row, col=2
        )

# Figure-level title, bar mode, axis titles and size.
fig.update_layout(
    title='Color Box Plots and Histograms for Different Diseases',
    barmode='group',
    xaxis_title='Pixel Intensity',
    yaxis_title='Channels',
    height=1200,
    width=1200,
)

# Display the figure
fig.show()