In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Filesystem layout of the competition data, relative to the notebook's
# working directory: two CSV files at the root and one JPEG folder per split.
DATA_PATH = Path("../input/siim-isic-melanoma-classification")
TRAIN_PATH = DATA_PATH.joinpath("train.csv")
TEST_PATH = DATA_PATH.joinpath("test.csv")
IMAGE_TRAIN_PATH = DATA_PATH.joinpath("jpeg", "train")
IMAGE_TEST_PATH = DATA_PATH.joinpath("jpeg", "test")

In [3]:
# Read the training CSV into a DataFrame and display its first five rows
# as the cell output.
train_df = pd.read_csv(filepath_or_buffer=TRAIN_PATH)
train_df.head(n=5)

The labels come at different levels of precision: **benign_malignant** and **target** both indicate whether the mole is a benign or a malignant tumor, while **diagnosis** offers a more precise diagnosis.

## Target and diagnosis

In [16]:
# Bar chart of the number of training rows per value of the "diagnosis" column.
diagnosis_ax = sns.countplot(data=train_df, x="diagnosis")
# Rotate the category tick labels by 45 degrees.
plt.setp(diagnosis_ax.get_xticklabels(), rotation=45)
diagnosis_ax.set_title("Diagnosis distribution", fontsize=17, fontweight="bold")
diagnosis_ax.set_xlabel("Diagnosis", fontsize=13)
diagnosis_ax.set_ylabel("Number of instances", fontsize=13);

In [5]:
# Cross-check the two label columns: count the rows where the textual label
# ("benign" / "malignant") contradicts the numeric target (0 / 1).
label_text = train_df["benign_malignant"]
label_code = train_df["target"]

# err1: labelled benign but target is 1; err2: labelled malignant but target is 0.
err1 = int(((label_text == "benign") & (label_code == 1)).sum())
err2 = int(((label_text == "malignant") & (label_code == 0)).sum())
print("Number of mislabeled candidates:", err1 + err2)

*0* mislabeled images, so we can drop the *benign_malignant* column: it is redundant with *target*.

In [6]:
# Remove the redundant "benign_malignant" column.
# Reassigning the result (rather than using inplace=True) together with
# errors="ignore" makes this cell safe to re-run: a second execution is a
# no-op instead of raising KeyError on the already-removed column.
train_df = train_df.drop(columns=["benign_malignant"], errors="ignore")

In [17]:
# Share of rows whose target equals 1, expressed as a percentage of all rows.
is_positive = train_df["target"] == 1
num_positives = int(is_positive.sum())
proportion = num_positives / len(train_df) * 100
print("Percent of positive class: %.2f" % proportion)

# Bar chart of the number of rows per target value.
target_ax = sns.countplot(data=train_df, x="target")
target_ax.set_title("Target distribution", fontsize=17, fontweight="bold")
target_ax.set_xlabel("Target", fontsize=13)
target_ax.set_ylabel("Number of instances", fontsize=13);

Only 1.76% of the samples belong to the positive class, so handling class imbalance should be a big part of the job.

In [19]:
# How does diagnosis correlate with the target?
sns.countplot(y='diagnosis', hue='target', data=train_df, palette="Greens_d");

All positive samples are labelled as melanoma in the *diagnosis* column. This column might not be very informative given the large number of *Unknown* diagnoses.

## Anatomy part

In [35]:
# Two side-by-side panels for the anatomical-site column:
#   left  - row count per site,
#   right - row count per target value, split by site.
site_col = "anatom_site_general_challenge"
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))
overall_ax, by_target_ax = axes

sns.countplot(data=train_df, y=site_col, ax=overall_ax)
overall_ax.set_title("Distribution of the anatomy part", fontsize=15)
overall_ax.set_ylabel("Anatomy part", fontsize=13)

sns.countplot(data=train_df, y="target", hue=site_col, ax=by_target_ax)
by_target_ax.set_title("Correlation anatomy part and target", fontsize=15)
by_target_ax.set_ylabel("Target", fontsize=13);

In [39]:
# Same site distribution, restricted to rows whose target equals 1.
positive_rows = train_df.loc[train_df["target"] == 1]
positive_site_ax = sns.countplot(data=positive_rows, y="anatom_site_general_challenge")
positive_site_ax.set_title("Distribution of anatomy part (positive label)", fontsize=15)
positive_site_ax.set_ylabel("Anatomy part", fontsize=13);

## Sex

In [44]:
# Two side-by-side panels for the "sex" column:
#   left  - row count per sex,
#   right - row count per target value, split by sex.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))
overall_ax, by_target_ax = axes

sns.countplot(data=train_df, y="sex", ax=overall_ax)
overall_ax.set_title("Distribution of sex", fontsize=15)
overall_ax.set_ylabel("Sex", fontsize=13)

sns.countplot(data=train_df, y="target", hue="sex", ax=by_target_ax)
by_target_ax.set_title("Correlation sex and target", fontsize=15)
by_target_ax.set_ylabel("Target", fontsize=13);

## Age

In [57]:
# Three panels for the "age_approx" column:
#   [0] kernel density estimate of age,
#   [1] row count per target value, split by 10-year age group,
#   [2] the same count restricted to rows whose target equals 1.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

sns.kdeplot(x="age_approx", data=train_df, ax=axes[0])
axes[0].set_title("Distribution of age", fontsize=15)
axes[0].set_xlabel("Age", fontsize=13)

# Build 10-year bin edges that reach the largest age present in the data.
# The previous fixed edges (0, 10, ..., 80) made pd.cut return NaN for any
# age above 80, so those rows were left out of the age-group plots.
# The edges never stop below 80, so data that already fit keeps its bins.
age_bin_width = 10
max_age = train_df["age_approx"].max()  # skips NaN; is NaN only when no age is known
if pd.isna(max_age):
    last_edge = 80
else:
    last_edge = max(80, int(np.ceil(max_age / age_bin_width)) * age_bin_width)
age_bins = list(range(0, last_edge + age_bin_width, age_bin_width))

# Adds (or overwrites) the "age_group" column on train_df.
train_df["age_group"] = pd.cut(train_df["age_approx"], bins=age_bins, include_lowest=True)
sns.countplot(x="target", hue="age_group", data=train_df, ax=axes[1])
axes[1].set_title("Correlation age and target", fontsize=15)
axes[1].set_xlabel("Target", fontsize=13);

sns.countplot(x="target", hue="age_group", data=train_df[train_df["target"] == 1], ax=axes[2])
axes[2].set_title("Correlation age and target (positive class)", fontsize=15)
axes[2].set_xlabel("Target", fontsize=13);

## Patients

In [62]:
# Report how many distinct values the "patient_id" column holds.
n_unique_patients = train_df["patient_id"].nunique()
print("There are %i unique patients." % n_unique_patients)

In [68]:
# For every patient, count how many distinct target values appear among
# their rows, then plot a histogram of those per-patient counts.
# Title and axis labels are set explicitly so the figure can be read on its
# own, like the other plots in this notebook.
targets_per_patient = train_df.groupby("patient_id")["target"].nunique()
patient_ax = targets_per_patient.plot(kind="hist")
patient_ax.set_title("Distinct target values per patient", fontsize=15)
patient_ax.set_xlabel("Number of distinct target values", fontsize=13)
patient_ax.set_ylabel("Number of patients", fontsize=13);