In [1]:
import numpy as np              # linear algebra
import pandas as pd             # data processing, CSV file I/O (e.g. pd.read_csv)
                                
import matplotlib.pyplot as plt # data visualization
import seaborn as sns           # data visualization
                                


# Data Preparation

---

## Data Extraction

In [2]:
# Directory containing the competition CSV files. Kept in one place so the
# notebook can be pointed at a different copy of the data by editing one line.
INPUT_DIR = "/kaggle/input/tabular-playground-series-nov-2021"

# Load the three competition files into DataFrames.
sample_submission = pd.read_csv(f"{INPUT_DIR}/sample_submission.csv")
train = pd.read_csv(f"{INPUT_DIR}/train.csv")
test = pd.read_csv(f"{INPUT_DIR}/test.csv")

In [3]:
# (rows, columns) of the sample submission frame.
sample_submission.shape

In [4]:
# (rows, columns) of the training frame.
train.shape

In [5]:
# Print pandas' summary of the training frame (index range, dtypes, memory usage).
train.info()

In [6]:
# (rows, columns) of the test frame.
test.shape

In [7]:
# Print pandas' summary of the test frame (index range, dtypes, memory usage).
test.info()

## Data Concatenation

In [8]:
# Stack the training rows on top of the test rows so both can be inspected together.
# `ignore_index` is not set, so each frame keeps its own row labels and the
# combined index is not unique.
# NOTE(review): presumably `test` has no `target` column, so those rows would get
# NaN there after the concat -- confirm against the `test.info()` output.
data = pd.concat([train, test], sort = False)
data.shape

In [9]:
# Print pandas' summary of the combined train + test frame.
data.info()

In [10]:
# Show the first five rows of the combined frame.
data.head()

## Null Check

In [11]:
# Names of the columns that contain at least one missing value.
# The last column of `data` is left out of the check.
checked_frame = data.iloc[:, :-1]
has_missing = checked_frame.isnull().any()
null_cols = has_missing[has_missing].index.tolist()
null_cols

In [12]:
# Names of the float64 columns, ignoring the first and the last column of `data`.
inner_dtypes = data.dtypes.iloc[1:-1]
float_cols = inner_dtypes[inner_dtypes == "float64"].index.tolist()
len(float_cols)

In [13]:
# Colour constants used by the distribution figure.
CHUNMEIHONG = '#f1939c'   # first palette colour for the hue split
QIUBOLAN = '#8abcd1'      # second palette colour for the hue split
XIANGYABAI = '#fffef8'    # grid line colour
ZHENZHUHUI = '#e4dfd7'    # axes background colour

# One panel per feature: 20 rows x 5 columns = 100 panels.
fig, axes = plt.subplots(20, 5, figsize = (16, 48))
axes = axes.flatten()

def features_distribution(axes, data = None, features = None, hue = 'target'):
    """Draw one filled KDE per feature, split by `hue`, onto the given axes.

    Parameters
    ----------
    axes : matplotlib Axes or array-like of Axes
        Panels to draw on; flattened internally, so a 2-D grid is accepted.
    data : pandas.DataFrame, optional
        Frame holding the feature columns and the `hue` column.
        Defaults to the notebook-level `train` frame.
    features : sequence of str, optional
        Column drawn on each panel, in panel order. Defaults to
        'f0', 'f1', ... with one name per panel.
    hue : str, optional
        Column used to split each density into coloured groups.

    Panels without a matching entry in `features` are hidden instead of
    being left as empty frames. The legend is drawn on the first panel only.
    """
    axes = np.ravel(axes)
    if data is None:
        data = train
    if features is None:
        features = [f'f{idx}' for idx in range(len(axes))]

    for idx, ax in enumerate(axes):
        if idx >= len(features):
            # More panels than features: hide the unused panel.
            ax.set_visible(False)
            continue

        feature = features[idx]
        sns.kdeplot(
            data = data,
            ax = ax,
            hue = hue,
            fill = True,
            x = feature,
            palette = [CHUNMEIHONG, QIUBOLAN],
            legend = idx == 0,
            alpha = .5,
            linewidth = 2.5,
        )

        ax.grid(
            color = XIANGYABAI,
            linestyle = ":",
            linewidth = 1.25,
            alpha = 0.3,
        )
        ax.set_facecolor(ZHENZHUHUI)
        ax.spines['left'].set_visible(False)
        ax.spines['top'].set_visible(False)
        # Ticks go on the right-hand side because the left spine is hidden.
        ax.yaxis.tick_right()
        ax.yaxis.set_label_position("left")
        ax.set_title(
            feature,
            loc = 'right',
            weight = 'bold',
            fontsize = 10,
        )
        # Per-panel labels are blanked; the figure-level labels below are shared.
        ax.set_xlabel('')
        ax.set_ylabel('')

features_distribution(axes)

# NOTE(review): the x axes show the feature values themselves; confirm that
# 'Probability' is the intended shared x label.
fig.supxlabel('Probability', ha = 'center', fontweight = 'bold', fontsize = 16, y = -0.01,)
fig.supylabel('Density', ha = 'center', fontweight = 'bold', fontsize = 16, x = -0.01,)
fig.suptitle('Features Distribution', ha = 'center', fontweight = 'heavy', fontsize = 20, y = 1,)
fig.tight_layout()

In [14]:
# Ten-panel preview (2 rows x 5 columns) of the feature-distribution figure.
# The colour constants and `features_distribution` come from the previous cell
# and are deliberately not redefined here: a second definition would shadow the
# full-figure version. With ten axes the function draws 'f0' through 'f9'.
fig, axes = plt.subplots(2, 5, figsize = (16, 4.8))
axes = axes.flatten()

features_distribution(axes)

fig.supxlabel('Probability', ha = 'center', fontweight = 'bold', fontsize = 16, y = -0.01,)
fig.supylabel('Density', ha = 'center', fontweight = 'bold', fontsize = 16, x = -0.01,)
fig.suptitle('Features Distribution', ha = 'center', fontweight = 'heavy', fontsize = 20, y = 1,)
fig.tight_layout()

In [15]:
# Show the columns at positions 1-10 together with the last column of `train`.
preview_positions = list(range(1, 11)) + [-1]
train.iloc[:, preview_positions]

In [16]:
# Preview a five-colour seaborn "light:" palette built around #0ABAB5,
# with desaturation set to 0.9.
sns.color_palette("light:#0ABAB5", n_colors = 5, desat = .9)

In [17]:
# Build one two-colour palette per desaturation level (five levels from 1 down to 0.05).
# Each entry pairs the first colour of the "light:#F7CAC9" palette with the first
# colour of the "light:#92A8D1" palette at that desaturation.
palettes = [
    [
        sns.color_palette(spec, n_colors = 5, desat = level)[0]
        for spec in ("light:#F7CAC9", "light:#92A8D1")
    ]
    for level in np.linspace(1, 0.05, num = 5)
]

palettes

In [18]:
# Column labels of `train` from 'f0' through the last column.
all_columns = train.columns
all_columns[all_columns.get_loc('f0'):]

In [19]:
# Scratch check: 5 % 5 evaluates to 0.
# NOTE(review): presumably left over from testing the commented-out `idx % 5`
# condition in the plotting cells -- remove if no longer needed.
5 % 5 

In [20]:
# Scratch check: 0 % 5 evaluates to 0.
# NOTE(review): presumably left over from testing the commented-out `idx % 5`
# condition in the plotting cells -- remove if no longer needed.
0% 5