In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

This notebook is for beginners who are just getting started with Kaggle.

## TL;DR
**Let's start the journey together!**

<h2>Update</h2>

> Please visit the latest updated notebook :)

[[TPS-Dec] End-to-End ML Project for Beginner 😃](https://www.kaggle.com/leeyj0511/tps-dec-end-to-end-ml-project-for-beginner/notebook)

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras.utils import to_categorical

from matplotlib import ticker
import time
import warnings
warnings.filterwarnings('ignore')

# Data Loading and Preparation

In [None]:
# Read the three competition CSV files into DataFrames.
# Paths are relative to the notebook's working directory.
# `submission` holds the contents of sample_submission.csv.
train = pd.read_csv("../input/tabular-playground-series-dec-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-dec-2021/test.csv")
submission = pd.read_csv("../input/tabular-playground-series-dec-2021/sample_submission.csv")

In [None]:
# Print the (rows, columns) shape of both frames as a quick sanity check.
print(f"train.shape: {train.shape}")
print(f"test.shape: {test.shape}")

In [None]:
# Preview the first rows of the training data
train.head()

In [None]:
# Summary statistics for every column of the training data
train.describe()

In [None]:
# Preview the first rows of the test data
test.head()

In [None]:
# Summary statistics for every column of the test data
test.describe()

In [None]:
# Columns that are present in train but missing from test,
# i.e. the candidate target column(s).
set(train.columns).difference(test.columns)

`Cover_Type` is the target variable!

In [None]:
# Remove the 'Id' column from train and test.
# The frames are reassigned instead of using inplace=True, and
# errors='ignore' makes the drop a no-op when 'Id' is already gone,
# so this cell can be re-run without raising a KeyError.
train = train.drop(columns=['Id'], errors='ignore')
test = test.drop(columns=['Id'], errors='ignore')

# Every remaining train column except the target is a model feature.
target = 'Cover_Type'
features = [col for col in train.columns if col not in ['Id', target]]
print(f"features: {features}")
print(f"features len: {len(features)}")  # number of feature columns

# Seed value defined here for later use.
random_state = 42

# Simple EDA

In [None]:
# Summary statistics for every column except the last one, one row per
# column, ordered by standard deviation (largest first) and rendered with
# a colour gradient plus in-cell bars for the "max" and "mean" columns.
(
    train.iloc[:, :-1]
    .describe()
    .T
    .sort_values(by='std', ascending=False)
    .style
    .background_gradient(cmap='GnBu')
    .bar(subset=["max"], color='#BB0000')
    .bar(subset=["mean"], color='green')
)

## Continuous and Categorical Data Distribution

In [None]:
# Stack train and test so cardinality is measured over all available rows.
df = pd.concat([train[features], test[features]], axis = 0)

# Count distinct values once per column instead of once per list comprehension.
n_unique = df.nunique()

# Features with fewer than 25 distinct values are treated as categorical and
# everything else as continuous. The second test is `>= 25` so that a feature
# with exactly 25 distinct values is not silently left out of both lists.
cat_features = [col for col in features if n_unique[col] < 25]
cont_features = [col for col in features if n_unique[col] >= 25]

# Free the temporary combined frame; it is only needed for the counts above.
del df
print(f'Total number of features: {len(features)}')
print(f'Number of categorical features: {len(cat_features)}')
print(f'Number of continuos features: {len(cont_features)}')

# Pie chart showing the share of each feature kind.
plt.pie([len(cat_features), len(cont_features)], 
        labels=['Categorical', 'Continuos'],
        colors=['#76D7C4', '#F5B7B1'],
        textprops={'fontsize': 13},
        autopct='%1.1f%%')
plt.show()

## Feature Distribution of Continuous Features

In [None]:
# Grid of KDE plots comparing train and test for each continuous feature.
ncols = 5
# Ceiling division on the number of continuous features: enough rows to hold
# every plot, and at least one row so plt.subplots always gets a valid grid.
nrows = max(1, -(-len(cont_features) // ncols))

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(nrows, ncols, figsize=(18, 8), facecolor='#EAEAF2', squeeze=False)

# axes.flat walks the grid row by row, matching the order of cont_features.
for idx, ax in enumerate(axes.flat):
    if idx >= len(cont_features):
        # No feature left for this cell: hide the empty axes.
        ax.set_visible(False)
        continue
    col = cont_features[idx]
    sns.kdeplot(x=train[col], ax=ax, color='#58D68D', label='Train data')
    sns.kdeplot(x=test[col], ax=ax, color='#DE3163', label='Test data')
    ax.set_ylabel('')
    ax.set_xlabel(col, fontsize=8, fontweight='bold')
    ax.tick_params(labelsize=5, width=0.5)
    ax.xaxis.offsetText.set_fontsize(4)
    ax.yaxis.offsetText.set_fontsize(4)
plt.show()

## Feature Distribution of Categorical Features

In [None]:
# Grid of count plots comparing train and test for each categorical feature.
if len(cat_features) == 0 :
    print("No Categorical features")
else:
    ncols = 5
    # Ceiling division on the number of categorical features, so the grid has
    # exactly as many rows as needed.
    nrows = -(-len(cat_features) // ncols)

    # squeeze=False keeps `axes` two-dimensional even when there is a single row.
    fig, axes = plt.subplots(nrows, ncols, figsize=(18, 45), facecolor='#EAEAF2', squeeze=False)

    # axes.flat walks the grid row by row, matching the order of cat_features.
    for idx, ax in enumerate(axes.flat):
        if idx >= len(cat_features):
            # No feature left for this cell: hide the empty axes.
            ax.set_visible(False)
            continue
        col = cat_features[idx]
        sns.countplot(x=train[col], ax=ax, color='#58D68D', label='Train data')
        sns.countplot(x=test[col], ax=ax, color='#DE3163', label='Test data')
        ax.set_ylabel('')
        ax.set_xlabel(col, fontsize=8, fontweight='bold')
        ax.tick_params(labelsize=5, width=0.5)
        ax.xaxis.offsetText.set_fontsize(4)
        ax.yaxis.offsetText.set_fontsize(4)
    plt.show()

## Target Distribution

In [None]:
# Count how many training rows fall in each target class and give the
# resulting two-column frame explicit names.
target_df = train[target].value_counts().to_frame().reset_index()
target_df.columns = [target, 'count']

# Bar chart of the class counts, coloured by the count itself.
fig = px.bar(
    data_frame=target_df,
    x='Cover_Type',
    y='count',
    color="count",
    color_continuous_scale="Emrld",
)
fig.show()

# Show the same counts as a table ordered by class label.
target_df.sort_values(by=target, ignore_index=True)

## Removing Unwanted Rows and Columns

In [None]:
# Drop all training rows labelled Cover_Type == 5.
# A boolean mask is used instead of int(np.where(...)[0]) because the latter
# only works when exactly one row matches, relies on the deprecated conversion
# of an array to a Python scalar, and treats a positional index as an index
# label. The mask handles zero, one or many matching rows and keeps the
# original index labels of the remaining rows.
# NOTE(review): presumably class 5 is removed because it is too rare to be
# useful - confirm against the target distribution.
train = train[train["Cover_Type"] != 5]

# Columns removed from the training frame and from the feature list.
# NOTE(review): presumably these two columns carry no information - confirm.
unwanted_cols = ["Soil_Type7", "Soil_Type15"]

# errors="ignore" lets this cell be re-run after the columns are already gone.
train = train.drop(columns=unwanted_cols, errors="ignore")

# Update `features` in place, skipping names that were already removed so a
# re-run does not raise ValueError.
for col in unwanted_cols:
    if col in features:
        features.remove(col)