In [None]:
#colab
# from google.colab import drive
# drive.mount('/content/drive')

# **Import Library**

## **Used Library**

### **Basic Library**
*   **Numpy**
*   **Pandas**
*   **Seaborn**
*   **Matplotlib**

### **Model Library**


*   **CatBoost**
*   **XGBoost**
*   **Sklearn**

In [None]:
!pip install catboost

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#--------------------------------#

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, train_test_split

# **Data Load**

*   **Train Data**
*   **Test Data**

**The train and test data have many features. \
So we need to check for NaN values and inspect the target's distribution.**

## **Data Read**

In [None]:
# Read the training CSV into a DataFrame and display it as the cell output.
train_csv_path = '../input/tabular-playground-series-jun-2021/train.csv'
train = pd.read_csv(train_csv_path)
train

In [None]:
# Read the test CSV into a DataFrame and display it as the cell output.
test_csv_path = '../input/tabular-playground-series-jun-2021/test.csv'
test = pd.read_csv(test_csv_path)
test

## **Checking Missing Values**

### **There are no missing values**

In [None]:
# For each dataset, print the largest per-column count of missing values
# (0 means no column contains any missing entry).
for frame in (train, test):
  worst_column_nulls = frame.isnull().sum().max()
  print(worst_column_nulls)

In [None]:
# Per-column count of missing values in the training set, drawn as a line plot.
train_null_counts = train.isnull().sum()
train_null_counts.plot()

In [None]:
# Per-column count of missing values in the test set, drawn as a line plot.
test_null_counts = test.isnull().sum()
test_null_counts.plot()

## **The target classes are imbalanced!**

### **→ Use K-Fold cross-validation**



In [None]:
# Bar chart of how many training rows fall in each target class.
plt.figure(figsize=(12,6))
# Pass the series by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a bare positional Series no longer
# means "count the values of this variable".
sns.countplot(x=train['target'])

# **Model**

## **Validation**

In [None]:
# The label is kept as its own Series; the features are every other column.
# NOTE(review): only 'target' is dropped, so any identifier column in `train`
# stays in train_x and is used as a feature — confirm this is intended.
train_y = train['target']
train_x = train.drop(columns = 'target')

# Shuffled 5-fold splitter used by the validation loops; seeded so the folds
# are the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Collects the per-fold log-loss values.
scores_logloss = []

### **XGBoost**

In [None]:
# K-fold cross-validation of XGBoost, scored with multi-class log loss.
# Start from an empty list so re-running this cell does not mix in scores
# from an earlier run.
scores_logloss = []

for tr_idx, va_idx in kf.split(train_x):
  tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
  tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

  # The objective and metric are stated as the multi-class ones so the
  # configuration matches how the model is scored below (log loss over
  # predict_proba); the previous 'reg:logistic' / 'auc' pair described a
  # binary task.
  # NOTE(review): 'gpu_hist' / 'gpu_predictor' require a GPU-enabled XGBoost
  # build — confirm they are still accepted by the installed version.
  xgb = XGBClassifier(eta = 0.05,
    max_depth = 10,
    subsample = 0.8,
    colsample_bytree = 0.7,
    objective = 'multi:softprob',
    eval_metric = 'mlogloss',
    tree_method = 'gpu_hist', 
    predictor = 'gpu_predictor')
  
  xgb.fit(tr_x, tr_y, verbose = True)
  xgb_pred = xgb.predict_proba(va_x)

  # Pass the fitted class order explicitly so the probability columns are
  # matched to the right labels even if a validation fold lacks a class.
  logloss = log_loss(va_y, xgb_pred, labels = xgb.classes_)
  scores_logloss.append(logloss)
  print(logloss)

# Mean log loss across the XGBoost folds; shown as the cell output.
logloss_xgb = np.mean(scores_logloss)
logloss_xgb

### **CatBoost**

In [None]:
# K-fold cross-validation of CatBoost, scored with multi-class log loss.
# Scores go into a CatBoost-only list: appending to the shared
# `scores_logloss` would average these folds together with the XGBoost folds
# already stored there and report a blended number as the CatBoost score.
scores_logloss_cat = []

for tr_idx, va_idx in kf.split(train_x):
  tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
  tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

  cat = CatBoostClassifier(depth=8,
                         iterations=1000,
                         learning_rate=0.02,                    
                         eval_metric='MultiClass',
                         loss_function='MultiClass',
                         bootstrap_type= 'Bernoulli',
                         leaf_estimation_method='Gradient',
                         random_state=123,
                         task_type='GPU')
  
  cat.fit(tr_x, tr_y, verbose = False)
  cat_pred = cat.predict_proba(va_x)

  # Pass the fitted class order explicitly so the probability columns are
  # matched to the right labels even if a validation fold lacks a class.
  logloss = log_loss(va_y, cat_pred, labels = cat.classes_)
  scores_logloss_cat.append(logloss)
  print(logloss)

# Mean log loss across the CatBoost folds only; shown as the cell output.
logloss_cat = np.mean(scores_logloss_cat)
logloss_cat

## **Fitting**

### **Train Data Split**

In [None]:
# Hold out 20% of the training rows as a validation set (seeded split).
tr_x, val_x, tr_y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=42,
)

### **XGBoost Fit**

In [None]:
# Refit the XGBoost estimator left over from the last validation fold, this
# time on the full training set (train_x / train_y).
xgb.fit(train_x, train_y, verbose = True)

### **CatBoost Fit**

In [None]:
# Refit the CatBoost estimator left over from the last validation fold on the
# 80% split, using the held-out 20% as the evaluation set. Progress is logged
# every 10 iterations and early_stopping_rounds is set to 30.
# NOTE(review): unlike the XGBoost fit, this model does not see the held-out
# rows during training — confirm the asymmetry is intended.
cat.fit(tr_x, tr_y, eval_set=(val_x, val_y), verbose = 10, early_stopping_rounds=30)

## **Ensemble**

In [None]:
# Class-probability predictions from each fitted model on the test set.
# NOTE(review): `test` is passed as-is, so any identifier column it carries is
# used as a feature, mirroring train_x — confirm this is intended.
result_cat = cat.predict_proba(test)
result_xgb = xgb.predict_proba(test)

# Ensemble: unweighted average of the two probability matrices.
result_esn = (result_cat + result_xgb) / 2

In [None]:
# Build the submission frame: one 'id' column followed by one probability
# column per class, named after the sample submission's class columns.
sub = pd.read_csv('../input/tabular-playground-series-jun-2021/sample_submission.csv')
col = sub.columns[1:]

# Fail early if the prediction matrix does not line up with the expected
# number of submission rows.
assert len(result_esn) == len(sub), 'prediction count does not match sample submission'

result = pd.DataFrame(result_esn, columns=col)

# Take the ids from the data instead of assuming they start at a hard-coded
# offset: prefer the test set's own ids (same row order as the predictions),
# and fall back to the sample submission's ids otherwise.
ids = test['id'] if 'id' in test.columns else sub['id']
result['id'] = ids.values

# Put 'id' first so the column layout matches the sample submission.
result = result[['id', *col]]
result

In [None]:
# Write the submission frame to 'sub.csv' without the DataFrame index column.
result.to_csv('sub.csv', index=False)