In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss

from lightgbm import LGBMClassifier

#import lightgbm as lgb
#import optuna.integration.lightgbm as lgb

import optuna

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Show up to 100 columns when a DataFrame is rendered.
pd.options.display.max_columns = 100

# 1. Import data

In [None]:
# All three competition files are read from the same directory; keeping the
# path in one constant lets the notebook be pointed at another copy of the
# data by editing a single line.
INPUT_DIR = "/kaggle/input/tabular-playground-series-jun-2021"

sample_submission = pd.read_csv(f"{INPUT_DIR}/sample_submission.csv")
train = pd.read_csv(f"{INPUT_DIR}/train.csv")
test = pd.read_csv(f"{INPUT_DIR}/test.csv")

# 2. Preprocessing

In [None]:
# Display the sample submission frame as loaded from sample_submission.csv.
sample_submission

In [None]:
# Display the training frame as loaded from train.csv.
train

In [None]:
# Display the test frame as loaded from test.csv.
test

In [None]:
# Drop the 'id' column from both frames.
# `columns=` already selects the axis, so the redundant `axis=1` is removed.
# `errors='ignore'` keeps this cell re-runnable: because it reassigns `train`
# and `test`, a second run would otherwise raise KeyError ('id' already gone).
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

In [None]:
# Missing-value matrix for the training frame.
msno.matrix(train, figsize=(10, 6), color=(0, 0.3, 0.3))

In [None]:
# Missing-value matrix for the test frame.
msno.matrix(test, figsize=(10, 6), color=(0, 0.3, 0.3))

In [None]:
# Count of training rows per target value, with bars in sorted label order.
# (Ordering by frequency instead would use train['target'].value_counts().index.)
plt.figure(figsize=(10, 6))
class_order = sorted(train['target'].unique())
sns.countplot(data=train, x='target', order=class_order)

In [None]:
# Summary statistics of the training features (target excluded), one row per
# feature, with the mean shown as bars and std / median as colour gradients.
train_feature_stats = train.drop(columns=['target']).describe().T
(
    train_feature_stats.style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[0])
    .background_gradient(subset=['std'], cmap='Greens')
    .background_gradient(subset=['50%'], cmap='BuGn')
)

In [None]:
# Summary statistics of the test columns, one row per column, with the mean
# shown as bars and std / median as colour gradients.
test_feature_stats = test.describe().T
(
    test_feature_stats.style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[0])
    .background_gradient(subset=['std'], cmap='Greens')
    .background_gradient(subset=['50%'], cmap='BuGn')
)

In [None]:
# Replace the 'target' labels with integer codes; the fitted encoder is kept
# in `le` so the label <-> code mapping stays available.
le = LabelEncoder().fit(train['target'])
train['target'] = le.transform(train['target'])

In [None]:
# Display the training frame after 'target' has been label-encoded.
train

# 3. Check the pairwise correlations between columns

In [None]:
# Pairwise Pearson correlation of every column in `train` (encoded target included).
train_corr = train.corr(method='pearson')
train_corr

In [None]:
# Heatmap of the correlation matrix; the colour scale is limited to [0, 0.12].
plt.figure(figsize=(20, 10))
heatmap_options = dict(
    vmin=0,
    vmax=0.12,
    center=0,
    square=False,
    annot=False,
    cmap='coolwarm',
)
sns.heatmap(train_corr, **heatmap_options);

# 4. Modeling

In [None]:
# Separate the encoded target from the feature columns.
y = train['target']
X = train.drop(columns=['target'])

In [None]:
def cross_val(X, y, model, params, folds=10, random_state=3,
              early_stopping_rounds=100, verbose=100):
    """Run stratified K-fold cross-validation and return the last fold's model.

    For every fold a fresh ``model(**params)`` is fitted on the training split,
    with the held-out split passed as ``eval_set``. The log loss on the
    held-out split is printed per fold, and the mean over all folds is printed
    at the end.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally via ``.iloc``.
    y : pandas.Series
        Target labels aligned with ``X``; also used for stratification.
    model : callable
        Estimator class (or factory). Instances must accept ``eval_set``,
        ``early_stopping_rounds`` and ``verbose`` in ``fit`` and provide
        ``predict_proba``.
    params : dict
        Keyword arguments used to construct the estimator for every fold.
    folds : int, default 10
        Number of stratified folds.
    random_state : int, default 3
        Seed for the shuffled fold assignment.
        NOTE(review): the original inline note read "(3,21) 1-30" --
        presumably seeds that were tried; confirm with the author.
    early_stopping_rounds : int, default 100
        Forwarded unchanged to ``fit``.
    verbose : int, default 100
        Forwarded unchanged to ``fit``.

    Returns
    -------
    object
        The estimator fitted on the final fold only. Estimators from the
        earlier folds are discarded, so the returned model has not seen the
        final fold's held-out rows during fitting.
    """
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)
    fold_losses = []

    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        print(f"Fold: {fold}")
        x_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        x_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

        alg = model(**params)
        # NOTE(review): LightGBM >= 4.0 no longer accepts
        # `early_stopping_rounds` / `verbose` as fit() keywords (they moved to
        # callbacks=[early_stopping(...), log_evaluation(...)]). Confirm the
        # installed version before upgrading.
        alg.fit(x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                early_stopping_rounds=early_stopping_rounds,
                verbose=verbose)

        pred = alg.predict_proba(x_valid)
        # Pass the classes the model was trained on so the probability columns
        # are matched to labels explicitly, even when a class happens to be
        # missing from this validation split.
        loss = log_loss(y_valid, pred, labels=getattr(alg, "classes_", None))
        fold_losses.append(loss)
        print(f"Log loss: {loss}")
        print("-"*62)

    mean_loss = sum(fold_losses) / len(fold_losses)
    print(f"Mean log loss over {len(fold_losses)} folds: {mean_loss}")

    return alg

In [None]:
# Hyperparameters used to construct the LGBMClassifier in each CV fold.
# n_estimators is a high ceiling; cross_val fits with early stopping on the
# held-out fold.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
# which is not set here -- presumably bagging is inactive; confirm.
lgb_params = dict(
    learning_rate=0.04,  # 0.045
    n_estimators=20000,
    max_bin=94,
    num_leaves=10,
    max_depth=8,
    reg_alpha=8.457,
    reg_lambda=6.853,
    subsample=0.749,
)

In [None]:
# Cross-validate LightGBM with the parameters above; keeps the estimator that
# cross_val returns.
lgb_model = cross_val(
    X=X,
    y=y,
    model=LGBMClassifier,
    params=lgb_params,
)

# 5. Prediction

In [None]:
# Predict class probabilities for the test frame with the estimator returned
# by cross_val (the one fitted in its final fold).
result = lgb_model.predict_proba(test)

# 6. Make submission file

In [None]:
# Overwrite the nine class columns of the sample submission with the predicted
# probabilities, then write the frame to lgb.csv without the index.
class_columns = [
    'Class_1', 'Class_2', 'Class_3',
    'Class_4', 'Class_5', 'Class_6',
    'Class_7', 'Class_8', 'Class_9',
]
sample_submission[class_columns] = result
sample_submission.to_csv('lgb.csv', index=False)

In [None]:
# Display the submission frame, whose class columns now hold the predictions.
sample_submission