In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in walk order.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Preparation

## Data extraction

In [2]:
# Directory holding the competition CSV files; change it here to point the
# notebook at another location.
INPUT_DIR = "/kaggle/input/tabular-playground-series-oct-2021"

# Load the three competition files: the submission template, the training
# rows and the test rows.
sample_submission = pd.read_csv(os.path.join(INPUT_DIR, "sample_submission.csv"))
train = pd.read_csv(os.path.join(INPUT_DIR, "train.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"))

In [3]:
# Show the (rows, columns) shape of the submission template.
sample_submission.shape

In [4]:
# Show the (rows, columns) shape of the training frame.
train.shape

In [5]:
# Show the (rows, columns) shape of the test frame.
test.shape

## Data concatenation

In [6]:
# Stack the training rows on top of the test rows so both can be inspected
# together; sort = False leaves the column order unsorted.
data = pd.concat(objs = (train, test), axis = 0, sort = False)
data.shape

In [7]:
# Print a summary of the combined frame.
data.info()

In [8]:
# Preview the first rows of the combined frame.
data.head()

## Null value check

In [9]:
# Names of the columns that contain at least one missing value. The last
# column is excluded from the check (presumably `target`, which the test
# rows do not have - TODO confirm).
checked_cols = data.columns[: -1]
null_cols = [name for name in checked_cols if data[name].isnull().any()]
null_cols

## Discrete features definition

In [10]:
# Collect the float64 columns, skipping the first and the last column
# (presumably `id` and `target` - TODO confirm).
float_cols = [name for name, dtype in data.dtypes.iloc[1 : -1].items() if dtype == "float64"]
len(float_cols)

In [11]:
# Collect the int64 columns, skipping the first and the last column
# (presumably `id` and `target` - TODO confirm).
int_cols = [name for name, dtype in data.dtypes.iloc[1 : -1].items() if dtype == "int64"]
len(int_cols)

In [12]:
# Keep the integer columns whose observed values are exactly {0, 1}.
# A set comparison is used because comparing the array of unique values with
# a two-element list fails when a column has a different number of distinct
# values. Iterating `int_cols` directly also avoids copying the integer
# sub-frame just to read its column names.
cols_binary = [col for col in int_cols if set(data[col].unique()) == {0, 1}]

# True when every integer column is binary.
int_cols == cols_binary

In [13]:
# Discard the combined frame now that the column lists are built; the cells
# below work from `train` and `test`.
del data

In [14]:
from sklearn.model_selection import train_test_split

# Separate the label from the features. The `id` column (assumed to be a row
# identifier, matching the first column skipped by the dtype checks above) is
# dropped as well so that it is not used as a predictor; errors = "ignore"
# keeps the cell working if the frames carry no such column.
y = train["target"].copy()
X = train.drop(columns = ["target", "id"], errors = "ignore")
X_test = test.drop(columns = ["id"], errors = "ignore")

# The model must see the same columns at training and at prediction time.
assert list(X.columns) == list(X_test.columns), "train/test feature columns differ"

del train, test

# Hold out 30% of the rows for validation, stratified on the label, with a
# fixed seed so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size = 0.3, random_state = 0, stratify = y
)

In [None]:
import lightgbm as lgb
from sklearn.metrics import auc

# Both datasets use the same categorical-feature list, and the validation set
# is created with reference = lgb_train so that it is binned consistently
# with the training set.
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature = cols_binary)
lgb_eval = lgb.Dataset(X_valid, y_valid, categorical_feature = cols_binary,
                       reference = lgb_train)

# Binary classification evaluated with ROC AUC.
params = {
    'objective': 'binary',
    'learning_rate': 0.01,
    'num_leaves': 40,
    'max_bin': 500,
    'metric': 'auc'
}

# Early stopping and periodic logging are passed as callbacks because the
# `early_stopping_rounds` / `verbose_eval` keyword arguments are not accepted
# by lightgbm >= 4.0. Older releases expose the logging callback under the
# name `print_evaluation`, hence the fallback.
log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation

# Train for at most 1000 rounds, logging every 10 rounds and stopping once
# the validation metric has not improved for 10 rounds.
model = lgb.train(params,
                  lgb_train,
                  num_boost_round = 1000,
                  valid_sets = [lgb_train, lgb_eval],
                  callbacks = [lgb.early_stopping(stopping_rounds = 10),
                               log_evaluation(period = 10)])


# Predict on the test features using the best iteration found during training.
y_pred = model.predict(X_test, num_iteration = model.best_iteration)

In [None]:
# Fill the submission template's `target` column with the test predictions
# and write the result to the working directory without the index.
template_path = "/kaggle/input/tabular-playground-series-oct-2021/sample_submission.csv"
sub = pd.read_csv(template_path).assign(target = y_pred)
sub.to_csv('submission_1st_trial.csv', index = False)

In [None]:
# Preview the first rows of the submission that was just written.
sub.head()

In [None]:
# Display the submission template loaded at the start of the notebook, for
# comparison with `sub`.
sample_submission