In [21]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and echo the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Preparation

## Data extraction

In [None]:
# Single place to change if the competition data is mounted elsewhere.
INPUT_DIR = "/kaggle/input/tabular-playground-series-oct-2021"

sample_submission = pd.read_csv(f"{INPUT_DIR}/sample_submission.csv")
train = pd.read_csv(f"{INPUT_DIR}/train.csv")
test = pd.read_csv(f"{INPUT_DIR}/test.csv")

In [None]:
sample_submission.shape

In [None]:
train.shape

In [None]:
test.shape

## Data concatenation

In [None]:
# Stack the two splits row-wise so column-level checks below see both at once.
# Columns are kept in their existing order (no sorting).
frames = [train, test]
data = pd.concat(frames, sort=False)
data.shape

In [None]:
data.info()

In [None]:
data.head()

## Null value check

In [None]:
# Count missing values per column in one pass, leaving out the last column
# (presumably `target`, which has no values for the test rows - TODO confirm).
null_counts = data.iloc[:, :-1].isnull().sum()
null_cols = null_counts[null_counts != 0].index.tolist()
null_cols

## Discrete features definition

In [None]:
# Names of the float64 columns, ignoring the first and last column of `data`.
inner_dtypes = data.iloc[:, 1:-1].dtypes
float_cols = inner_dtypes[inner_dtypes == "float64"].index.tolist()
len(float_cols)

In [None]:
# Names of the int64 columns, ignoring the first and last column of `data`.
inner_dtypes = data.iloc[:, 1:-1].dtypes
int_cols = inner_dtypes[inner_dtypes == "int64"].index.tolist()
len(int_cols)

In [None]:
# An integer column counts as binary when its distinct values are exactly {0, 1}.
# Comparing sets is order-independent and works for any number of distinct values;
# the element-wise `unique() == [0, 1]` form compares arrays of mismatched shape
# whenever a column does not have exactly two distinct values, which warns or
# raises depending on the NumPy version.
cols_binary = [col for col in int_cols if set(data[col].unique()) == {0, 1}]

# True when every integer feature turned out to be binary.
len(int_cols) == len(cols_binary)

In [None]:
# Release the combined train+test frame; only the column lists derived from it are kept.
del data

## Mutual information scoring

In [None]:
# Separate the label from the features without touching `train` itself:
# y is an independent copy of the `target` column, X is everything else.
y = train['target'].copy()
X = train.drop(columns='target')

In [None]:
# X and y now hold the training data, so the original frame is released.
# After this cell `train` is unbound: later cells must work from X and y.
del train

In [None]:
# Boolean mask aligned with X's columns: True where the column is one of the
# binary integer features. Built directly as a list instead of appending inside
# a throwaway comprehension; a set makes each membership test O(1).
binary_lookup = set(cols_binary)
discrete_features = [col in binary_lookup for col in X.columns]
len(discrete_features)

In [None]:
from sklearn.feature_selection import mutual_info_classif


def make_mi_scores(X, y, discrete_features, random_state=0):
    """Return the mutual information of each column of ``X`` with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels become the index of the result.
    y : array-like
        Class labels, passed straight to ``mutual_info_classif``.
    discrete_features : sequence of bool
        Forwarded to ``mutual_info_classif`` to mark which columns are discrete.
    random_state : int or None, default 0
        Forwarded to ``mutual_info_classif``. The estimator is randomised, so a
        fixed seed makes the ranking reproducible between runs. Pass ``None``
        to get the previous unseeded behaviour.

    Returns
    -------
    pandas.Series
        Scores named ``"MI Scores"``, indexed by column and sorted from the
        highest score to the lowest.
    """
    raw_scores = mutual_info_classif(
        X, y, discrete_features=discrete_features, random_state=random_state
    )
    labelled = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return labelled.sort_values(ascending=False)


# Score every column of X against y; the result is a Series sorted from highest to lowest MI.
mi_scores = make_mi_scores(X, y, discrete_features)

In [None]:
import matplotlib.pyplot as plt


def plot_mi_scores(scores):
    """Draw ``scores`` as a horizontal bar chart through the pyplot interface.

    ``scores`` is a pandas Series; its index labels are used as the y tick
    labels. Bars are drawn in ascending score order. Nothing is returned.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = list(ordered.index)
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")
    

# Side-by-side panels: the 20 highest-scoring features on the left,
# the 20 lowest-scoring on the right.
plt.figure(dpi=100, figsize=(8, 5))
for panel, subset in enumerate((mi_scores.head(20), mi_scores.tail(20)), start=1):
    plt.subplot(1, 2, panel)
    plot_mi_scores(subset)

In [None]:
y

In [None]:
# Rebuilt from X / y: `train` has already been deleted and `train_int_cols` was
# never defined, so the original expression could not run on a fresh kernel.
train_int_cols = [col for col in X.columns if X[col].dtype == "int64"]

# A column is binary when its distinct values are exactly {0, 1}; comparing sets
# is order-independent and safe for columns with any number of distinct values.
train_cols_binary = [col for col in train_int_cols if set(X[col].unique()) == {0, 1}]

# `target` is kept as the final entry so that callers can strip it with `[:-1]`.
# NOTE(review): presumably the original list was built on the full training
# frame (label included) - confirm.
if set(y.unique()) == {0, 1}:
    train_cols_binary.append(y.name)

len(train_cols_binary)

In [None]:
# `test_int_cols` was never defined anywhere in the notebook, so it is derived here.
test_int_cols = [col for col in test.columns if test[col].dtype == "int64"]

# A column is binary when its distinct values are exactly {0, 1}; comparing sets
# is order-independent and safe for columns with any number of distinct values.
test_cols_binary = [col for col in test_int_cols if set(test[col].unique()) == {0, 1}]
len(test_cols_binary)

In [None]:
from sklearn.model_selection import train_test_split

# `train` was deleted once X (features) and y (`target`) were split off from it,
# so the hold-out split is built from those two instead of indexing `train` again.
X_test = test

# 70/30 split, stratified on the label so both parts keep the same class balance;
# the fixed seed keeps the split identical between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

In [None]:
import lightgbm as lgb
from sklearn.metrics import auc

# One categorical-feature list (the binary integer columns) is used for training.
# The validation Dataset is tied to the training Dataset through `reference`, so
# both are binned the same way instead of carrying separately-derived column lists.
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=cols_binary)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = {
    'objective': 'binary',
    'learning_rate': 0.05,
    'num_leaves': 40,
    'max_bin': 300
}

# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments were removed from
# `lgb.train` in LightGBM 4, while the callbacks work on both 3.3+ and 4.x.
model = lgb.train(params, lgb_train,
                  valid_sets=[lgb_train, lgb_eval],
                  num_boost_round=1000,
                  callbacks=[lgb.early_stopping(stopping_rounds=10),
                             lgb.log_evaluation(period=10)])

# Predict with the iteration that scored best on the validation set.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

In [None]:
# Start from the already-loaded sample submission rather than reading the CSV a
# second time; `assign` returns a new frame, so `sample_submission` is untouched.
# NOTE(review): assumes the rows of `test` and `sample_submission` are in the same order - confirm.
sub = sample_submission.assign(target=y_pred)
sub.to_csv('submission_1st_trial.csv', index = False)

In [None]:
sub.head()

In [None]:
sample_submission