In [None]:
import os
import gc
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import lightgbm as lgb
plt.style.use('ggplot')
sns.set(font_scale=1)
pd.set_option('display.max_columns', 500)

In [None]:
# Read the competition train/test tables from the shared input directory.
train, test = map(pd.read_csv, ('../input/train.csv', '../input/test.csv'))

In [None]:
# Predictor columns: every column except the row identifier (and, for the
# training frame, the label). `y` is the label column of the training frame.
features = [col for col in train.columns if col not in ('ID_code', 'target')]
features_t = [col for col in test.columns if col != 'ID_code']
y = train['target']

In [None]:
# Preview the first rows of the training frame.
train.head()

**Count the unique values in each column and sort the counts to check whether any feature could be encoded as a category.**

In [None]:
# Count the distinct values of every feature column and sort ascending, so
# the lowest-cardinality columns come first. Each entry is
# [n_unique, column_name]; list sorting compares the count first.
# dropna=False counts NaN as a value, matching len(Series.unique()).
# No per-column gc.collect() is needed: nothing large is allocated here.
unique = sorted(
    [train[col].nunique(dropna=False), col]
    for col in train.columns
    if col not in ('ID_code', 'target')
)

In [None]:
# Show the ten columns with the fewest distinct values.
unique[:10]

**As an example, limit the visualization to 5 columns and plot histograms and boxplots to look for outliers.**

In [None]:
def plot_col(train, y=y, rng=5):
    """Plot per-class histograms and a boxplot for the first `rng` columns in `unique`.

    Column names are read from the module-level `unique` list, whose entries
    are ``[n_unique, column_name]``, so that list must exist before calling.

    Parameters
    ----------
    train : DataFrame holding the columns named in `unique`. It is not modified.
    y : labels used to split the histograms when `train` has no 'target'
        column; they are matched to `train` by index.
    rng : number of columns to plot (capped at ``len(unique)``).
    """
    # Prefer the frame's own labels; otherwise fall back to `y` without
    # writing a 'target' column into the caller's DataFrame.
    target = train['target'] if 'target' in train.columns else y
    for i in range(min(rng, len(unique))):
        col = unique[i][1]
        # plt.subplots already creates the figure; an extra plt.figure() call
        # would only leave an empty figure behind on every iteration.
        fig, axis = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
        sns.distplot(train[col].loc[target == 0], kde=True, label='target 0', ax=axis[0])
        sns.distplot(train[col].loc[target == 1], kde=True, label='target 1', ax=axis[0])
        # Keyword `x=` keeps the call unambiguous across seaborn versions.
        sns.boxplot(x=train[col], ax=axis[1])
        fig.suptitle('{}'.format(col))
        # The labelled artists live on the histogram axes, so the legend goes there.
        axis[0].legend()
        plt.show()
        plt.close(fig)

**Check how many clusters can be distinguished in the data. Plot the k-means inertia (the within-cluster sum of squared distances) against the number of clusters; the "elbow" where the curve bends indicates a good number of clusters.**

In [None]:
%%time
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
def clusters(train, y, k_values=range(2, 11)):
    """Fit k-means for several cluster counts and collect the inertia of each fit.

    For every cluster count a count plot of the assigned cluster labels,
    split by `y`, is displayed.

    Parameters
    ----------
    train : feature matrix passed to ``KMeans.fit_predict``.
    y : per-row values used as the `hue` of the count plot.
    k_values : iterable of cluster counts to try (default 2..10).

    Returns
    -------
    list
        ``kmeans.inertia_`` for each entry of `k_values`, in the same order.
    """
    distance = []
    for cluster in k_values:
        fig = plt.figure(figsize=(10,7))
        print('Started checking {}'.format(cluster))
        kmeans = KMeans(cluster, random_state=2702)
        labels = kmeans.fit_predict(train)
        plt.title('Clusters {}'.format(cluster))
        # Pass the labels as `x=`: the meaning of the first positional
        # argument differs between seaborn versions.
        sns.countplot(x=labels, hue=y)
        plt.show()
        plt.close(fig)
        distance.append(kmeans.inertia_)
        gc.collect()
    return distance
# Run the k-means sweep on the feature columns only; `train_dist` holds the
# inertia recorded for each cluster count.
train_dist = clusters(train[features],train.target)

In [None]:
# Elbow plot of the k-means inertia. The sweep above starts at 2 clusters,
# so offset the x values to show the real cluster count, not the list index.
k_tested = list(range(2, 2 + len(train_dist)))
plt.figure(figsize=(15,6))
plt.plot(k_tested, train_dist, 'go--')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('K-means inertia by number of clusters')
plt.show()

**Define a function that plots the distribution of row-wise statistics (mean, standard deviation, minimum and maximum) so we can see how each transformation changes the data.**

In [None]:
def plot_data(train, test):
    """Overlay train and test distributions of row-wise summary statistics.

    For each of mean, std, max and min (computed across columns, axis=1) a
    separate figure is drawn with the train values in green and the test
    values in red.
    """
    train_cols = train.columns.values
    test_cols = test.columns.values
    panels = (
        ("Distribution of mean values per row in the train and test set", 'mean'),
        ("Distribution of std values per row in the train and test set", 'std'),
        ("Distribution of max values per row in the train and test set", 'max'),
        ("Distribution of min values per row in the train and test set", 'min'),
    )
    for heading, stat in panels:
        plt.figure(figsize=(20,8))
        plt.title(heading)
        train_rows = getattr(train[train_cols], stat)(axis=1)
        sns.distplot(train_rows, color="green", kde=True, bins=100, label='train')
        test_rows = getattr(test[test_cols], stat)(axis=1)
        sns.distplot(test_rows, color='red', kde=True, bins=100, label='test')
        plt.legend()
        plt.show()
    
# Row-wise summary distributions for the untransformed train/test features.
plot_data(train[features], test[features_t])

In [None]:
# Per-column histograms and boxplots for the untransformed training frame.
plot_col(train)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features. The scaler is fitted on the training data only
# and the same fitted statistics are applied to the test data; re-fitting on
# test would put the two frames on different scales.
sc = StandardScaler(copy=False)
train_sc = pd.DataFrame(sc.fit_transform(train[features]), columns=features)
# Transform the test frame with the exact columns the scaler was fitted on.
test_sc = pd.DataFrame(sc.transform(test[features]), columns=features)
gc.collect()

In [None]:
# Row-wise summary distributions for the StandardScaler output frames.
plot_data(train_sc, test_sc)

In [None]:
# Per-column histograms and boxplots for the StandardScaler output.
plot_col(train_sc)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale the features. The scaler is fitted on the training data only
# and then applied to the test data, so both frames share one mapping (test
# values may fall slightly outside [0, 1]; that is expected).
mn = MinMaxScaler(copy=False)
train_mn = pd.DataFrame(mn.fit_transform(train[features]), columns=features)
# Transform the test frame with the exact columns the scaler was fitted on.
test_mn = pd.DataFrame(mn.transform(test[features]), columns=features)
gc.collect()

In [None]:
# Row-wise summary distributions for the MinMaxScaler output frames.
plot_data(train_mn, test_mn)

In [None]:
# Per-column histograms and boxplots for the MinMaxScaler output.
plot_col(train_mn)

In [None]:
from sklearn.preprocessing import QuantileTransformer
# Quantile-transform the features. Use the QuantileTransformer created here
# (not the MinMaxScaler `mn` from the previous section), fit it on the
# training data only, and apply the fitted mapping to the test data.
# random_state makes any internal subsampling of the quantiles reproducible.
qt = QuantileTransformer(n_quantiles = 200, random_state=2702)
train_qt = pd.DataFrame(qt.fit_transform(train[features]), columns=features)
# Transform the test frame with the exact columns the transformer was fitted on.
test_qt = pd.DataFrame(qt.transform(test[features]), columns=features)
gc.collect()

In [None]:
# Row-wise summary distributions for the train_qt / test_qt frames.
plot_data(train_qt, test_qt)

In [None]:
from sklearn.preprocessing import RobustScaler
# Robust-scale the features. The scaler is fitted on the training data only
# and the same fitted statistics are applied to the test data, so both
# frames are scaled consistently.
rs = RobustScaler(copy=False)
train_rs = pd.DataFrame(rs.fit_transform(train[features]), columns=features)
# Transform the test frame with the exact columns the scaler was fitted on.
test_rs = pd.DataFrame(rs.transform(test[features]), columns=features)
gc.collect()

In [None]:
# Row-wise summary distributions for the RobustScaler output frames.
plot_data(train_rs, test_rs)

In [None]:
# Per-column histograms and boxplots for the RobustScaler output.
plot_col(train_rs)

**Create a baseline model, evaluate it on each of the transformed datasets, and inspect the feature importances.**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import eli5
# Baseline: stratified 70/30 hold-out split of the untransformed features,
# then a LightGBM classifier whose weights are displayed with eli5.
X_train, X_test, y_train, y_test = train_test_split(
    train[features],
    y,
    test_size=0.3,
    stratify=y,
    random_state=2701,
)
model = lgb.LGBMClassifier(n_estimators=5000, learning_rate=0.1, metric='auc')

model.fit(X_train, y_train)
eli5.explain_weights(model)

In [None]:
# AUC is a ranking metric: score the predicted probability of the positive
# class rather than hard 0/1 labels, and pass the true labels first.
pred = model.predict_proba(X_test)[:, 1]
print(f'AUC: {roc_auc_score(y_test, pred)}')

In [None]:
# Refit the same model on the StandardScaler output, using the same
# stratified 70/30 split settings.
X_train, X_test, y_train, y_test = train_test_split(
    train_sc[features],
    y,
    test_size=0.3,
    stratify=y,
    random_state=2701,
)
model.fit(X_train, y_train)
eli5.explain_weights(model)

In [None]:
# AUC is a ranking metric: score the predicted probability of the positive
# class rather than hard 0/1 labels, and pass the true labels first.
pred = model.predict_proba(X_test)[:, 1]
print(f'AUC: {roc_auc_score(y_test, pred)}')

In [None]:
# Refit the same model on the `train_qt` frame, using the same stratified
# 70/30 split settings.
X_train, X_test, y_train, y_test = train_test_split(
    train_qt[features],
    y,
    test_size=0.3,
    stratify=y,
    random_state=2701,
)
model.fit(X_train, y_train)
eli5.explain_weights(model)

In [None]:
# Refit the same model on the RobustScaler output, using the same
# stratified 70/30 split settings.
X_train, X_test, y_train, y_test = train_test_split(
    train_rs[features],
    y,
    test_size=0.3,
    stratify=y,
    random_state=2701,
)
model.fit(X_train, y_train)
eli5.explain_weights(model)

In [None]:
# AUC is a ranking metric: score the predicted probability of the positive
# class rather than hard 0/1 labels, and pass the true labels first.
pred = model.predict_proba(X_test)[:, 1]
print(f'AUC: {roc_auc_score(y_test, pred)}')

In [None]:
# LightGBM settings handed to `lgb.train` in the cross-validation cell.
# Keys and values are kept exactly as tuned, in their original order.
param = dict(
    bagging_freq=5,
    bagging_fraction=0.335,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.041,
    learning_rate=0.0083,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=-1,
)

In [None]:
from sklearn.model_selection import StratifiedKFold
# Stratified K-fold training: each fold's booster scores its held-out rows
# (out-of-fold predictions) and contributes an equal share to the averaged
# test-set prediction.
num_folds = 11

# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn releases raise ValueError for shuffle=False plus random_state.
folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=2702)
oof = np.zeros(len(train))
# The test accumulator is sized by the test frame, not by the training labels.
predictions = np.zeros(len(test))

# split() only needs the row count and the labels, so pass the frames
# directly instead of materialising `train.values` as a large object array.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train[features], y)):
    print("Fold idx:{}".format(fold_ + 1))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=y.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=y.iloc[val_idx])

    # NOTE(review): the `verbose_eval` / `early_stopping_rounds` keywords are
    # believed to be removed from `lgb.train` in LightGBM 4.x -- confirm
    # against the installed version and switch to callbacks when upgrading.
    clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=2000, early_stopping_rounds = 4000)
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(y, oof)))

In [None]:
# Assemble the submission table (test IDs plus the fold-averaged predictions)
# and write it to disk without the index column.
sub = pd.DataFrame({"ID_code": test.ID_code.values, "target": predictions})
sub.to_csv('submission.csv', index=False)