In [1]:
## Importing the libraries we will need 
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold,KFold
import warnings
from six.moves import urllib
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
%matplotlib inline
plt.style.use('seaborn')
from scipy.stats import norm, skew

In [2]:
#Loading data files
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')


In [None]:
#seperating features for the target variable for splitting the data set:
features = [c for c in train.columns if c not in ['ID_code', 'target']]

In [3]:
#Creatig the Target and Train data sets
target = train['target']
train = train.drop(["ID_code", "target"], axis=1)

In [None]:
## We will be training a Light GBM model on our data set . The first step will be to do some changes in our input data set:

Feature Engineering part :

In [4]:
##Data Augmentation 

def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [None]:
##Model Prameters. The below values have been selected after a series our Grid Searchs and random searchs of optimal values:

In [5]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0072,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 685,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': 1
        
    }


In [None]:
# Model Training :

In [6]:
num_folds = 15
features = [c for c in train.columns if c not in ['ID_code', 'target']]

folds = KFold(n_splits=num_folds, random_state=4322789)
oof = np.zeros(len(train))
getVal = np.zeros(len(train))
predictions = np.zeros(len(target))
feature_importance_df = pd.DataFrame()

print('Light GBM Model')
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    
    X_train, y_train = train.iloc[trn_idx][features], target.iloc[trn_idx]
    X_valid, y_valid = train.iloc[val_idx][features], target.iloc[val_idx]
    
    X_tr, y_tr = augment(X_train.values, y_train.values)
    X_tr = pd.DataFrame(X_tr)
    
    print("Fold idx:{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_valid, label=y_valid)
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx])
    
    clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 7000)
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    getVal[val_idx]+= clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration) / folds.n_splits
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Light GBM Model
Fold idx:1
Training until validation scores don't improve for 7000 rounds.
[5000]	training's auc: 0.909575	valid_1's auc: 0.897224
[10000]	training's auc: 0.918344	valid_1's auc: 0.902479
[15000]	training's auc: 0.924576	valid_1's auc: 0.903856
[20000]	training's auc: 0.930085	valid_1's auc: 0.903835
Early stopping, best iteration is:
[15495]	training's auc: 0.925139	valid_1's auc: 0.903952
Fold idx:2
Training until validation scores don't improve for 7000 rounds.
[5000]	training's auc: 0.910387	valid_1's auc: 0.889362
[10000]	training's auc: 0.91913	valid_1's auc: 0.893268
[15000]	training's auc: 0.925229	valid_1's auc: 0.893912
[20000]	training's auc: 0.930739	valid_1's auc: 0.894144
[25000]	training's auc: 0.935933	valid_1's auc: 0.89392
Early stopping, best iteration is:
[19849]	training's auc: 0.930577	valid_1's auc: 0.89421
Fold idx:3
Training until validation scores don't improve for 7000 rounds.
[5000]	training's auc: 0.909662	valid_1's auc: 0.899152
[10000]	tra

In [None]:
## Creating the Submission file:

In [7]:
num_sub = X
print('Saving the Submission File')
sub = pd.DataFrame({"ID_code": test.ID_code.values})
sub["target"] = predictions
sub.to_csv('submission{}.csv'.format(num_sub), index=False)
getValue = pd.DataFrame(getVal)
getValue.to_csv("Validation_kfold.csv")

Saving the Submission File
