# 1. Importing libraries

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import six.moves.urllib as urllib
import sklearn
import scipy
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
import lightgbm as lgb
%matplotlib inline

# 2. Reading Datasets

In [22]:
# Load the train and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [23]:
# Summarise the train frame: index range, column count, dtypes and memory usage.
train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 202 entries, ID_code to var_199
dtypes: float64(200), int64(1), object(1)
memory usage: 308.2+ MB


In [24]:
# (rows, columns) of the train frame.
train.shape


(200000, 202)

In [25]:
# Preview the first rows of the train frame.
train.head()


Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


# 3. Checking of Data

In [26]:
# Percentage of missing values per column; columns without any gaps are left out,
# so an empty frame means the train set is complete.
data_na = 100 * (train.isnull().sum() / len(train))
data_na = data_na[data_na != 0].sort_values(ascending=False)
missing_data = pd.DataFrame({'MissingRatio': data_na})
print(missing_data)

Empty DataFrame
Columns: [MissingRatio]
Index: []


In [27]:
# Count rows per target class to gauge the class balance.
train.target.value_counts()

0    179902
1     20098
Name: target, dtype: int64

##### Note: the target classes are heavily imbalanced — only about 10% of the rows (20,098 of 200,000) belong to class 1

In [28]:
# Model inputs: every column except the row identifier and the label.
features = list(filter(lambda name: name not in ('ID_code', 'target'), train.columns))


# 4. EDA

##### 4.1 Compare the train and test distributions using row-wise and column-wise summary statistics

In [None]:
# Compare the per-row mean of the feature columns between train and test.
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Distribution of mean values per row in the train and test set')
sns.distplot(train[features].mean(axis=1), bins=120, kde=True, color='green', label='train', ax=ax)
sns.distplot(test[features].mean(axis=1), bins=120, kde=True, color='red', label='test', ax=ax)
ax.legend()
plt.show()


In [None]:
# Compare the per-column mean of the feature columns between train and test.
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Distribution of mean values per column in the train and test set')
sns.distplot(train[features].mean(axis=0), bins=120, kde=True, color='purple', label='train', ax=ax)
sns.distplot(test[features].mean(axis=0), bins=120, kde=True, color='orange', label='test', ax=ax)
ax.legend()
plt.show()


In [None]:
# Per-column mean of the feature columns, train versus test.
# NOTE(review): this cell produces the same plot as the cell directly before it;
# consider removing one of the two.
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Distribution of mean values per column in the train and test set')
sns.distplot(train[features].mean(axis=0), bins=120, kde=True, color='purple', label='train', ax=ax)
sns.distplot(test[features].mean(axis=0), bins=120, kde=True, color='orange', label='test', ax=ax)
ax.legend()
plt.show()


In [None]:
# Compare the per-row standard deviation of the feature columns between train and test.
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Distribution of std values per rows in the train and test set')
sns.distplot(train[features].std(axis=1), bins=120, kde=True, color='black', label='train', ax=ax)
sns.distplot(test[features].std(axis=1), bins=120, kde=True, color='yellow', label='test', ax=ax)
ax.legend()
plt.show()


In [None]:
# Compare the per-column standard deviation of the feature columns between train and test.
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Distribution of std values per column in the train and test set')
sns.distplot(train[features].std(axis=0), bins=120, kde=True, color='blue', label='train', ax=ax)
sns.distplot(test[features].std(axis=0), bins=120, kde=True, color='green', label='test', ax=ax)
ax.legend()
plt.show()


In [None]:
# check the feature correlation
# Restrict the frame to numeric columns first: `ID_code` is a string column, and
# pandas >= 2.0 raises on non-numeric data in DataFrame.corr() instead of
# silently dropping it as older releases did.
corrmat = train.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(corrmat, vmax=0.9, square=True, ax=ax)
ax.set_title('Correlation matrix of the numeric train columns')
plt.show()


In [None]:
%%time
# Flatten the feature-by-feature correlation matrix into one row per ordered
# pair, sorted ascending by correlation, then drop each feature's pairing with itself.
pairwise_corr = train[features].corr().unstack()
correlations = pairwise_corr.sort_values(kind='quicksort').reset_index()
correlations = correlations.loc[lambda pairs: pairs['level_0'] != pairs['level_1']]


In [None]:
# Last ten rows of the ascending-sorted pair table, i.e. the largest correlations.
correlations.tail(10)


In [None]:
# First ten rows of the ascending-sorted pair table, i.e. the smallest correlations.
correlations.head(10)


In [None]:
# check the distribution of each feature
def plot_features(df1,df2,label1,label2,features,nrows=10,ncols=20):
    """Overlay the density of each feature from two frames on a grid of subplots.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare; both must contain every column named in `features`.
    label1, label2 : str
        Legend labels for the curves drawn from `df1` and `df2`.
    features : iterable of str
        Column names to plot, one subplot per feature.
    nrows, ncols : int, optional
        Shape of the subplot grid. The default 10 x 20 grid holds 200 features.

    Raises
    ------
    ValueError
        If there are more features than grid cells.
    """
    features = list(features)
    if len(features) > nrows * ncols:
        raise ValueError(
            'Cannot place {} features on a {}x{} grid'.format(len(features), nrows, ncols))

    sns.set_style('whitegrid')
    # A single subplots() call creates the figure; a separate plt.figure() would
    # only leave an extra empty figure in the cell output.
    fig, axes = plt.subplots(nrows, ncols, figsize=(18, 22), squeeze=False)
    cells = axes.ravel()
    for ax, feature in zip(cells, features):
        sns.distplot(df1[feature], hist=False, label=label1, ax=ax)
        sns.distplot(df2[feature], hist=False, label=label2, ax=ax)
        ax.set_xlabel(feature, fontsize=9)
        # Negative pad pulls the tick labels inside the axes to save space.
        ax.tick_params(axis='x', which='major', labelsize=6, pad=-6)
        ax.tick_params(axis='y', which='major', labelsize=6)
    # Hide grid cells that received no feature.
    for ax in cells[len(features):]:
        ax.set_visible(False)
    # One shared legend so the two curves can be told apart.
    handles, labels = cells[0].get_legend_handles_labels()
    if handles:
        fig.legend(handles, labels, loc='upper right')
    plt.show()
        
# Split train by class and compare the densities of columns 2..201 between the classes.
t0 = train[train['target'] == 0]
t1 = train[train['target'] == 1]
features = train.columns.values[2:202]
plot_features(df1=t0, df2=t1, label1='0', label2='1', features=features)


In [None]:
# Same per-feature density grid, this time train versus test.
features = train.columns[2:202].values
plot_features(df1=train, df2=test, label1='train', label2='test', features=features)


In [None]:
# Per-row minimum of the feature columns, split by target class.
t0 = train[train['target'] == 0]
t1 = train[train['target'] == 1]
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Distribution of min values per row in the train set')
sns.distplot(t0[features].min(axis=1), bins=120, kde=True, color='orange', label='0', ax=ax)
sns.distplot(t1[features].min(axis=1), bins=120, kde=True, color='red', label='1', ax=ax)
ax.legend()
plt.show()


In [None]:
# Per-column minimum of the feature columns, split by target class.
plt.figure(figsize=(18,10))
plt.title('Distribution of min values per column in the train set')
sns.distplot(t0[features].min(axis=0),color='blue',kde=True,bins=120,label='0')
sns.distplot(t1[features].min(axis=0),color='green',kde=True,bins=120,label='1')
plt.legend()
# plt.show() renders the figure; the previous plt.plot() drew nothing and only
# echoed an empty list as the cell output.
plt.show()


In [None]:
# Per-row maximum of the feature columns, split by target class.
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Distribution of max values per row in the train set')
sns.distplot(t0[features].max(axis=1), bins=120, kde=True, color='orange', label='0', ax=ax)
sns.distplot(t1[features].max(axis=1), bins=120, kde=True, color='red', label='1', ax=ax)
ax.legend()
plt.show()


In [None]:
# Per-column maximum of the feature columns, split by target class.
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Distribution of max values per column in the train set')
sns.distplot(t0[features].max(axis=0), bins=120, kde=True, color='blue', label='0', ax=ax)
sns.distplot(t1[features].max(axis=0), bins=120, kde=True, color='green', label='1', ax=ax)
ax.legend()
plt.show()


In [None]:
# skewness and kurtosis
# Per-row skewness of the feature columns, split by target class.
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Distribution of skew values per row in the train set')
sns.distplot(t0[features].skew(axis=1), bins=120, kde=True, color='orange', label='0', ax=ax)
sns.distplot(t1[features].skew(axis=1), bins=120, kde=True, color='red', label='1', ax=ax)
ax.legend()
plt.show()


In [None]:
# Per-column skewness of the feature columns, split by target class.
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Distribution of skew values per column in the train set')
sns.distplot(t0[features].skew(axis=0), bins=120, kde=True, color='blue', label='0', ax=ax)
sns.distplot(t1[features].skew(axis=0), bins=120, kde=True, color='green', label='1', ax=ax)
ax.legend()
plt.show()


In [None]:
# Per-row kurtosis of the feature columns, split by target class.
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Distribution of kurtosis values per row in the train set')
sns.distplot(t0[features].kurtosis(axis=1), bins=120, kde=True, color='orange', label='0', ax=ax)
sns.distplot(t1[features].kurtosis(axis=1), bins=120, kde=True, color='red', label='1', ax=ax)
ax.legend()
plt.show()


In [None]:
# Per-column skewness of the feature columns, split by target class.
# NOTE(review): an earlier cell in this section already draws this exact plot;
# consider removing one of the two.
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Distribution of skew values per column in the train set')
sns.distplot(t0[features].skew(axis=0), bins=120, kde=True, color='blue', label='0', ax=ax)
sns.distplot(t1[features].skew(axis=0), bins=120, kde=True, color='green', label='1', ax=ax)
ax.legend()
plt.show()


In [None]:
# Per-row kurtosis of the feature columns, split by target class.
# NOTE(review): an earlier cell in this section already draws this exact plot;
# consider removing one of the two.
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Distribution of kurtosis values per row in the train set')
sns.distplot(t0[features].kurtosis(axis=1), bins=120, kde=True, color='orange', label='0', ax=ax)
sns.distplot(t1[features].kurtosis(axis=1), bins=120, kde=True, color='red', label='1', ax=ax)
ax.legend()
plt.show()


In [None]:
# Per-column kurtosis of the feature columns, split by target class.
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Distribution of kurtosis values per column in the train set')
sns.distplot(t0[features].kurtosis(axis=0), bins=120, kde=True, color='blue', label='0', ax=ax)
sns.distplot(t1[features].kurtosis(axis=0), bins=120, kde=True, color='green', label='1', ax=ax)
ax.legend()
plt.show()


# 5. Feature Engineering and Modelling


##### 5.1 Creating new row-wise statistical features that might be useful

In [29]:
# creating new features
# Each new column is a row-wise statistic over train columns 2..201, appended
# in place to both train and test.
idx = features = train.columns.values[2:202]
# new column name -> DataFrame method that computes it (insertion order is kept)
stat_methods = {
    'sum': 'sum',
    'min': 'min',
    'max': 'max',
    'mean': 'mean',
    'std': 'std',
    'skew': 'skew',
    'kurt': 'kurtosis',
    'med': 'median',
}
for df in [train, test]:
    raw = df[idx]
    for new_col, method_name in stat_methods.items():
        df[new_col] = getattr(raw, method_name)(axis=1)
# Everything from position 202 onwards is a newly added statistic.
train.iloc[:, 202:].head(10)


Unnamed: 0,sum,min,max,mean,std,skew,kurt,med
0,1456.3182,-21.4494,43.1127,7.281591,9.33154,0.10158,1.331023,6.7704
1,1415.3636,-47.3797,40.5632,7.076818,10.33613,-0.351734,4.110215,7.22315
2,1240.8966,-22.4038,33.882,6.204483,8.753387,-0.056957,0.546438,5.8994
3,1288.2319,-35.1659,38.1015,6.441159,9.594064,-0.480116,2.630499,6.7026
4,1354.231,-65.4863,41.1037,6.771155,11.287122,-1.463426,9.787399,6.94735
5,1272.3216,-44.7257,35.2664,6.361608,9.313012,-0.920439,4.581343,6.2379
6,1509.449,-29.9763,39.9599,7.547245,9.24613,-0.133489,1.816453,7.47605
7,1438.5083,-27.2543,31.9043,7.192542,9.162558,-0.300415,1.174273,6.973
8,1369.7375,-31.7855,42.4798,6.848687,9.83752,0.084047,1.99704,6.3287
9,1303.1155,-39.3042,34.464,6.515578,9.943238,-0.670024,2.52116,6.3632


In [None]:
# Preview the test columns from position 201 onwards.
test.iloc[:, 201:].head(10)


##### 5.2 Check the distribution of new features

In [None]:
def plot_new_features(df1,df2,label1,label2,features,nrows=2,ncols=4):
    """Overlay the kernel density of each feature from two frames on a subplot grid.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare; both must contain every column named in `features`.
    label1, label2 : str
        Legend labels for the curves drawn from `df1` and `df2`.
    features : iterable of str
        Column names to plot, one subplot per feature.
    nrows, ncols : int, optional
        Shape of the subplot grid. The default 2 x 4 grid holds eight features.

    Raises
    ------
    ValueError
        If there are more features than grid cells.
    """
    features = list(features)
    if len(features) > nrows * ncols:
        raise ValueError(
            'Cannot place {} features on a {}x{} grid'.format(len(features), nrows, ncols))

    sns.set_style('whitegrid')
    # A single subplots() call creates the figure; a separate plt.figure() would
    # only leave an extra empty figure in the cell output.
    fig, axes = plt.subplots(nrows, ncols, figsize=(18, 8), squeeze=False)
    cells = axes.ravel()
    for ax, feature in zip(cells, features):
        # NOTE(review): seaborn >= 0.11 deprecates `bw` in favour of `bw_method`;
        # kept as-is because the installed seaborn version is not visible here.
        sns.kdeplot(df1[feature], bw=0.5, label=label1, ax=ax)
        sns.kdeplot(df2[feature], bw=0.5, label=label2, ax=ax)
        ax.set_xlabel(feature, fontsize=11)
        ax.tick_params(axis='x', which='major', labelsize=8)
        ax.tick_params(axis='y', which='major', labelsize=8)
        # Draw the legend explicitly so the two curves can be told apart.
        ax.legend()
    # Hide grid cells that received no feature.
    for ax in cells[len(features):]:
        ax.set_visible(False)
    plt.show()
# Compare the columns from position 202 onwards between the two target classes.
t0 = train[train['target'] == 0]
t1 = train[train['target'] == 1]
features = train.columns[202:].values
plot_new_features(df1=t0, df2=t1, label1='0', label2='1', features=features)


In [None]:
# Report how many columns train and test each have.
print('Columns in train_set:{} Columns in test_set:{}'.format(len(train.columns),len(test.columns)))


# 6. Training of LightGBM model

In [30]:
# training the model
# Inputs are every column except the identifier and the label; the label is kept separately.
features = [name for name in train.columns if name not in ('ID_code', 'target')]
target = train['target']
# LightGBM settings passed to lgb.train().
param = dict(
    bagging_freq=5,
    bagging_fraction=0.4,
    boost='gbdt',
    boost_from_average='false',
    feature_fraction=0.05,
    learning_rate=0.01,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)

In [34]:
# 10-fold stratified cross-validation. `oof` collects each row's prediction from
# the fold in which it was held out; `predictions` is the fold-averaged test
# prediction; `feature_importance_df` stacks the per-fold feature importances.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=44000)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
fold_importances = []

# Upper bound on boosting rounds; early stopping decides the actual count.
num_round = 1000000

# `verbose_eval=` and `early_stopping_rounds=` were removed from lgb.train() in
# LightGBM 4.0, so logging and early stopping go through callbacks instead.
# `log_evaluation` replaced `print_evaluation` in 3.3; fall back for older releases.
log_every = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# The splitter only needs the labels to build stratified folds. A zero
# placeholder for X avoids materialising the mixed-dtype frame as a large
# object array, which is what `train.values` did.
split_placeholder = np.zeros(len(train))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(split_placeholder, target.values)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx])

    clf = lgb.train(
        param,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        callbacks=[lgb.early_stopping(3000), log_every(1000)],
    )
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)

    # Collect per-fold importances and concatenate once after the loop rather
    # than growing a DataFrame on every iteration.
    fold_importances.append(pd.DataFrame({
        "Feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))


Fold 0




[LightGBM] [Info] Number of positive: 18089, number of negative: 161911
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53040
[LightGBM] [Info] Number of data points in the train set: 180000, number of used features: 208
Training until validation scores don't improve for 3000 rounds
[1000]	training's auc: 0.885918	valid_1's auc: 0.875496
[2000]	training's auc: 0.903601	valid_1's auc: 0.887846
[3000]	training's auc: 0.914985	valid_1's auc: 0.894505
[4000]	training's auc: 0.922396	valid_1's auc: 0.898394
[5000]	training's auc: 0.928084	valid_1's auc: 0.900316
[6000]	training's auc: 0.932925	valid_1's auc: 0.901107
[7000]	training's auc: 0.937124	valid_1's auc: 0.901798
[8000]	training's auc: 0.941121	valid_1's auc: 0.902207
[9000]	training's auc: 0.944827	valid_1's auc: 0.90226
[10000]	training's auc: 0.948293	valid_1's auc: 0.90227
[11000]	training's auc: 0.951589	valid_1's auc: 0.902188
Early stopping, best iteration is:
[8864]	training's auc: 0.9

Early stopping, best iteration is:
[10356]	training's auc: 0.949799	valid_1's auc: 0.898259
Fold 6
[LightGBM] [Info] Number of positive: 18088, number of negative: 161912
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53040
[LightGBM] [Info] Number of data points in the train set: 180000, number of used features: 208
Training until validation scores don't improve for 3000 rounds
[1000]	training's auc: 0.886795	valid_1's auc: 0.868372
[2000]	training's auc: 0.903886	valid_1's auc: 0.881877
[3000]	training's auc: 0.915243	valid_1's auc: 0.889648
[4000]	training's auc: 0.92264	valid_1's auc: 0.893456
[5000]	training's auc: 0.928345	valid_1's auc: 0.895663
[6000]	training's auc: 0.933156	valid_1's auc: 0.897068
[7000]	training's auc: 0.937421	valid_1's auc: 0.897864
[8000]	training's auc: 0.941314	valid_1's auc: 0.898095
[9000]	training's auc: 0.945002	valid_1's auc: 0.898382
[10000]	training's auc: 0.948536	valid_1's auc: 0.898442
[11000]	training's

In [None]:
# Average each feature's importance over the folds, keep the 150 highest, and
# plot the per-fold importances of those features as a bar chart.
mean_importance = feature_importance_df[["Feature", "importance"]].groupby("Feature").mean()
cols = mean_importance.sort_values(by="importance", ascending=False).head(150).index
best_features = feature_importance_df[feature_importance_df["Feature"].isin(cols)]

fig, ax = plt.subplots(figsize=(14, 28))
sns.barplot(
    x="importance",
    y="Feature",
    data=best_features.sort_values(by="importance", ascending=False),
    ax=ax,
)
ax.set_title('Features importance (averaged/folds)')
plt.show()


# 7. Final Predictions 

In [36]:
# Pair each test ID with its fold-averaged prediction and write the submission file.
submission = pd.DataFrame({
    "ID_code": test['ID_code'].values,
    "target": predictions,
})
submission.to_csv('submission.csv', index=False)