In [None]:
### load libs
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
from sklearn.model_selection import train_test_split

In [None]:
### load data
# Read the competition train and test tables from the shared input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)


In [None]:
from tqdm import tqdm

# Plain ndarray of the test feature columns (identifier column removed).
df_test = test.drop(columns=['ID_code']).values
unique_samples = []

# unique_count[r, c] is 1 when the value at row r is the only occurrence of
# that value in column c, and 0 otherwise.
unique_count = np.zeros_like(df_test)
for col in tqdm(range(df_test.shape[1])):
    _, first_pos, n_occurrences = np.unique(
        df_test[:, col], return_index=True, return_counts=True
    )
    unique_count[first_pos[n_occurrences == 1], col] += 1

# A row holding at least one column-unique value is treated as real;
# a row with none is treated as synthetic (fake).
unique_per_row = np.sum(unique_count, axis=1)
real_samples_indexes = np.where(unique_per_row > 0)[0]
synthetic_samples_indexes = np.where(unique_per_row == 0)[0]

# Real rows as a DataFrame whose columns are named var_0, var_1, ...
df_test_real = pd.DataFrame(df_test[real_samples_indexes].copy()).add_prefix('var_')
df_test_real.head()

In [None]:
import gc

# Stack the train features (identifier and label removed) on top of the real
# test rows. The intermediate train-only frame is never bound to a name, so
# there is nothing to delete afterwards.
df_combined = pd.concat(
    [train.drop(columns=['ID_code', 'target']), df_test_real]
)
gc.collect()

In [None]:
# Sanity check: (rows, columns) of the stacked train + real-test feature frame.
df_combined.shape

In [None]:
# from sortedcontainers import SortedDict
# dictionary=df_combined['var_0'].value_counts().to_dict()
# np.diff(np.array(sorted(dictionary)))
#np.array(sorted(dictionary.keys()))

### Feature Engineering: 

- `We count how often each value occurs in the combined train + real-test data and map those counts back onto the original train and test sets.`

- `We also apply a mathematical transformation to the counts. I used log, but others (e.g. sqrt) can work as well.`

In [None]:
def _sorted_neighbour_gap(values):
    """Return, per input position, the value minus the next value in ascending order.

    The values are sorted, each one has its successor subtracted
    (``Series.diff(-1)``), and the result is put back into the original
    positional order. The largest value has no successor and gets NaN.
    """
    ordered = pd.Series(np.asarray(values)).sort_values()
    return ordered.diff(-1).sort_index().to_numpy()


# df_combined stacks the train rows first and the real test rows after them,
# so the first len(train) positions of any positional result belong to train.
n_train = len(train)
assert len(df_combined) == n_train + len(df_test_real), "unexpected df_combined layout"

for i in range(200):
    var = 'var_' + str(i)
    if i % 25 == 0:
        print(i)

    # How often each value of this variable occurs in train + real test.
    dictionary = df_combined[var].value_counts().to_dict()

    # Gap features are computed positionally. Going through index labels is
    # wrong here: df_combined's labels repeat (train and real-test both start
    # at 0), and df_test_real is labelled 0..n_real-1 rather than by its row
    # position in `test`.
    train_gap = _sorted_neighbour_gap(df_combined[var])[:n_train]

    # Only real test rows get a gap; synthetic rows stay NaN.
    test_gap = np.full(len(test), np.nan)
    test_gap[real_samples_indexes] = _sorted_neighbour_gap(df_test_real[var])

    for frame, gap in ((train, train_gap), (test, test_gap)):
        # log2(count + 1); NaN where a value was never seen in df_combined.
        log_count = np.log2(frame[var].map(dictionary) + 1)
        frame['cMap_' + var] = frame[var] * log_count
        frame['cMap2_' + var] = frame[var] / log_count
        frame['cDiff_' + var] = gap

In [None]:
# The combined frame is not used by any cell below, so release its memory.
del df_combined
gc.collect()
# Last expression of the cell: show the final rows of train, which now
# includes the engineered cMap_/cMap2_/cDiff_ columns.
train.tail()

In [None]:
from tqdm import tqdm
from scipy import stats  # retained so `stats` stays available to other cells
import gc


def _weak_percentile(sorted_ref, scores):
    """Percentage of entries in `sorted_ref` that are <= each score.

    Vectorised replacement for calling
    ``stats.percentileofscore(sorted_ref, s, 'weak')`` once per score.
    `sorted_ref` must be sorted ascending (``np.unique`` output is).
    NOTE(review): assumes neither argument contains NaN; SciPy's NaN handling
    for percentileofscore differs between versions - confirm if NaNs can occur.
    """
    n_not_above = np.searchsorted(sorted_ref, scores, side='right')
    return n_not_above * (100.0 / len(sorted_ref))


def _add_comparison_features(frame, var):
    """Add the comparison and percentile columns for `var` to `frame` in place.

    Columns added, in this order:
      Test_eq_test1_<var>       : 1 if cMap == cMap2, 2 if cMap > cMap2,
                                  3 if cMap < cMap2, 0 otherwise (e.g. NaN)
      Abs_Diff_test_test1_<var> : |cMap - cMap2|
      <var>_perc1 / <var>_perc2 : weak percentile of the raw value within the
                                  unique cMap / cMap2 values of `frame`
    """
    scaled_up = frame['cMap_' + var]
    scaled_down = frame['cMap2_' + var]

    relation = np.zeros(frame.shape[0])
    relation[(scaled_up == scaled_down).values] = 1   ### equal
    relation[(scaled_up > scaled_down).values] = 2    ### Greater than
    relation[(scaled_up < scaled_down).values] = 3    ### less than
    frame['Test_eq_test1_' + var] = relation
    frame['Abs_Diff_test_test1_' + var] = abs(scaled_up - scaled_down)

    raw_values = frame[var].values
    frame[var + '_perc1'] = _weak_percentile(np.unique(scaled_up), raw_values)
    frame[var + '_perc2'] = _weak_percentile(np.unique(scaled_down), raw_values)


for i in tqdm(range(200)):
    var = 'var_' + str(i)
    if i % 25 == 0:
        print(i)

    # Same feature recipe for both frames; each uses its own unique values.
    for frame in (train, test):
        _add_comparison_features(frame, var)
    gc.collect()

In [None]:
# Let wide frames render up to 100 columns, then preview the first rows of train.
pd.options.display.max_columns = 100
train.head()

In [None]:
# `drop` (extra columns to exclude from the model) is never assigned anywhere
# in this notebook, so on a fresh kernel this cell raised NameError. Reuse the
# list if an earlier cell defined it; otherwise exclude nothing extra.
drop = globals().get('drop', [])

# Keep the identifiers for the submission file, then build the model matrix
# without the excluded columns and without the identifier itself.
ID_code = test['ID_code']
X_test = test.drop(drop, axis=1).drop(['ID_code'], axis=1)
X_test.head()

In [None]:
# X_test and ID_code now hold everything needed from the test frame,
# so drop the original and reclaim its memory.
del test
gc.collect()

In [None]:
# `drop` (extra columns to exclude from the model) is never assigned anywhere
# in this notebook, so on a fresh kernel this cell raised NameError. Reuse the
# list if an earlier cell defined it; otherwise exclude nothing extra.
drop = globals().get('drop', [])

# Label vector and feature matrix (label and identifier removed).
y = train['target']
X = train.drop(drop, axis=1).drop(['target', 'ID_code'], axis=1)

del train
gc.collect()

# Only the last expression of a cell is displayed, so the shape goes last.
X.shape


In [None]:
# Model feature names, in column order, skipping identifier/label columns if present.
features = [
    column_name
    for column_name in X.columns
    if column_name not in ('ID_code', 'target')
]

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [None]:
# LightGBM training parameters passed to lgb.train().
# 'boost_from_average' used to be listed twice in this literal with the same
# value; it is now given once, which yields the identical dict.
lgb_param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.35,
    'boost_from_average': 'false',
    'boost': 'gbdt',
    'feature_fraction': 0.0193,
    'learning_rate': 0.0291,
    'max_depth': -1,
    'metric': 'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 12,
    'num_leaves': 8,    ### try 2; no interaction between variables makes sense here due to IID
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': 1,
}

In [None]:
n_splits = 7  # Number of K-fold Splits

# Materialise the (train_idx, valid_idx) pairs once so every later cell
# iterates over exactly the same stratified, shuffled folds.
fold_maker = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1111)
splits = [fold for fold in fold_maker.split(X, y)]

In [None]:
# Out-of-fold predictions for the train rows and fold-averaged predictions
# for the test rows.
oof = np.zeros(len(X))
predictions = np.zeros(len(X_test))

# The arrays are identical for every fold, so convert them once up front.
x = np.array(X)
y = np.array(y)

fold_importances = []
for i, (train_idx, valid_idx) in enumerate(splits):

    print(f'Fold {i + 1}')

    trn_data = lgb.Dataset(x[train_idx], label=y[train_idx])
    val_data = lgb.Dataset(x[valid_idx], label=y[valid_idx])

    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # are only accepted by older LightGBM releases; newer ones expect the
    # equivalent callbacks instead - confirm against the installed version.
    lgb_clf = lgb.train(lgb_param, trn_data, 100000, valid_sets=[trn_data, val_data], early_stopping_rounds=10000, verbose_eval=1000)

    # Score the held-out rows with the best iteration found for this fold.
    oof[valid_idx] = lgb_clf.predict(x[valid_idx], num_iteration=lgb_clf.best_iteration)

    fold_importance_df = pd.DataFrame({
        "feature": features,
        "importance": lgb_clf.feature_importance(),
        "fold": i + 1,
    })
    fold_importances.append(fold_importance_df)

    # Each fold contributes an equal share of the final test prediction.
    predictions += lgb_clf.predict(X_test, num_iteration=lgb_clf.best_iteration) / n_splits

# One frame holding the importances of every fold (rows: feature x fold).
feature_importance_df = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(roc_auc_score(y, oof)))


In [None]:
pd.set_option('display.max_rows', 600)

# Mean importance of each feature across all folds, most important first.
# A grouped Series has to be aggregated before it can be sorted, and the
# accumulated frame is used rather than the last fold's frame.
feature_importance_df.groupby('feature')['importance'].mean().sort_values(ascending=False)

In [None]:
# Pair each test identifier with its fold-averaged prediction and write the
# submission file without the index column.
submission = pd.DataFrame({'ID_code': ID_code, 'target': predictions})
submission.to_csv('submission_lgb_freq_log2_eqfeats_v1.csv', index=False)

In [None]:
# Preview the first rows of the submission (ID_code and target columns).
submission.head()

In [None]:
from IPython.display import FileLink
# Display a link to the CSV written by the submission cell.
FileLink('submission_lgb_freq_log2_eqfeats_v1.csv')

In [None]:
### End of analysis ###