In [0]:
# Mount Google Drive inside the Colab runtime at /gdrive; BASE_PATH in the
# next cell points into this mount.
from google.colab import drive
drive.mount('/gdrive')

In [0]:
# Folder on the mounted drive that contains the dataset zip archive.
# Keep the trailing slash: file paths are built by plain string concatenation.
BASE_PATH = "/gdrive/My Drive/Colab Notebooks/Edelwise/"

In [0]:
import zipfile

# Unpack the dataset archive from Drive into the local 'Edelwise/' directory.
# The context manager closes the archive handle even if extraction raises,
# which the previous open()/close() pair did not guarantee.
print("Uncompressing zip file...")
with zipfile.ZipFile(BASE_PATH+"Foreclosure-Prediction-Dataset_.zip", 'r') as zip_ref:
    zip_ref.extractall('Edelwise/')
print("Finished")

In [0]:
import lightgbm as lgbm
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [0]:
# Directory (under the local 'Edelwise/' extraction target) that holds the
# CSV/XLSX inputs read below. Keep the trailing slash: paths are concatenated.
DATA_PATH = "Edelwise/93fe009c-2-Foreclosure-Prediction-Dataset_/"

In [0]:
def _data_file(filename):
    """Return the path of ``filename`` inside the extracted dataset directory."""
    return DATA_PATH + filename


print("Reading data...")
# CSV inputs: the labelled training rows and the rows to be scored.
train = pd.read_csv(_data_file("train_foreclosure.csv"))
test = pd.read_csv(_data_file("test_foreclosure.csv"))
# Excel workbooks shipped alongside the CSVs.
transactions = pd.read_excel(_data_file("LMS_31JAN2019.xlsx"))
emails = pd.read_excel(_data_file("RF_Final_Data.xlsx"))
customers = pd.read_excel(_data_file("Customers_31JAN2019.xlsx"))
print("Finished...")

Reading data...
Finished...


1.  BALANCE_TENURE --> 182.0
2.  CURRENT_TENOR --> 203.0



In [0]:
# Constant-fill imputers for the two tenure columns. The fill values are the
# ones listed in the markdown note preceding this cell
# (NOTE(review): their derivation is not shown in the notebook — confirm).
bt_imputer = SimpleImputer(strategy='constant', fill_value=182.)
ct_imputer = SimpleImputer(strategy='constant', fill_value=203.)

for column, imputer in (('BALANCE_TENURE', bt_imputer),
                        ('CURRENT_TENOR', ct_imputer)):
    # SimpleImputer works on 2-D input and returns an (n, 1) array; flatten it
    # so a plain 1-D column is written back, and assign by label rather than
    # by attribute so the target is unambiguously a DataFrame column.
    filled = imputer.fit_transform(transactions[column].values.reshape(-1, 1))
    transactions[column] = filled.ravel()

In [0]:
# Aggregation spec: transaction column -> reducers applied within each agreement.
agg_func = {
    'AGREEMENTID': ['count'],
    'LOAN_AMT': ['max'],
    'NET_DISBURSED_AMT': ['max'],
    'CURRENT_ROI': ['mean', 'min', 'max'],
    'ORIGNAL_ROI': ['mean'],
    'CURRENT_TENOR': ['min', 'max', 'mean'],
    'ORIGNAL_TENOR': ['max'],
    'PRE_EMI_DUEAMT': ['mean'],
    'PRE_EMI_RECEIVED_AMT': ['mean'],
    'EMI_DUEAMT': ['mean'],
    'EMI_RECEIVED_AMT': ['mean'],
    'EMI_OS_AMOUNT': ['sum'],
    'EXCESS_AVAILABLE': ['sum'],
    'EXCESS_ADJUSTED_AMT': ['sum'],
    'OUTSTANDING_PRINCIPAL': ['sum'],
    'PAID_PRINCIPAL': ['sum'],
    'PAID_INTEREST': ['sum'],
    'MONTHOPENING': ['min'],
    'NET_LTV': ['mean'],
    'COMPLETED_TENURE': ['max'],
    'FOIR': ['mean'],
    'BALANCE_TENURE': ['min'],
    'MOB': ['max', 'mean'],
}

# One row per AGREEMENTID, with a (column, reducer) MultiIndex on the columns.
agg_transactions = transactions.groupby('AGREEMENTID').agg(agg_func)
# Flatten the two-level column labels into single names such as 'CURRENT_ROI_mean'.
agg_transactions.columns = [
    '_'.join(level_pair).strip()
    for level_pair in agg_transactions.columns.values
]
# Turn the AGREEMENTID index back into an ordinary column (needed for merging).
agg_transactions = agg_transactions.reset_index()

1.   LOAN_AMT - NET_DISBURSED_AMT (This is not a good feature)
2.   ORIGNAL_TENOR - CURRENT_TENOR
3.   OUTSTANDING_PRINCIPAL - PAID_PRINCIPAL
4.   COMPLETED_TENURE - BALANCE_TENURE
5.   EMI_DUEAMT - EMI_RECEIVED_AMT
6.   EXCESS_AVAILABLE - EXCESS_ADJUSTED_AMT
7.   PRE_EMI_DUEAMT - PRE_EMI_RECEIVED_AMT (This is not a good feature)



In [0]:
# Difference features built from the aggregated columns:
# (new column name, column subtracted from, column subtracted).
diff_features = [
    ('OT-CT', 'ORIGNAL_TENOR_max', 'CURRENT_TENOR_mean'),
    ('OP-PP', 'OUTSTANDING_PRINCIPAL_sum', 'PAID_PRINCIPAL_sum'),
    ('CT-BT', 'COMPLETED_TENURE_max', 'BALANCE_TENURE_min'),
    ('EMID-EMIR', 'EMI_DUEAMT_mean', 'EMI_RECEIVED_AMT_mean'),
    ('EAV-EAD', 'EXCESS_AVAILABLE_sum', 'EXCESS_ADJUSTED_AMT_sum'),
]

# Columns are appended in list order, so the resulting column order is fixed.
for new_col, minuend_col, subtrahend_col in diff_features:
    agg_transactions[new_col] = agg_transactions[minuend_col] - agg_transactions[subtrahend_col]

In [0]:
ID_COL = 'AGREEMENTID'
TARGET_COL = 'FORECLOSURE'

# Attach the per-agreement aggregates to the labelled rows. agg_transactions
# has one row per agreement, so validate='many_to_one' makes the merge fail
# loudly instead of silently duplicating rows if that ever stops being true.
train_X = pd.merge(train, agg_transactions, on=ID_COL, how='left',
                   validate='many_to_one')

# Define the feature columns once, from the training frame, and reuse the same
# list for the test frame below so both matrices have identical columns in
# identical order (two independent drop() calls do not guarantee that).
feature_cols = train_X.columns.drop([ID_COL, TARGET_COL])

X = train_X[feature_cols].values
y = train_X[TARGET_COL].values

# Standardise features; the scaler is fitted on the training matrix only and
# then applied unchanged to the test matrix.
ss = StandardScaler()
X = ss.fit_transform(X)

# 90/10 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.1,
                                                    random_state=2019)

# Same merge and column selection for the rows to be scored. Selecting by
# feature_cols also means the test frame does not need a FORECLOSURE column.
test_pred = pd.merge(test, agg_transactions, on=ID_COL, how='left',
                     validate='many_to_one')
test_X = ss.transform(test_pred[feature_cols].values)

In [0]:
# LightGBM parameters: binary objective, AUC metric, DART boosting.
params = {'max_depth':-1,
         'max_bin':50,
         'min_data_in_leaf':15,
         'min_sum_hessian_in_leaf':1e-2,
         'feature_fraction':0.8,
         'bagging_fraction':0.8,
         'bagging_freq':10,
         'lambda_l1':0.03,
         'lambda_l2':0.03,
         'min_gain_to_split':0.5,
         'boosting_type':'dart',
         'metric':'auc',
         'objective':'binary',
         'learning_rate':0.007,
         'num_leaves':80}

n_splits = 10
num_round = 500  # boosting rounds per fold; identical for every fold, so set once
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2019)

# Size the buffers from the arrays that are actually indexed/predicted below,
# so they cannot drift from the source frames' lengths.
oof = np.zeros(len(y))                   # out-of-fold predictions for the training rows
predictions = np.zeros(test_X.shape[0])  # fold-averaged predictions for the test rows

for fold_, (trn_idx, val_idx) in enumerate(skf.split(X, y)):

    print("Fold Number --> {}".format(fold_+1))

    trn_data = lgbm.Dataset(X[trn_idx], label=y[trn_idx])
    # A validation Dataset should name the training Dataset as its reference.
    val_data = lgbm.Dataset(X[val_idx], label=y[val_idx], reference=trn_data)

    clf = lgbm.train(params, trn_data, num_round, valid_sets=[trn_data, val_data],
                     verbose_eval=100)
    # NOTE(review): no early stopping is configured, so clf.best_iteration
    # presumably does not restrict the rounds used here — confirm for the
    # installed LightGBM version.
    oof[val_idx] = clf.predict(X[val_idx], num_iteration=clf.best_iteration)

    lgbm.plot_importance(clf)
    plt.show()

    # Each fold contributes an equal share to the final test prediction.
    predictions += clf.predict(test_X, num_iteration=clf.best_iteration)/n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(y, oof)))

Fold Number --> 1
[100]	training's auc: 0.953943	valid_1's auc: 0.930177
[200]	training's auc: 0.959883	valid_1's auc: 0.936262
[300]	training's auc: 0.963502	valid_1's auc: 0.941133


In [0]:
# Assign by column label: attribute-style assignment only updates a column
# that already exists, otherwise it sets a plain Python attribute and the
# predictions would be missing from the written CSV.
test['FORECLOSURE'] = predictions

In [0]:
# Submission counter; it becomes part of the output file name.
attempt = 7
submission_name = "submission{}.csv".format(attempt)
# Write the test frame (including the predicted FORECLOSURE column) without the index.
test.to_csv(submission_name, index=False)