In [0]:
from google.colab import drive
drive.mount('/gdrive')

In [0]:
BASE_PATH = "/gdrive/My Drive/Colab Notebooks/LTFS/"

In [0]:
import zipfile

print("Uncompressing the zip file...")
zip_ref = zipfile.ZipFile(BASE_PATH+"LTFS.zip", "r")
zip_ref.extractall("LTFS/")
zip_ref.close()
print("Finished...")

In [0]:
DATA_PATH = "LTFS/LTFS/"

In [0]:
import pandas as pd
import numpy as np

In [0]:
train = pd.read_csv(DATA_PATH+"train.csv")
test = pd.read_csv(DATA_PATH+'test.csv')
sample_submission = pd.read_csv(DATA_PATH+"sample_submission.csv")

In [4]:
train.iloc[0:5, 0:10]

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type
0,420825,50578,58400,89.55,67,22807,45,1441,01-01-84,Salaried
1,537409,47145,65550,73.23,67,22807,45,1502,31-07-85,Self employed
2,417566,53278,61360,89.63,67,22807,45,1497,24-08-85,Self employed
3,624493,57513,66113,88.48,67,22807,45,1501,30-12-93,Self employed
4,539055,52378,60300,88.39,67,22807,45,1495,09-12-77,Self employed


In [0]:
def impute_dummy_ET(row):
  
  if row == "Salaried":
    return 0
  elif row == "Self employed":
    return 1
  else:
    numb = np.random.rand()
    if numb > 0.5:
      return 0
    else:
      return 1
    
def calculate_age(row):
  
  tail = row[-2:]
  head = '19'
  
  years = int(head+tail)
  
  age = 2019-years
  
  return age

def convert_to_days(row):
  
  row_stripped = row.split(" ")
  one = row_stripped[0]
  two = row_stripped[1]
  
  if one[1] == 'y':
    years = int(one[0])
  else:
    years = int(one[0:2])
    
  if two[1] == 'm':
    months = int(two[0])
  else:
    months = int(two[0:2])
  
  days = (years*365)+(months*30)
  
  return days

In [0]:
train['AAA_Days'] = train['AVERAGE.ACCT.AGE'].apply(convert_to_days)
train['CHL_Days'] = train['CREDIT.HISTORY.LENGTH'].apply(convert_to_days)
train['DIS_AMT-AST_COST'] = train['disbursed_amount']-train['asset_cost']
train['AGE'] = train['Date.of.Birth'].apply(calculate_age)
train['ET'] = train['Employment.Type'].apply(impute_dummy_ET)

In [9]:
train.columns

Index(['UniqueID', 'disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID', 'Date.of.Birth',
       'Employment.Type', 'DisbursalDate', 'State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PERFORM_CNS.SCORE.DESCRIPTION', 'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS',
       'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS',
       'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT',
       'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES',
       'loan_default', 'AAA_Days', 'CHL_Days', 'DIS_AMT-AST_COST', 'AGE',
       'ET'],
      dtype='object')

In [0]:
X = train.drop(['UniqueID', 'branch_id', 'supplier_id', 'manufacturer_id',
                'Current_pincode_ID', 'Date.of.Birth', 'Employment.Type',
                'DisbursalDate', 'State_ID', 'Employee_code_ID',
                'PERFORM_CNS.SCORE.DESCRIPTION', 'AVERAGE.ACCT.AGE',
                'CREDIT.HISTORY.LENGTH', 'loan_default'], axis=1).values
y = train['loan_default'].values

In [0]:
test['AAA_Days'] = test['AVERAGE.ACCT.AGE'].apply(convert_to_days)
test['CHL_Days'] = test['CREDIT.HISTORY.LENGTH'].apply(convert_to_days)
test['DIS_AMT-AST_COST'] = test['disbursed_amount']-test['asset_cost']
test['AGE'] = test['Date.of.Birth'].apply(calculate_age)
test['ET'] = test['Employment.Type'].apply(impute_dummy_ET)

In [0]:
X_test = test.drop(['UniqueID', 'branch_id', 'supplier_id', 'manufacturer_id',
                'Current_pincode_ID', 'Date.of.Birth', 'Employment.Type',
                'DisbursalDate', 'State_ID', 'Employee_code_ID',
                'PERFORM_CNS.SCORE.DESCRIPTION', 'AVERAGE.ACCT.AGE',
                'CREDIT.HISTORY.LENGTH'], axis=1).values

In [0]:
from sklearn.metrics import roc_auc_score
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold

In [19]:
params = {'boosting_type':'gbdt',
          'metric':'auc',
          'objective':'binary',
          'max_bin':115,
          'num_leaves':7}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2019)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
  
    print("Fold Number --> {}".format(fold_+1))
    
    trn_data = lgbm.Dataset(X[trn_idx], y[trn_idx])
    val_data = lgbm.Dataset(X[val_idx], y[val_idx])

    num_round = 1000
    clf = lgbm.train(params, trn_data, num_round, valid_sets=[trn_data, val_data],
                     verbose_eval=100)
    oof[val_idx] = clf.predict(X[val_idx], num_iteration=clf.best_iteration)
    
    predictions += clf.predict(X_test, num_iteration=clf.best_iteration)/n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(y, oof)))

Fold Number --> 1
[100]	training's auc: 0.64521	valid_1's auc: 0.636211
[200]	training's auc: 0.652045	valid_1's auc: 0.637538
[300]	training's auc: 0.656925	valid_1's auc: 0.637709
[400]	training's auc: 0.660954	valid_1's auc: 0.638255
[500]	training's auc: 0.664747	valid_1's auc: 0.638321
[600]	training's auc: 0.66842	valid_1's auc: 0.638164
[700]	training's auc: 0.671653	valid_1's auc: 0.638184
[800]	training's auc: 0.674985	valid_1's auc: 0.638269
[900]	training's auc: 0.677838	valid_1's auc: 0.638186
[1000]	training's auc: 0.680641	valid_1's auc: 0.638126
Fold Number --> 2
[100]	training's auc: 0.644693	valid_1's auc: 0.636117
[200]	training's auc: 0.651595	valid_1's auc: 0.637833
[300]	training's auc: 0.656981	valid_1's auc: 0.638447
[400]	training's auc: 0.661173	valid_1's auc: 0.638328
[500]	training's auc: 0.665283	valid_1's auc: 0.638672
[600]	training's auc: 0.668842	valid_1's auc: 0.63857
[700]	training's auc: 0.672067	valid_1's auc: 0.638682
[800]	training's auc: 0.675187	

In [0]:
sample_submission.loan_default = predictions
attempt = 3
sample_submission.to_csv("Submission_{}.csv".format(attempt), index=False)