In [1]:
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

# Lift pandas' display limits: show every column, every row and full cell text.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')


## 1. Loading Data

In [2]:
# Load the POS_CASH_balance table (path is relative to the working directory)
# and preview its first rows.
pos = pd.read_csv('../dataset/POS_CASH_balance.csv')
pos.head()

Unnamed: 0,SK_ID_PREV,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF,SK_ID_CURR
0,1803195,-31,48.0,45.0,Active,0,0,185279
1,1803195,-17,48.0,31.0,Active,0,0,185279
2,1803195,-21,48.0,35.0,Active,0,0,185279
3,1803195,-8,48.0,21.0,Active,0,0,185279
4,1803195,-4,48.0,17.0,Active,0,0,185279


In [3]:
# Column names, dtypes and memory footprint of the loaded table.
pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8543375 entries, 0 to 8543374
Data columns (total 8 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   SK_ID_PREV             int64  
 1   MONTHS_BALANCE         int64  
 2   CNT_INSTALMENT         float64
 3   CNT_INSTALMENT_FUTURE  float64
 4   NAME_CONTRACT_STATUS   object 
 5   SK_DPD                 int64  
 6   SK_DPD_DEF             int64  
 7   SK_ID_CURR             int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 521.4+ MB


In [4]:
# List the column labels of `pos`.
pos.columns

Index(['SK_ID_PREV', 'MONTHS_BALANCE', 'CNT_INSTALMENT',
       'CNT_INSTALMENT_FUTURE', 'NAME_CONTRACT_STATUS', 'SK_DPD', 'SK_DPD_DEF',
       'SK_ID_CURR'],
      dtype='object')

In [5]:
# Load the application and previous-application tables.
# NOTE(review): neither frame is referenced in the cells shown in this part of
# the notebook -- confirm they are needed before paying the load cost.
app_train = pd.read_csv('../dataset/application_train.csv')
prev_app = pd.read_csv('../dataset/previous_application.csv')

## 2. Processing

### a. About Data

In [6]:
# Schema overview of `pos` (same call as in the loading section; `pos` has not
# been modified in between, so the output is identical).
pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8543375 entries, 0 to 8543374
Data columns (total 8 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   SK_ID_PREV             int64  
 1   MONTHS_BALANCE         int64  
 2   CNT_INSTALMENT         float64
 3   CNT_INSTALMENT_FUTURE  float64
 4   NAME_CONTRACT_STATUS   object 
 5   SK_DPD                 int64  
 6   SK_DPD_DEF             int64  
 7   SK_ID_CURR             int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 521.4+ MB


In [7]:
# Summary statistics of the numeric columns of `pos`.
pos.describe()

Unnamed: 0,SK_ID_PREV,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,SK_DPD,SK_DPD_DEF,SK_ID_CURR
count,8543375.0,8543375.0,8521512.0,8521497.0,8543375.0,8543375.0,8543375.0
mean,1903489.0,-34.94812,17.03348,10.46808,11.90473,0.5835209,153535.6
std,535938.7,26.05372,12.0043,11.14408,134.0302,30.4211,88716.61
min,1000001.0,-96.0,1.0,0.0,0.0,0.0,0.0
25%,1434475.0,-54.0,10.0,3.0,0.0,0.0,76622.0
50%,1896878.0,-28.0,12.0,7.0,0.0,0.0,153315.0
75%,2369096.0,-13.0,24.0,14.0,0.0,0.0,230293.0
max,2843499.0,-1.0,84.0,84.0,4231.0,3373.0,307510.0


### b. Check values

In [8]:
# Split the columns of `pos` into categorical (object dtype) and numerical ones;
# the two identifier columns belong to neither group.
categorical_col = pos.select_dtypes('object').columns.to_list()
numerical_col = [
    name
    for name in pos.columns
    if name not in {*categorical_col, 'SK_ID_CURR', 'SK_ID_PREV'}
]

In [9]:
# Print the frequency of every distinct value for each categorical column,
# followed by a blank line as separator.
for col in categorical_col:
    print(pos[col].value_counts(), '\n')

NAME_CONTRACT_STATUS
Active                   7818577
Completed                 634872
Signed                     74625
Demand                      6110
Returned to the store       4591
Approved                    4221
Amortized debt               365
Canceled                      12
XNA                            2
Name: count, dtype: int64 



#### Draft: feature engineering and aggregation on a copy of `pos`

In [10]:
# Work on a copy: the following cells of this section modify `data`, not `pos`.
data = pos.copy()

In [11]:
# Row-level feature engineering on `data`.

# Replace MONTHS_BALANCE by its absolute value (the raw values are negative).
data['MONTHS_BALANCE'] = data['MONTHS_BALANCE'].abs()
# Descending order on both keys: within a loan the largest absolute
# MONTHS_BALANCE (the oldest record) comes first and the latest record last,
# which is the order the exponential average and the 'last' aggregations rely on.
data = data.sort_values(['SK_ID_PREV', 'MONTHS_BALANCE'], ascending=[False, False])

# Exponentially weighted mean (alpha = 0.6) of the instalment counters, per loan.
col_ema = ['CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE']
exp_columns = [f'EXP_{name}' for name in col_ema]
data[exp_columns] = (
    data.groupby('SK_ID_PREV')[col_ema]
    .transform(lambda series: series.ewm(alpha = 0.6).mean())
)

# Derived features.
# The small constant keeps the ratio finite when SK_DPD_DEF is 0.
data['SK_DPD_RATIO'] = data['SK_DPD'].div(data['SK_DPD_DEF'] + 0.00001)
data['TOTAL_TERM'] = data['CNT_INSTALMENT'].add(data['CNT_INSTALMENT_FUTURE'])
data['EXP_POS_TOTAL_TERM'] = data['EXP_CNT_INSTALMENT'].add(data['EXP_CNT_INSTALMENT_FUTURE'])


In [12]:
# Aggregate the engineered rows of `data` first per loan (SK_ID_PREV) and then
# per client (SK_ID_CURR).
#
# The cell can be re-run safely: it no longer stacks a second set of CONTRACT_*
# dummy columns onto `data` and it does not add/drop a temporary column.


def _aggregate_per_loan(frame, aggregations, suffix=''):
    """Aggregate `frame` per SK_ID_PREV and flatten the column MultiIndex.

    Parameters
    ----------
    frame : pd.DataFrame
        Rows to aggregate; must contain SK_ID_PREV and every key of `aggregations`.
    aggregations : dict
        Mapping column name -> list of aggregation names, passed to
        `DataFrameGroupBy.agg`.
    suffix : str, default ''
        Text appended to every flattened column name.

    Returns
    -------
    pd.DataFrame
        Indexed by SK_ID_PREV; columns are named '<COLUMN>_<AGG>' in upper case
        followed by `suffix`.
    """
    aggregated = frame.groupby('SK_ID_PREV').agg(aggregations)
    aggregated.columns = ['_'.join(ele).upper() + suffix for ele in aggregated.columns]
    return aggregated


def _merge_per_loan(frames):
    """Outer-merge a non-empty list of per-loan frames, left to right, on SK_ID_PREV."""
    merged = frames[0]
    for frame in frames[1:]:
        merged = merged.merge(frame, on='SK_ID_PREV', how='outer')
    return merged


# aggregations applied to all rows of a loan
overall_aggregations = {
    'SK_ID_CURR': ['first'],
    'MONTHS_BALANCE': ['max'],
    'CNT_INSTALMENT': ['mean', 'max', 'min'],
    'CNT_INSTALMENT_FUTURE': ['mean', 'max', 'min'],
    'SK_DPD': ['max', 'sum'],
    'SK_DPD_DEF': ['max', 'sum'],
    'EXP_CNT_INSTALMENT': ['last'],
    'EXP_CNT_INSTALMENT_FUTURE': ['last'],
    'SK_DPD_RATIO': ['mean', 'max'],
    'TOTAL_TERM': ['mean', 'max', 'last'],
    'EXP_POS_TOTAL_TERM': ['mean']
}
# aggregations applied to each year bucket
aggregations_for_year = {
    'CNT_INSTALMENT': ['mean', 'max', 'min'],
    'CNT_INSTALMENT_FUTURE': ['mean', 'max', 'min'],
    'SK_DPD': ['max', 'sum'],
    'SK_DPD_DEF': ['max', 'sum'],
    'EXP_CNT_INSTALMENT': ['last'],
    'EXP_CNT_INSTALMENT_FUTURE': ['last'],
    'SK_DPD_RATIO': ['mean', 'max'],
    'TOTAL_TERM': ['mean', 'max'],
    'EXP_POS_TOTAL_TERM': ['last']
}
# the per-status aggregations are the same as the per-year ones (independent copy)
aggregations_for_categories = {
    column: list(functions) for column, functions in aggregations_for_year.items()
}

# performing overall aggregations over SK_ID_PREV
data_aggregated_overall = _aggregate_per_loan(data, overall_aggregations).rename(
    columns={'SK_ID_CURR_FIRST': 'SK_ID_CURR'}
)

# yearwise aggregations: bucket 0 and 1 are the two most recent years, everything
# older goes into the REST bucket. The bucket is kept as a separate Series so
# `data` itself is not modified.
year_balance = data['MONTHS_BALANCE'] // 12
yearly_groups = [
    _aggregate_per_loan(data[year_balance == year], aggregations_for_year, '_YEAR_' + str(year))
    for year in range(2)
]
data_aggregated_rest_years = _aggregate_per_loan(
    data[year_balance >= 2], aggregations_for_year, '_YEAR_REST'
)
# merging all the years aggregations
data_aggregated_year = _merge_per_loan(yearly_groups + [data_aggregated_rest_years])
del year_balance  # row-length helper, no longer needed

# aggregating over SK_ID_PREV for each of NAME_CONTRACT_STATUS categories
contract_type_categories = ['Active', 'Completed']
contract_groups = [
    _aggregate_per_loan(
        data[data['NAME_CONTRACT_STATUS'] == contract_type],
        aggregations_for_categories,
        '_' + contract_type.upper(),
    )
    for contract_type in contract_type_categories
]
# every status that is not listed in contract_type_categories
data_aggregated_rest_contract = _aggregate_per_loan(
    data[~data['NAME_CONTRACT_STATUS'].isin(contract_type_categories)],
    aggregations_for_categories,
    '_REST',
)
# merging the categorical aggregations
data_aggregated_contract = _merge_per_loan(contract_groups + [data_aggregated_rest_contract])

# one-hot encoding the categorical feature NAME_CONTRACT_STATUS
name_contract_dummies = pd.get_dummies(data['NAME_CONTRACT_STATUS'], prefix='CONTRACT')
contract_names = name_contract_dummies.columns.tolist()
# concatenating one-hot encoded categories with main table; dummy columns left
# over from an earlier run of this cell are dropped first so they are never
# duplicated
data = pd.concat(
    [data.drop(columns=contract_names, errors='ignore'), name_contract_dummies],
    axis=1,
)
# share of each status among the rows of a loan
aggregated_cc_contract = data[['SK_ID_PREV'] + contract_names].groupby('SK_ID_PREV').mean()

# merging all the per-loan aggregations
data_aggregated = _merge_per_loan([
    data_aggregated_overall,
    data_aggregated_year,
    data_aggregated_contract,
    aggregated_cc_contract,
])

# aggregating over SK_ID_CURR: every column except the grouping key itself
# (selected by label rather than by position)
columns_to_aggregate = data_aggregated.columns.drop('SK_ID_CURR')
# per-loan means get mean/sum/max across a client's loans, everything else only the mean
aggregations_final = {
    col: ['mean', 'sum', 'max'] if 'MEAN' in col else ['mean']
    for col in columns_to_aggregate
}

data_aggregated_final = data_aggregated.groupby('SK_ID_CURR').agg(aggregations_final)
data_aggregated_final.columns = ['_'.join(ele).upper() for ele in data_aggregated_final.columns]


In [None]:
#row-level feature engineering, applied to `pos` itself (the frame is overwritten)

#absolute value of MONTHS_BALANCE (the raw values are negative)
pos['MONTHS_BALANCE'] = pos['MONTHS_BALANCE'].abs()
#descending on both keys: within a loan the largest absolute MONTHS_BALANCE
#(oldest record) comes first and the latest record last, for rolling computations
pos = pos.sort_values(['SK_ID_PREV', 'MONTHS_BALANCE'], ascending=[False, False])

#exponentially weighted mean (alpha = 0.6) of the instalment counters, per loan
col_ema = ['CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE']
exp_columns = [f'EXP_{name}' for name in col_ema]
pos[exp_columns] = (
    pos.groupby('SK_ID_PREV')[col_ema]
    .transform(lambda series: series.ewm(alpha = 0.6).mean())
)

#derived features; the small constant keeps the ratio finite when SK_DPD_DEF is 0
pos['SK_DPD_RATIO'] = pos['SK_DPD'].div(pos['SK_DPD_DEF'] + 0.00001)
pos['TOTAL_TERM'] = pos['CNT_INSTALMENT'].add(pos['CNT_INSTALMENT_FUTURE'])
pos['EXP_POS_TOTAL_TERM'] = pos['EXP_CNT_INSTALMENT'].add(pos['EXP_CNT_INSTALMENT_FUTURE'])

In [None]:
#per-loan (SK_ID_PREV) aggregation of the engineered `pos` rows
#the cell is safe to re-run: CONTRACT_* dummy columns are never duplicated on
#`pos` and no temporary column is added to / dropped from it

#aggregations applied to all rows of a loan
overall_aggregations = {
    'SK_ID_CURR' : ['first'],
    'MONTHS_BALANCE' : ['max'],
    'CNT_INSTALMENT' : ['mean', 'max','min'],
    'CNT_INSTALMENT_FUTURE' : ['mean','max','min'],
    'SK_DPD' : ['max','sum'],
    'SK_DPD_DEF' : ['max','sum'],
    'EXP_CNT_INSTALMENT' : ['last'],
    'EXP_CNT_INSTALMENT_FUTURE' : ['last'],
    'SK_DPD_RATIO' : ['mean','max'],
    'TOTAL_TERM' : ['mean','max','last'],
    'EXP_POS_TOTAL_TERM' : ['mean']
}
#aggregations applied to each year bucket
aggregations_for_year = {
    'CNT_INSTALMENT' : ['mean', 'max','min'],
    'CNT_INSTALMENT_FUTURE' : ['mean','max','min'],
    'SK_DPD' : ['max','sum'],
    'SK_DPD_DEF' : ['max','sum'],
    'EXP_CNT_INSTALMENT' : ['last'],
    'EXP_CNT_INSTALMENT_FUTURE' : ['last'],
    'SK_DPD_RATIO' : ['mean','max'],
    'TOTAL_TERM' : ['mean','max'],
    'EXP_POS_TOTAL_TERM' : ['last']
}
#the per-status aggregations are the same as the per-year ones (independent copy)
aggregations_for_categories = {
    column: list(functions) for column, functions in aggregations_for_year.items()
}

#performing overall aggregations over SK_ID_PREV
pos_cash_aggregated_overall = pos.groupby('SK_ID_PREV').agg(overall_aggregations)
pos_cash_aggregated_overall.columns = ['_'.join(x).upper() for x in pos_cash_aggregated_overall.columns]
pos_cash_aggregated_overall = pos_cash_aggregated_overall.rename(columns = {'SK_ID_CURR_FIRST': 'SK_ID_CURR'})

#yearwise aggregations; the year bucket lives in its own Series so `pos` is not modified
year_balance = pos['MONTHS_BALANCE'] // 12
#aggregating over SK_ID_PREV for each last 2 years
pos_cash_aggregated_year = None
for year in range(2):
    group = pos[year_balance == year].groupby('SK_ID_PREV').agg(aggregations_for_year)
    group.columns = ['_'.join(ele).upper() + '_YEAR_' + str(year) for ele in group.columns]
    if pos_cash_aggregated_year is None:
        pos_cash_aggregated_year = group
    else:
        pos_cash_aggregated_year = pos_cash_aggregated_year.merge(group, on = 'SK_ID_PREV', how = 'outer')

#aggregating over SK_ID_PREV for rest of the years
pos_cash_aggregated_rest_years = pos[year_balance >= 2].groupby('SK_ID_PREV').agg(aggregations_for_year)
pos_cash_aggregated_rest_years.columns = ['_'.join(ele).upper() + '_YEAR_REST' for ele in pos_cash_aggregated_rest_years.columns]
#merging all the years aggregations
pos_cash_aggregated_year = pos_cash_aggregated_year.merge(pos_cash_aggregated_rest_years, on = 'SK_ID_PREV', how = 'outer')
del year_balance  #row-length helper, no longer needed

#aggregating over SK_ID_PREV for each of NAME_CONTRACT_STATUS categories
contract_type_categories = ['Active', 'Completed']
pos_cash_aggregated_contract = None
for contract_type in contract_type_categories:
    group = pos[pos['NAME_CONTRACT_STATUS'] == contract_type].groupby('SK_ID_PREV').agg(aggregations_for_categories)
    group.columns = ['_'.join(ele).upper() + '_' + contract_type.upper() for ele in group.columns]
    if pos_cash_aggregated_contract is None:
        pos_cash_aggregated_contract = group
    else:
        pos_cash_aggregated_contract = pos_cash_aggregated_contract.merge(group, on = 'SK_ID_PREV', how = 'outer')

#every status that is not listed in contract_type_categories
pos_cash_aggregated_rest_contract = pos[~pos['NAME_CONTRACT_STATUS'].isin(contract_type_categories)].groupby('SK_ID_PREV').agg(aggregations_for_categories)
pos_cash_aggregated_rest_contract.columns = ['_'.join(ele).upper() + '_REST' for ele in pos_cash_aggregated_rest_contract.columns]
#merging the categorical aggregations
pos_cash_aggregated_contract = pos_cash_aggregated_contract.merge(pos_cash_aggregated_rest_contract, on = 'SK_ID_PREV', how = 'outer')

#merging all the aggregations
pos_cash_aggregated = pos_cash_aggregated_overall.merge(pos_cash_aggregated_year, on = 'SK_ID_PREV', how = 'outer')
pos_cash_aggregated = pos_cash_aggregated.merge(pos_cash_aggregated_contract, on = 'SK_ID_PREV', how = 'outer')

#onehot encoding the categorical feature NAME_CONTRACT_STATUS
name_contract_dummies = pd.get_dummies(pos['NAME_CONTRACT_STATUS'], prefix='CONTRACT')
contract_names = name_contract_dummies.columns.tolist()
#concatenating one-hot encoded categories with main table; dummy columns left over
#from an earlier run of this cell are dropped first so they are never duplicated
pos = pd.concat([pos.drop(columns=contract_names, errors='ignore'), name_contract_dummies], axis=1)
#aggregating these over SK_ID_PREV as well (share of each status among a loan's rows)
aggregated_cc_contract = pos[['SK_ID_PREV'] + contract_names].groupby('SK_ID_PREV').mean()

#merging with the final aggregations
pos_cash_aggregated = pos_cash_aggregated.merge(aggregated_cc_contract, on = 'SK_ID_PREV', how = 'outer')
        


In [None]:
#aggregating over SK_ID_CURR: one row per client, built from the per-loan table
#every column except the grouping key is aggregated; the key is removed by label
#so the selection does not depend on SK_ID_CURR being the first column
columns_to_aggregate = pos_cash_aggregated.columns.drop('SK_ID_CURR')
#defining the aggregations to perform: per-loan means get mean/sum/max across a
#client's loans, every other column only the mean
aggregations_final = {
    col: ['mean','sum','max'] if 'MEAN' in col else ['mean']
    for col in columns_to_aggregate
}
pos_cash_aggregated_final = pos_cash_aggregated.groupby('SK_ID_CURR').agg(aggregations_final)
#flatten the (column, aggregation) MultiIndex into '<COLUMN>_<AGG>' upper-case names
pos_cash_aggregated_final.columns = ['_'.join(ele).upper() for ele in pos_cash_aggregated_final.columns]
