In [1]:
import pandas as pd
import numpy as np 
from datetime import date
from dateutil.relativedelta import relativedelta
from pandas.tseries.offsets import MonthEnd
from functools import reduce

import warnings
warnings.filterwarnings("ignore")

### import MoM data

In [5]:
df_mom = pd.read_csv('D:/Varthana/MOM_DATA_28-Sep-20.csv')

##### Convert data variable to date format

In [7]:
df_mom['DT_BUSINESSDATE'] = pd.to_datetime(df_mom['DT_BUSINESSDATE'])
df_mom['DT_INSTALLMENTDUE'] = pd.to_datetime(df_mom['DT_INSTALLMENTDUE'])
df_mom['DT_LAST_PAYMENT'] = pd.to_datetime(df_mom['DT_LAST_PAYMENT'])
df_mom['DT_CREATED'] = pd.to_datetime(df_mom['DT_CREATED'])
df_mom['DT_LAST_UPDATE'] = pd.to_datetime(df_mom['DT_LAST_UPDATE'])


### Adding disb date to MoM data

In [9]:
LMS_det_all = pd.read_csv('D:/Varthana/LMS_DETAILS_11-Sep-20.csv')

In [12]:
LMS_det_all_1 = pd.DataFrame(LMS_det_all[['LOAN_ID','FIRST_DISB_DATE']])

In [13]:
LMS_det_all_1['FIRST_DISB_DATE'] = pd.to_datetime(LMS_det_all_1['FIRST_DISB_DATE'])

In [14]:
df_mom_2  = pd.merge(df_mom, LMS_det_all_1,how="left",left_on="SZ_LOAN_ACCOUNT_NO", right_on="LOAN_ID")

In [15]:
## disb date not available
list(df_mom_2[df_mom_2['FIRST_DISB_DATE'].isnull()]['SZ_LOAN_ACCOUNT_NO'].unique())

['PUR7108U',
 'S18BLR-BLR-002142',
 'S19NAS-NAS-007192',
 'S20NAG-GON-013978',
 'U18HYD-HYD-000835']

#### Filter and retain only [0-30] DPD accounts for snapshots

In [16]:
# df_mom_0_30 = df_mom_2[(df_mom_2['I_DPD'] >=0) & (df_mom_2['I_DPD'] <=29) ]

In [17]:
df_mom_0_30 = df_mom_2

#### Retain only Dec'19, Sep'19, Jun'19, Mar'19 Snapshots for analysis

In [92]:
bus_dates = ['2019-07-31']
    #'2020-04-30', '2020-01-31','2019-07-31','2019-09-30','2019-10-31','2019-12-31']
# df_mom_0_30['DT_BUSINESSDATE'].head()

In [93]:
df_mom_4_snaps = df_mom_0_30[df_mom_0_30['DT_BUSINESSDATE'].isin(bus_dates)]

In [95]:
df_mom_4_snaps['DT_BUSINESSDATE'].value_counts()

2019-07-31    4609
Name: DT_BUSINESSDATE, dtype: int64

In [24]:
df_mom_4_snaps["I_DPD"].max()

715

### MoM data for perf - post Earliest Snapshot

In [25]:
df_perf = df_mom_2[df_mom_2['DT_BUSINESSDATE'] >=min(bus_dates)]

### N-Mob perf creation WIP

In [26]:
accts_dat = pd.DataFrame(df_mom_4_snaps[['SZ_LOAN_ACCOUNT_NO','DT_BUSINESSDATE']])
accts_dat.reset_index(inplace=True,drop=True)

In [27]:
accts_dat_1  = accts_dat.rename(columns = {"DT_BUSINESSDATE": "Snapshot_dt"})

In [29]:
accts_dat_1['SZ_LOAN_ACCOUNT_NO'].nunique()

5891

In [30]:
accts_dat_1['Snapshot_dt'].value_counts()

2020-04-30    5201
2020-01-31    5082
2019-12-31    5053
2019-10-31    4994
2019-09-30    4894
2019-07-31    4609
Name: Snapshot_dt, dtype: int64

In [31]:
accts_dat_1['end_date'] = pd.to_datetime(accts_dat_1['Snapshot_dt'], format="%Y%m") + MonthEnd(3)

###### perf 3Months merge

In [33]:
perf_merge = pd.merge(accts_dat_1,df_mom_2, on='SZ_LOAN_ACCOUNT_NO' )

In [34]:
perf_merge_2 = perf_merge[(perf_merge['DT_BUSINESSDATE'] <= perf_merge['end_date']) & 
                         (perf_merge['DT_BUSINESSDATE'] > perf_merge['Snapshot_dt'])]

In [35]:
difflist = np.setdiff1d(perf_merge['SZ_LOAN_ACCOUNT_NO'].unique(),
                        perf_merge_2['SZ_LOAN_ACCOUNT_NO'].unique())
# difflist

In [37]:
perf_merge_3 = perf_merge_2[['SZ_LOAN_ACCOUNT_NO',
                             'Snapshot_dt',
                             'DT_BUSINESSDATE',
                            'I_DPD']]

In [38]:
perf_merge_3['bad_30p'] = np.where(perf_merge_3['I_DPD'] >= 30,1,0)

In [39]:
perf_merge_4 = perf_merge_3.groupby(['SZ_LOAN_ACCOUNT_NO','Snapshot_dt']).agg(ever_30p = ("bad_30p","max"))


In [40]:
perf_merge_5 = perf_merge_4.reset_index()
perf_merge_5.head()

Unnamed: 0,SZ_LOAN_ACCOUNT_NO,Snapshot_dt,ever_30p
0,AGR07934S,2019-07-31,0
1,AGR07934S,2019-09-30,0
2,AGR07934S,2019-10-31,0
3,AGR07934S,2019-12-31,0
4,AGR07934S,2020-01-31,0


In [41]:
perf_merge_5['Snapshot_dt'].value_counts()

2020-04-30    5179
2020-01-31    5002
2019-12-31    4979
2019-10-31    4906
2019-09-30    4800
2019-07-31    4526
Name: Snapshot_dt, dtype: int64

### function to get bad rates

In [43]:
accounts_data_cop = accts_dat_1.drop('end_date',axis=1)

In [45]:

def bad_rate(df,perf_m,bad_def):
    df['end_date'] = pd.to_datetime(df['Snapshot_dt'], format="%Y%m") + MonthEnd(perf_m)
    perf_merge = pd.merge(df,df_mom_2, on='SZ_LOAN_ACCOUNT_NO' )
    perf_merge_2 = perf_merge[(perf_merge['DT_BUSINESSDATE'] <= perf_merge['end_date']) & 
                             (perf_merge['DT_BUSINESSDATE'] > perf_merge['Snapshot_dt'])]
    
    perf_merge_3 = perf_merge_2[['SZ_LOAN_ACCOUNT_NO',
                                 'Snapshot_dt',
                                 'DT_BUSINESSDATE',
                                'I_DPD']]
    perf_merge_3['bad'] = np.where(perf_merge_3['I_DPD'] >= bad_def,1,0)
    perf_merge_4 = perf_merge_3.groupby(['SZ_LOAN_ACCOUNT_NO','Snapshot_dt']).agg(ever_bad = ("bad","max"))
    perf_merge_5 = perf_merge_4.reset_index()
    print(perf_m,"-",bad_def,"-",perf_merge_5.shape)
#     print(perf_merge_5.shape)
    
    return perf_merge_5
    

In [46]:
bad_data = bad_rate(accounts_data_cop,12, 30)

12 - 30 - (29392, 3)


In [47]:
#### bad 12 - 30+ 
# bad_data.to_csv('/home/siddarth/LMS_data/Bad_12_30p_v1.csv')

In [48]:
bad_data['Snapshot_dt'].value_counts()


2020-04-30    5179
2020-01-31    5002
2019-12-31    4979
2019-10-31    4906
2019-09-30    4800
2019-07-31    4526
Name: Snapshot_dt, dtype: int64

In [50]:
bad_data_3_30 = bad_rate(accounts_data_cop,3, 30)

3 - 30 - (29392, 3)


In [51]:
bad_data_6_30 = bad_rate(accounts_data_cop,6, 30)

6 - 30 - (29392, 3)


### Bad rates for bureau sync snapshots 

In [63]:
bad_data_3_30 = bad_rate(accounts_data_cop,3, 30)
bad_data_3_60 = bad_rate(accounts_data_cop,3, 60)
bad_data_3_90 = bad_rate(accounts_data_cop,3, 90)

bad_data_4_30 = bad_rate(accounts_data_cop,4, 30)
bad_data_4_60 = bad_rate(accounts_data_cop,4, 60)
bad_data_4_90 = bad_rate(accounts_data_cop,4, 90)

bad_data_5_30 = bad_rate(accounts_data_cop,5, 30)
bad_data_5_60 = bad_rate(accounts_data_cop,5, 60)
bad_data_5_90 = bad_rate(accounts_data_cop,5, 90)

bad_data_6_30 = bad_rate(accounts_data_cop,6, 30)
bad_data_6_60 = bad_rate(accounts_data_cop,6, 60)
bad_data_6_90 = bad_rate(accounts_data_cop,6, 90)

bad_data_7_30 = bad_rate(accounts_data_cop,7, 30)
bad_data_7_60 = bad_rate(accounts_data_cop,7, 60)
bad_data_7_90 = bad_rate(accounts_data_cop,7, 90)

bad_data_8_30 = bad_rate(accounts_data_cop,8, 30)
bad_data_8_60 = bad_rate(accounts_data_cop,8, 60)
bad_data_8_90 = bad_rate(accounts_data_cop,8, 90)

bad_data_9_30 = bad_rate(accounts_data_cop,9, 30)
bad_data_9_60 = bad_rate(accounts_data_cop,9, 60)
bad_data_9_90 = bad_rate(accounts_data_cop,9, 90)

3 - 30 - (29392, 3)
3 - 60 - (29392, 3)
3 - 90 - (29392, 3)
4 - 30 - (29392, 3)
4 - 60 - (29392, 3)
4 - 90 - (29392, 3)
5 - 30 - (29392, 3)
5 - 60 - (29392, 3)
5 - 90 - (29392, 3)
6 - 30 - (29392, 3)
6 - 60 - (29392, 3)
6 - 90 - (29392, 3)
7 - 30 - (29392, 3)
7 - 60 - (29392, 3)
7 - 90 - (29392, 3)
8 - 30 - (29392, 3)
8 - 60 - (29392, 3)
8 - 90 - (29392, 3)
9 - 30 - (29392, 3)
9 - 60 - (29392, 3)
9 - 90 - (29392, 3)


In [64]:
bad_data_3_30_v2 = bad_data_3_30.rename(columns = {'ever_bad':'ever_bad_3_30'})
bad_data_3_60_v2 = bad_data_3_60.rename(columns = {'ever_bad':'ever_bad_3_60'})
bad_data_3_90_v2 = bad_data_3_90.rename(columns = {'ever_bad':'ever_bad_3_90'})

bad_data_4_30_v2 = bad_data_4_30.rename(columns = {'ever_bad':'ever_bad_4_30'})
bad_data_4_60_v2 = bad_data_4_60.rename(columns = {'ever_bad':'ever_bad_4_60'})
bad_data_4_90_v2 = bad_data_4_90.rename(columns = {'ever_bad':'ever_bad_4_90'})

bad_data_5_30_v2 = bad_data_5_30.rename(columns = {'ever_bad':'ever_bad_5_30'})
bad_data_5_60_v2 = bad_data_5_60.rename(columns = {'ever_bad':'ever_bad_5_60'})
bad_data_5_90_v2 = bad_data_5_90.rename(columns = {'ever_bad':'ever_bad_5_90'})

bad_data_6_30_v2 = bad_data_6_30.rename(columns = {'ever_bad':'ever_bad_6_30'})
bad_data_6_60_v2 = bad_data_6_60.rename(columns = {'ever_bad':'ever_bad_6_60'})
bad_data_6_90_v2 = bad_data_6_90.rename(columns = {'ever_bad':'ever_bad_6_90'})

bad_data_7_30_v2 = bad_data_7_30.rename(columns = {'ever_bad':'ever_bad_7_30'})
bad_data_7_60_v2 = bad_data_7_60.rename(columns = {'ever_bad':'ever_bad_7_60'})
bad_data_7_90_v2 = bad_data_7_90.rename(columns = {'ever_bad':'ever_bad_7_90'})

bad_data_8_30_v2 = bad_data_8_30.rename(columns = {'ever_bad':'ever_bad_8_30'})
bad_data_8_60_v2 = bad_data_8_60.rename(columns = {'ever_bad':'ever_bad_8_60'})
bad_data_8_90_v2 = bad_data_8_90.rename(columns = {'ever_bad':'ever_bad_8_90'})

bad_data_9_30_v2 = bad_data_9_30.rename(columns = {'ever_bad':'ever_bad_9_30'})
bad_data_9_60_v2 = bad_data_9_60.rename(columns = {'ever_bad':'ever_bad_9_60'})
bad_data_9_90_v2 = bad_data_9_90.rename(columns = {'ever_bad':'ever_bad_9_90'})

In [65]:
bad_data_9_90['Snapshot_dt'].value_counts()

2020-04-30    5179
2020-01-31    5002
2019-12-31    4979
2019-10-31    4906
2019-09-30    4800
2019-07-31    4526
Name: Snapshot_dt, dtype: int64

In [66]:
mer_list = [ bad_data_3_30_v2,bad_data_3_60_v2,bad_data_3_90_v2,
            bad_data_4_30_v2,bad_data_4_60_v2,bad_data_4_90_v2,
            bad_data_5_30_v2,bad_data_5_60_v2,bad_data_5_90_v2,
           bad_data_6_30_v2,bad_data_6_60_v2,bad_data_6_90_v2,
           bad_data_7_30_v2,bad_data_7_60_v2,bad_data_7_90_v2,
           bad_data_8_30_v2,bad_data_8_60_v2,bad_data_8_90_v2,
           bad_data_9_30_v2,bad_data_9_60_v2,bad_data_9_90_v2] 

In [67]:
df_merged_2 = reduce(lambda  left,right: pd.merge(left,right,on=['SZ_LOAN_ACCOUNT_NO','Snapshot_dt'],
                                            how='outer'), mer_list)

In [70]:
df_merged_2["Snapshot_dt"].value_counts()

2020-04-30    5179
2020-01-31    5002
2019-12-31    4979
2019-10-31    4906
2019-09-30    4800
2019-07-31    4526
Name: Snapshot_dt, dtype: int64

### addition on delinquency bucket and MOB to the data 

In [71]:
df_mom_4_snaps['DT_BUSINESSDATE'].value_counts()

2020-04-30    5201
2020-01-31    5082
2019-12-31    5053
2019-10-31    4994
2019-09-30    4894
2019-07-31    4609
Name: DT_BUSINESSDATE, dtype: int64

In [72]:
df_tempx = df_mom_4_snaps.copy()

##### MOB creation

In [75]:
df_tempx["MOB"] =  round(((df_tempx["DT_BUSINESSDATE"] - df_tempx["FIRST_DISB_DATE"])/np.timedelta64(1, 'M')))

###### Delq bucket creation


In [77]:

df_tempx.loc[df_tempx['I_DPD'] == 0, 'DEL_BUCKET'] = '1.Current'
df_tempx.loc[(df_tempx['I_DPD'] >=1) & (df_tempx['I_DPD'] <=29) , 'DEL_BUCKET'] = '2.[1-29]'
df_tempx.loc[(df_tempx['I_DPD'] >=30) & (df_tempx['I_DPD'] <=59) , 'DEL_BUCKET'] = '3.[30-59]'
df_tempx.loc[(df_tempx['I_DPD'] >=60) & (df_tempx['I_DPD'] <=89) , 'DEL_BUCKET'] = '4.[60-89]'
df_tempx.loc[(df_tempx['I_DPD'] >=90)  , 'DEL_BUCKET'] = '5.[90+)'

In [78]:
df_tempx['DEL_BUCKET'].value_counts()

1.Current    26071
4.[60-89]     1241
5.[90+)       1037
2.[1-29]       883
3.[30-59]      601
Name: DEL_BUCKET, dtype: int64

In [80]:
df_mer_3 = pd.merge(df_merged_2,df_tempx.loc[:,['SZ_LOAN_ACCOUNT_NO','DT_BUSINESSDATE','DEL_BUCKET','MOB']], 
                    how = 'left', 
                    left_on = ['SZ_LOAN_ACCOUNT_NO','Snapshot_dt'],
                    right_on = ['SZ_LOAN_ACCOUNT_NO','DT_BUSINESSDATE'])

In [81]:
df_mer_3['DEL_BUCKET'].value_counts()

1.Current    25663
4.[60-89]     1232
5.[90+)       1026
2.[1-29]       873
3.[30-59]      598
Name: DEL_BUCKET, dtype: int64

In [82]:
df_mer_3['Snapshot_dt'].value_counts()

2020-04-30    5179
2020-01-31    5002
2019-12-31    4979
2019-10-31    4906
2019-09-30    4800
2019-07-31    4526
Name: Snapshot_dt, dtype: int64

In [84]:
#df_mer_3.to_csv('D:/Varthana/All_perf_4ss_v4_cibil.csv')

### 30p in 3mths 

In [2]:
import pandas as pd

In [85]:
df = pd.read_csv('D:/Varthana/All_perf_4ss_v4_cibil.csv')

In [87]:
df_30_3 = df[(df['DEL_BUCKET']=='1.Current') &
            (df['MOB'] >=6)].loc[:,["SZ_LOAN_ACCOUNT_NO","Snapshot_dt","ever_bad_9_30","ever_bad_9_60"]]

In [89]:
df_30_3['Appl_datekey'] = df_30_3["SZ_LOAN_ACCOUNT_NO"].str.cat(df_30_3["Snapshot_dt"].astype(str), sep="_")

In [96]:
df_30_3[df_30_3['Snapshot_dt']=='2019-07-31'].shape

(3056, 5)

In [91]:
df_30_3.to_csv("D:/Varthana/All_ss_30p_60p_9mths.csv")