In [None]:
def get_sql_conn(user_id, password, host='10.95.60.125', port=3306, database='ypre'):
    """Create a SQLAlchemy engine for the MySQL server using the PyMySQL driver.

    Both credentials are percent-encoded before they are placed in the URL, so
    characters such as ``@``, ``:``, ``/`` or ``%`` in either of them cannot
    corrupt the connection string.

    Args:
        user_id: database user name.
        password: database password.
        host: server address (defaults to the previously hard-coded host).
        port: server port (defaults to the previously hard-coded port).
        database: schema to connect to (defaults to the previously hard-coded one).

    Returns:
        The engine object returned by ``sqlalchemy.create_engine``.
    """
    from sqlalchemy import create_engine
    from urllib.parse import quote

    # safe='' also escapes '/', which quote() leaves untouched by default.
    credentials = '{0}:{1}'.format(quote(str(user_id), safe=''), quote(str(password), safe=''))
    return create_engine('mysql+pymysql://{0}@{1}:{2}/{3}'.format(credentials, host, port, database))
def header(name):
    """Show *name* as a centred level-4 heading in the notebook output."""
    markup = f"<h4><center>{name}</center></h4>"
    display(Markdown(markup))

def preprocess_cohort_data(df_):
    """Add month keys and the ppmt flag to a copy of the cohort data.

    Adds ``due_month``, ``disbursed_month`` and ``source_month`` (``YYYYMM``
    strings derived from ``emi_due_date``, ``disbursed_date`` and
    ``createdOn``) plus ``ppmt`` (1 when ``delay <= 0``, else 0), displays a
    null-count summary of those columns and returns the new frame.
    The input frame is not modified.
    """
    def _year_month(stamp):
        # Missing timestamps stay missing instead of failing in strftime.
        if pd.isna(stamp):
            return np.nan
        return stamp.strftime("%Y%m")

    print('Data preprocessing starts')
    df = df_.copy()

    month_sources = (
        ('due_month', 'emi_due_date'),
        ('disbursed_month', 'disbursed_date'),
        ('source_month', 'createdOn'),
    )
    for month_col, date_col in month_sources:
        df[month_col] = pd.to_datetime(df[date_col]).apply(_year_month)

    df['ppmt'] = np.where(df['delay'] <= 0, 1, 0)

    header('Null Summary')
    checked_cols = ['due_month', 'disbursed_month', 'source_month', 'ppmt', 'delay']
    null_smry = df[checked_cols].isnull().sum().reset_index()
    null_smry.columns = ['Column Names', '# of Null Rows']
    display(null_smry)

    print('End of data preprocessing')
    return df

    

In [None]:
def cohort_fetch(start_date,end_date,sql_userid='',sql_password=''):
    """Fetch first-loan / first-installment EMI rows due in [start_date, end_date) and preprocess them.

    Args:
        start_date: inclusive lower bound for ``emi_due_date``.
        end_date: exclusive upper bound for ``emi_due_date``.
        sql_userid: database user passed to ``get_sql_conn``.
        sql_password: database password passed to ``get_sql_conn``.

    Returns:
        The frame returned by ``preprocess_cohort_data``, or ``400`` when no
        connection object could be obtained.
    """
    from sqlalchemy import text

    server=get_sql_conn(sql_userid, sql_password)
    if server is None:
        return 400

    # Dates are sent as bound parameters instead of being pasted into the SQL text.
    query=text(''' select
    userid, loan_id, user_band,product_name,principal,installment_due,loan_number,installment_number,
    total_paid,penalty_paid,unpaid_emi,state,delay,createdOn, disbursed_date, emi_due_date,emi_paid_date
    from ypdynamic.yp_emi_data_tbl
    where emi_due_date >= :start_date and emi_due_date < :end_date
      and loan_number=1 and installment_number=1
    ''')
    try:
        data=pd.read_sql(query, server, params={'start_date': start_date, 'end_date': end_date})
    finally:
        # Release pooled connections even when the query fails.
        server.dispose()
    print(f'Data Fetch completed. Shape of data fetch: {data.shape}')
    data=preprocess_cohort_data(data)
    print(f'Final shape of data after preprocessing: {data.shape}')
    return data

In [None]:
def run_sync_loan_application_v1(df_, conn):
    ''' Fetch the highest syncId per user from yp.yp_user_sync_data using the loan_application logic.

    Runs a single query for the whole frame passed in (one batch).

    Input:
    df_  : dataframe with ``userid`` and ``loan_id`` columns
    conn : connection / engine accepted by ``pd.read_sql``

    Output:
    DataFrame with userid, syncId, appType and a constant ``fetch_logic`` column.
    Returns 400 when ``conn`` is None, and an empty frame when there are no ids.
    '''
    if conn is None:
        print('Check connection string or length of userids/oanids passed.')
        return 400
    import datetime as dt

    def _in_list(values):
        # str(tuple(...)) renders a single value as "(x,)" and no value as "()",
        # neither of which is valid SQL; join the reprs explicitly instead.
        return '(' + ', '.join(repr(v) for v in values) + ')'

    user_lst=df_.userid.values.tolist()
    loanId_lst=df_.loan_id.values.tolist()
    if not user_lst or not loanId_lst:
        # Nothing to look up: skip the query, an empty IN list is a syntax error.
        return pd.DataFrame(columns=['userid','syncId','appType','fetch_logic'])

    # NOTE(review): appType is selected without being grouped or aggregated;
    # which value is returned per cid depends on the database - confirm intent.
    # NOTE(review): ids are formatted into the SQL text; they are assumed to
    # come from trusted internal data.
    sync_batch_query='''select cid as userid, max(syncId) as syncId,appType
                from yp.yp_user_sync_data
                where cid in {0} and source = 'loan_application' and sourceId in {1} 
                group by cid;'''  ## Query to fetch syncId based on loan_application logic
    print(f'Starting Fetch Logic: Loan Application')
    t1=dt.datetime.now()
    temp_query=sync_batch_query.format(_in_list(user_lst),_in_list(loanId_lst))
    temp_data= pd.read_sql(temp_query, conn)
    temp_data['fetch_logic']='loan_application'
    print('Fetch Done')
    elapsed=dt.datetime.now()-t1
    print("df shape: " + str(temp_data.shape) + "Took H:M:S - : %02d:%02d:%02d" % (elapsed.seconds // 3600, elapsed.seconds // 60 % 60, elapsed.seconds % 60))
    return temp_data

def run_sync_7day_logic_v1(df_,conn):
    ''' Fetch the highest syncId per user from yp.yp_user_sync_data using the
    Approximation_7d_lag_from_disb_date logic: sync rows updated in the 7 days
    up to the disbursal date, excluding source 'session'.

    Runs a single query for the whole frame passed in (one batch).

    Input:
    df_  : dataframe with ``userid`` and ``disbursed_date`` columns
    conn : connection / engine accepted by ``pd.read_sql``

    Output:
    DataFrame with userid, syncId, appType and a constant ``fetch_logic`` column.
    Returns 400 when ``conn`` is None, and an empty frame when ``df_`` has no rows.
    '''
    if conn is None:
        return 400
    import datetime as dt
    df=df_.copy()
    df.disbursed_date=df.disbursed_date.astype('str')    # dates are embedded as string literals in the inline values table
    user_lst=df.userid.values.tolist()
    if not user_lst:
        # An empty "values" list / IN list is a SQL syntax error, so skip the query.
        return pd.DataFrame(columns=['userid','syncId','appType','fetch_logic'])

    # NOTE(review): appType is selected without being grouped or aggregated - confirm intent.
    sync_7day_query='''with base_data as (select column_0 as cid, column_1 as disbursed_date from ( values {0} ) as temp) 

            select t1.cid as userid, max(t1.syncId) as syncId,appType
            from yp.yp_user_sync_data t1
            join base_data t2 on t1.cid in {1} and t1.cid = t2.cid and 
            t1.updatedAt>= DATE_ADD(t2.disbursed_date, INTERVAL -7 DAY) and t1.updatedAt<=t2.disbursed_date
            and source!='session'
            group by 1;'''                     # Query to fetch syncId based on 7 day logic 

    print(f'Starting Fetch Logic: Approximation_7d_lag_from_disb_date')
    t1=dt.datetime.now()
    # One "row(userid, 'date')" entry per input row; tolist() yields plain Python
    # scalars so their repr is valid SQL.
    pairs=zip(user_lst, df.disbursed_date.tolist())
    temp_table_data=','.join('row'+str(pair) for pair in pairs)
    # Joined explicitly because str(tuple) of a single id would render as "(x,)".
    user_clause='(' + ', '.join(repr(u) for u in user_lst) + ')'
    temp_query=sync_7day_query.format(temp_table_data,user_clause)
    temp_data= pd.read_sql(temp_query, conn)
    temp_data['fetch_logic']='Approximation_7d_lag_from_disb_date'
    print('Fetch Done')
    elapsed=dt.datetime.now()-t1
    print("df shape: " + str(temp_data.shape) + "Took H:M:S - : %02d:%02d:%02d" % (elapsed.seconds // 3600, elapsed.seconds // 60 % 60, elapsed.seconds % 60))
    return temp_data
             

def fetch_sync_v1(df_,batch_size=10000,sql_userid='',sql_password=''):
    ''' Attach syncIds from yp.yp_user_sync_data to the given rows, batch by batch.

    For each batch the loan_application logic is tried first; users without a
    match are then looked up with the 7-days-before-disbursal logic.

    Input:
    df_          : dataframe containing at least userid, loan_id & disbursed_date columns
    batch_size   : number of rows per batch, passed to get_shape_lst
    sql_userid   : SQL user id passed to get_sql_conn
    sql_password : SQL password passed to get_sql_conn

    Output:
    DataFrame with the required columns plus the sync columns for every batch
    that completed; 400 when a required column or the connection is missing;
    None when no rows were produced. An error inside a batch is printed and
    stops the loop, and the batches finished so far are still returned.
    '''
    req_cols=['userid','loan_id','disbursed_date']
    header('-'*5+'Fetching SyncIds for Data'+'-'*5)
    df=df_.copy()
    if not all(col in df.columns for col in req_cols):
        header(' Column Error ')
        print(f'Please make sure all required columns : {req_cols} are present in data & with same name.')
        return 400
    server=get_sql_conn(sql_userid,sql_password)    # Getting SQL create_engine statement
    if server is None:
        return 400

    batches=[]
    try:
        # get_shape_lst is defined elsewhere; its result is used as consecutive
        # slice boundaries, exactly as before.
        shape_lst=get_shape_lst(len(df),batch_size)

        for start,end in zip(shape_lst[:-1],shape_lst[1:]):
            header_left(f'Start : End - {start}:{end}')
            # Positional slice: selecting by userid membership would repeat rows
            # in several batches whenever a userid occurs more than once.
            sub_df=df.iloc[start:end][req_cols]

            sync_loanApp=run_sync_loan_application_v1(sub_df, conn=server)    # Fetching sync based on loan_application _logic

            # rows whose user got no syncId from the loan_application logic
            null_sync_df=sub_df[~sub_df.userid.isin(sync_loanApp.userid.tolist())]
            sync_frames=[sync_loanApp]
            if not null_sync_df.empty:
                # Only query when something is missing; an empty batch would build invalid SQL.
                sync_frames.append(run_sync_7day_logic_v1(null_sync_df[['userid','disbursed_date']],conn=server))

            # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x
            sub_df=sub_df.merge(pd.concat(sync_frames),on='userid',how='left')
            batches.append(sub_df)
            found=sub_df[sub_df.syncId.notnull()].shape[0]
            print(f'\n{found}/{sub_df.shape[0]} , {round(100*found/sub_df.shape[0],2)}% syncIds found.\n')

        print('End of Batch Execution.')
    except Exception as e:
        header('ERROR Occured')
        print(f'Error : {e}')
    finally:
        # Release pooled connections whether or not the batches succeeded.
        server.dispose()

    final_data=pd.concat(batches) if batches else pd.DataFrame()
    if final_data.shape[0]>0:
        total_found=final_data[final_data.syncId.notnull()].shape[0]
        print(f'\nTotal {total_found}/{final_data.shape[0]},{round(100*total_found/final_data.shape[0],2)} syncIds found.')
        header('-'*5+'Fetch Sync Complete.'+'-'*5)
        return final_data
    return None

In [None]:
def run_batch(start_idx, end_idx,data1):
    """Fetch the highest bureau_customer_id per user for rows ``start_idx:end_idx`` of ``data1``.

    Only bureau records whose ``date_processed`` is on or before the user's
    ``disbursed_date`` are considered.

    Args:
        start_idx: first row position of the batch (inclusive).
        end_idx: last row position of the batch (exclusive).
        data1: dataframe with ``userid`` and ``disbursed_date`` columns.

    Returns:
        DataFrame with ``cid`` and ``bureau_customer_id``; empty when the slice
        contains no rows.

    Note:
        Reads the connection from the module-level name ``server``, which must
        exist before this function is called.
    """
    sync_7day_query='''with base_data as (select column_0 as cid, column_1 as disbursed_date from ( values {0} ) as temp)

    select t1.cid,max(t1.bureau_customer_id) bureau_customer_id
    from ypre.bureau_customer as t1
    join base_data as t2 on t1.cid=t2.cid
    join ypre.bureau_cibil_header_segment as t3 on t1.bureau_customer_id = t3.bureau_customer_id 
    and t3.date_processed<=t2.disbursed_date
    group by 1;'''

    batch = data1[start_idx:end_idx]
    # Dates are rendered as strings (as run_sync_7day_logic_v1 does) so that
    # date/Timestamp objects do not end up as Python reprs inside the SQL.
    rows = list(zip(batch['userid'].tolist(), batch['disbursed_date'].astype(str).tolist()))
    if not rows:
        # An empty "values" list is a SQL syntax error; nothing to fetch anyway.
        return pd.DataFrame(columns=['cid', 'bureau_customer_id'])

    temp_table_data = ','.join('row' + str(row) for row in rows)  # inline values table, one row(...) per input row
    temp_query = sync_7day_query.format(temp_table_data)
    cust_id_data=pd.read_sql(temp_query,server)
    return cust_id_data

def fetch_bureau_custid(data1,batch_size=10000):
    '''
    Fetch bureau customer ids for every row of data1 by calling run_batch on
    consecutive slices of batch_size rows.

    data1      : dataframe handed to run_batch slice by slice
    batch_size : number of rows per batch

    Returns the concatenation of all successful batches (an empty DataFrame when
    there are none). A failing batch is reported and skipped.
    '''
    import datetime as dt
    print('data fetch start')
    started=dt.datetime.now()
    step=int(batch_size)
    frames=[]

    # range(0, n, step) never yields the trailing empty batch that the old
    # int(n / batch_size) + 1 loop produced when n was a multiple of batch_size.
    for batch_no,start in enumerate(range(0, data1.shape[0], step)):
        batch_started=dt.datetime.now()
        try:
            frames.append(run_batch(start,start+step,data1))
            print(batch_no,dt.datetime.now()-batch_started)
        except Exception as e:
            # Best effort: report the failed batch and continue with the next one.
            header('ERROR Occured')
            print(f'Error : {e}')

    df = pd.concat(frames,axis=0) if frames else pd.DataFrame()
    print('Fetch is done')
    # Measured from the start of the whole fetch, not from the last batch.
    elapsed=dt.datetime.now()-started
    print("df shape: " + str(df.shape) + "Took H:M:S - : %02d:%02d:%02d" % (elapsed.seconds // 3600, elapsed.seconds // 60 % 60, elapsed.seconds % 60))
    return df

In [None]:
def run_batch_score(start_idx, end_idx,data1):
    """Fetch CIBIL score rows for the bureau customer ids in rows ``start_idx:end_idx`` of ``data1``.

    Args:
        start_idx: first row position of the batch (inclusive).
        end_idx: last row position of the batch (exclusive).
        data1: dataframe with a ``bureau_customer_id`` column.

    Returns:
        DataFrame with bureau_customer_id, score_date, score, score_v3 and
        createdOn; empty when the slice holds no usable ids.

    Note:
        Reads the connection from the module-level name ``server``, which must
        exist before this function is called.
    """
    score='''select bureau_customer_id, score_date, score, score_v3, createdOn from ypre.bureau_cibil_score_segment 
where bureau_customer_id in {0};'''

    # Missing ids would be rendered as "nan" and break the statement.
    ids = data1[start_idx:end_idx]['bureau_customer_id'].dropna().tolist()
    if not ids:
        # "in ()" is a SQL syntax error; nothing to fetch anyway.
        return pd.DataFrame(columns=['bureau_customer_id', 'score_date', 'score', 'score_v3', 'createdOn'])

    # Joined explicitly because str(tuple) of a single id renders as "(x,)".
    id_clause = '(' + ', '.join(repr(i) for i in ids) + ')'
    scores=pd.read_sql(score.format(id_clause),server)
    return scores

def fetch_cibil_score(data1,batch_size=100000):
    '''
    Fetch CIBIL score rows for every row of data1 by calling run_batch_score on
    consecutive slices of batch_size rows.

    data1      : dataframe handed to run_batch_score slice by slice
    batch_size : number of rows per batch

    Returns the concatenation of all successful batches (an empty DataFrame when
    there are none). A failing batch is reported and skipped.
    '''
    import datetime as dt
    print('data fetch start')
    started=dt.datetime.now()
    step=int(batch_size)
    frames=[]

    # range(0, n, step) never yields the trailing empty batch that the old
    # int(n / batch_size) + 1 loop produced when n was a multiple of batch_size.
    for batch_no,start in enumerate(range(0, data1.shape[0], step)):
        batch_started=dt.datetime.now()
        try:
            frames.append(run_batch_score(start,start+step,data1))
            print(batch_no,dt.datetime.now()-batch_started)
        except Exception as e:
            # Best effort: report the failed batch and continue with the next one.
            header('ERROR Occured')
            print(f'Error : {e}')

    df = pd.concat(frames,axis=0) if frames else pd.DataFrame()
    print('Fetch is done')
    # Measured from the start of the whole fetch, not from the last batch.
    elapsed=dt.datetime.now()-started
    print("df shape: " + str(df.shape) + "Took H:M:S - : %02d:%02d:%02d" % (elapsed.seconds // 3600, elapsed.seconds // 60 % 60, elapsed.seconds % 60))
    return df

In [None]:
def Distribution_ppmt_event(data,distribution_by='due_month'):
    """Summarise customers, ppmt and event per group of ``distribution_by``.

    The result is indexed by the grouping key and holds: row count of ``cid``,
    sum and rate of ``ppmt``, sum and rate of ``event``, and the mean
    ``principal`` (``avg_ticketsize``).
    """
    grouped = data.groupby(distribution_by)

    due = pd.DataFrame()
    due['cx_count'] = grouped['cid'].count()
    due['ppmt'] = grouped['ppmt'].sum()
    due['ppmt_rate'] = due['ppmt'].div(due['cx_count'])
    due['event'] = grouped['event'].sum()
    due['event_rate'] = due['event'].div(due['cx_count'])
    due['avg_ticketsize'] = grouped['principal'].mean()
    return due



In [None]:
def get_collectionRates(ds,delay_column,dpd_list=None):

    '''
    Build a dataframe with the collection rate and cumulative collection rate at each delay value.

    For every dpd value:
        Collection_rate           = 100 * rows with delay == dpd / rows with delay >= dpd
        Cumulative_collectionRate = 100 * rows with delay <= dpd / all rows
        delta_gain                = change in the cumulative rate versus the previous row
    At dpd 0 the collection rate is reported as the cumulative rate.

    param :
        ds : data frame with delay/dpd column
        delay_column : str , feature name for the dpd column
        dpd_list : list of dpd values at which collection rates are needed;
                   defaults to 0..15. The values 21,30,60,90,120,180,240,360
                   are always appended. The caller's list is not modified.
    output :
        dataframe with columns dpd, Collection_rate, Cumulative_collectionRate,
        delta_gain, one row per dpd value (default integer index). The output
        stops at the first dpd value for which no row has that delay or more.
    demo :
        get_collectionRates(ds = base_pop,delay_column='delay',dpd_list = list(range(0,14)))
    '''

    # Work on a fresh list: the old mutable default kept growing on every call.
    base_dpds = list(range(0,16)) if dpd_list is None else list(dpd_list)
    dpd_values = base_dpds + [21,30,60,90,120,180,240,360]

    delays = ds[delay_column]
    total_rows = int(ds.shape[0])
    rows = []

    for dpd in dpd_values:
        # int() keeps these as plain Python ints for the divisions below.
        paid_on = int((delays == dpd).sum())
        paid_on_or_after = int((delays >= dpd).sum())
        paid_up_to = int((delays <= dpd).sum())

        if paid_on_or_after == 0 or total_rows == 0:
            print('Cx with delay values ' + str(dpd) + ' or more are not present, hence output is limited till available delay values')
            break

        cumt_rate = round((paid_up_to/total_rows)*100,2)
        collection_rate = cumt_rate if dpd == 0 else round(100*paid_on/paid_on_or_after,2)
        rows.append({'dpd':dpd,'Collection_rate':collection_rate,'Cumulative_collectionRate':cumt_rate})

    if not rows:
        return pd.DataFrame(columns=['dpd','Collection_rate','Cumulative_collectionRate','delta_gain'])

    master_data = pd.DataFrame(rows, columns=['dpd','Collection_rate','Cumulative_collectionRate'])
    master_data['delta_gain'] = master_data['Cumulative_collectionRate'] - master_data['Cumulative_collectionRate'].shift(1)
    return master_data

In [None]:
def run_batch_crif(start_idx, end_idx,data1):
    """Fetch CRIF / CIBIL scores from yp.yp_re_data for the ``cid`` values in rows ``start_idx:end_idx`` of ``data1``.

    Args:
        start_idx: first row position of the batch (inclusive).
        end_idx: last row position of the batch (exclusive).
        data1: dataframe with a ``cid`` column.

    Returns:
        DataFrame with cid, crif_bureau_customer_id, cibil_bureau_customer_id,
        crif_score, cibil_score, cibil_score_v3 and cibil_score_used; empty when
        the slice holds no usable ids.

    Note:
        Reads the connection from the module-level name ``server``, which must
        exist before this function is called.
    """
    # NOTE(review): the last join matches a.uid against
    # c.bureau_cibil_request_id - confirm this is the intended key.
    score='''select 
a.uid as cid
, b.user_id as crif_bureau_customer_id
, c.bureau_customer_id as cibil_bureau_customer_id
, a.crif_score 
, a.cibil_score
, a.cibil_score_v3
, a.cibil_score_used
from yp.yp_re_data as a
join ypre.crif_user_initial_request as b  ON a.crif_table_id = b.user_id 
join ypre.bureau_cibil_request as c  ON a.uid = c.bureau_cibil_request_id
where uid in {0};'''

    # Missing ids would be rendered as "nan" and break the statement.
    ids = data1[start_idx:end_idx]['cid'].dropna().tolist()
    if not ids:
        # "in ()" is a SQL syntax error; nothing to fetch anyway.
        return pd.DataFrame(columns=['cid', 'crif_bureau_customer_id', 'cibil_bureau_customer_id',
                                     'crif_score', 'cibil_score', 'cibil_score_v3', 'cibil_score_used'])

    # Joined explicitly because str(tuple) of a single id renders as "(x,)".
    id_clause = '(' + ', '.join(repr(i) for i in ids) + ')'
    scores=pd.read_sql(score.format(id_clause),server)
    return scores

def fetch_crif_score(data1,batch_size=10000):
    '''
    Fetch CRIF / CIBIL scores for every row of data1 by calling run_batch_crif
    on consecutive slices of batch_size rows.

    data1      : dataframe handed to run_batch_crif slice by slice
    batch_size : number of rows per batch

    Returns the concatenation of all successful batches (an empty DataFrame when
    there are none). A failing batch is reported and skipped.
    '''
    import datetime as dt
    print('data fetch start')
    started=dt.datetime.now()
    step=int(batch_size)
    frames=[]

    # range(0, n, step) never yields the trailing empty batch that the old
    # int(n / batch_size) + 1 loop produced when n was a multiple of batch_size.
    for batch_no,start in enumerate(range(0, data1.shape[0], step)):
        batch_started=dt.datetime.now()
        try:
            frames.append(run_batch_crif(start,start+step,data1))
            print(batch_no,dt.datetime.now()-batch_started)
        except Exception as e:
            # Best effort: report the failed batch and continue with the next one.
            header('ERROR Occured')
            print(f'Error : {e}')

    df = pd.concat(frames,axis=0) if frames else pd.DataFrame()
    print('Fetch is done')
    # Measured from the start of the whole fetch, not from the last batch.
    elapsed=dt.datetime.now()-started
    print("df shape: " + str(df.shape) + "Took H:M:S - : %02d:%02d:%02d" % (elapsed.seconds // 3600, elapsed.seconds // 60 % 60, elapsed.seconds % 60))
    return df

In [None]:
def get_woe_charts(woe_values,cols_list):
    '''
    Draw one chart per feature: population share per bin as bars, with the
    event rate as a line on a secondary axis.

    woe_values : dataframe of woe values (columns VAR_NAME, Cuts, DIST_POP, EVENT_RATE are used)
    cols_list : list of features for which woe charts are needed
    '''
    import matplotlib.pyplot as plt

    for feature in cols_list:
        feature_bins = woe_values[woe_values.VAR_NAME == feature].fillna('NAN')
        fig, axis = plt.subplots(figsize=(10, 5))

        feature_bins.plot(use_index=True, kind='bar', x='Cuts', y='DIST_POP', ax=axis, color='b')
        feature_bins.plot(use_index=True, x='Cuts', y='EVENT_RATE', ax=axis, secondary_y=True, color='r')

        # Label every bar with its height.
        for bar in axis.patches:
            bar_height = bar.get_height()
            axis.text(
                x=bar.get_x() + (bar.get_width()),
                y=bar_height,
                s=bar_height.round(decimals=2),
                ha="left",
                color="black",
            )

        # Label every point of the event-rate line.
        for position, rate in enumerate(feature_bins["EVENT_RATE"]):
            plt.text(position, rate, '{:.03}'.format(rate))

        axis.set_ylabel('Population Distribution', fontsize=12)
        axis.set_xlabel('Bin Range', fontsize=12)
        plt.title(feature, fontsize=15)
        plt.show()

In [7]:
def Unzipe(file_path,file_types='csv'):
    """Return the names of all members of the zip archive at ``file_path``.

    Args:
        file_path: path of the zip archive to inspect.
        file_types: currently unused; kept so existing calls keep working.

    Returns:
        List of member names in archive order.
    """
    import zipfile

    # The archive is opened from the file_path argument (the old code read an
    # undefined name) and is closed again once the names have been read.
    with zipfile.ZipFile(file_path) as zf:
        return list(zf.namelist())

#temp_df=pd.read_csv(zf.open('sql_creds - Copy.csv'))

In [None]:
def Cohort_and_score_fetch(data,dpd,df):
    """Join cohort payment data with scores and build the modelling target.

    Args:
        data: cohort frame with ``userid`` and ``delay`` columns; ``-9999`` is
            treated as missing everywhere in this frame.
        dpd: delay threshold; rows with ``delay >= dpd`` are events.
        df: score frame with ``cid`` and ``sc_status`` columns; the last row
            per ``cid`` is kept.

    Returns:
        (df_score_payment, count): the merged frame restricted to events
        (target 1) and non-events (target 0, ``delay <= 0``), and a dict of row
        counts taken after each filtering step.
    """
    count = {}
    df = df.drop_duplicates('cid', keep='last')

    # Customers whose score has sc_status == 0 are removed from the cohort.
    rejected_cids = df[df.sc_status == 0]['cid'].to_list()
    data = data[~data.userid.isin(rejected_cids)]
    count['Total_cx'] = data.shape[0]

    data = data.replace(-9999, np.nan)
    # notna() also works for object-dtype columns, where np.isnan raises;
    # copy() so the PPMT column below is not written to a view of the input.
    data = data[data.delay.notna()].copy()
    count['Cx_payment_available'] = data.shape[0]

    df = df[df.sc_status != 0]
    count['CX_scores_available'] = df.shape[0]

    data['PPMT'] = np.where(data.delay <= 0, 1, 0)      # paid on or before the due date
    count['PPMT_count'] = data[data.PPMT == 1].shape[0]

    data = data.rename(columns={'userid': 'cid'})
    df_score_payment = pd.merge(data, df, on='cid', how='inner')
    count['CX_scores_&_payment_available'] = df_score_payment.shape[0]

    # 1: event, 0: non-event, 2: grey (late, but by less than dpd)
    df_score_payment['target'] = np.where(
        df_score_payment['delay'] >= dpd, 1,
        np.where(df_score_payment['delay'] <= 0, 0, 2))
    df_score_payment = df_score_payment[df_score_payment.target != 2]
    count['Cx_#_score&payment_avail_excl_grey'] = df_score_payment.shape[0]
    return df_score_payment,count
   
    




In [None]:
def scoreband_mapper(df,score_range):
    """Add a ``scoreband`` column to a copy of ``df``.

    df : data with a ``score`` column
    score_range : list of range labels starting from the worst scores; the
        number after ", " in each label (minus its last character) is used as
        that band's upper bound.

    Bands are numbered from len(score_range) down to 1. A score gets the band
    of the first upper bound it is strictly below; scores below none of the
    bounds get no band.
    """
    df = df.copy()
    upper_bounds = [float(label.split(", ")[1][0:-1]) for label in score_range]
    band_numbers = list(range(len(upper_bounds), 0, -1))
    print(df['score'].max(),df['score'].min(),df['score'].quantile(0.25))

    def _band_for(value):
        for bound, band in zip(upper_bounds, band_numbers):
            if value < bound:
                return band
        return None

    df['scoreband'] = df['score'].apply(_band_for)
    return df

def psi_value(expected,actual):
    """Return the population stability index, rounded to 4 decimals.

    expected : counts of cx per band during development
    actual : counts of cx per band in the current data
    """
    counts = pd.DataFrame({'expected': expected, 'actual': actual})
    expected_share = counts['expected'] / counts['expected'].sum()
    actual_share = counts['actual'] / counts['actual'].sum()
    # per-band contribution: (expected% - actual%) * ln(expected% / actual%)
    contribution = (expected_share - actual_share) * np.log(expected_share / actual_share)
    return np.round(contribution.sum(), 4)


In [None]:
def bandwise_metrics(df,count,start,score_range,expected,sample_type,model_name,month):
    """
    Return band-wise and overall performance tables for the given cohort data.

    df : scored cohort rows; the columns scoreband, cid, target and PPMT are used
    count : dict of funnel counts; the keys read here are 'Total_cx',
        'Cx_payment_available', 'CX_scores_available',
        'CX_scores_&_payment_available' and 'Cx_#_score&payment_avail_excl_grey'
    start : cohort start date, used only to build the cohort label
    score_range : one label per score band, in the order of the bands after
        sorting them in descending order
    expected : development-time counts per band, passed to psi_value
    sample_type, model_name : labels copied into the outputs
    month : when None the cohort label also carries the week of the month

    Returns (base, model_df): the per-band table and a one-row summary table.
    """
    if month==None:
        # Weekly cohort: label is e.g. "<Mon yy> W<n>", n = week of the month of start
        day_of_month = pd.to_datetime(start).day
        w = (day_of_month - 1) // 7 + 1
        current_month=pd.to_datetime(start).strftime('%b %y')+' W'+str(w)
    else:
        current_month=pd.to_datetime(start).strftime('%b %y')

    expected=[float(i) for i in expected]
    
    # One row per score band: customers, events (sum of target) and on-time payers (sum of PPMT)
    base=pd.DataFrame({'Total_cx':df.groupby('scoreband').agg('count')['cid'],
              'Event':df.groupby('scoreband')['target'].agg('sum'),
              'PPMT':df.groupby('scoreband')['PPMT'].agg('sum')}).reset_index()
    # Highest band number first; all cumulative columns below follow this order
    base=base.sort_values('scoreband',ascending=False)
    print(base.shape, len(score_range))
    print(base)
    # Assigned by position, so score_range needs exactly one entry per row of base
    base['score_bin_range']=score_range
    base['Cohort Time']=current_month
    base['Model_name']=[model_name for i in range(base.shape[0])]
    base['sample_type']=[sample_type for i in range(base.shape[0])]
    

    # ks_gini_calculation_bandwise
    # NOTE(review): the percentage columns are rounded to 2 decimals before they
    # feed KS% and Gini%, so those figures carry rounding error - confirm this is acceptable.
    base['Non_Event']=base['Total_cx']-base['Event']
    base['Cumulative_Non_Event']=base['Non_Event'].cumsum()
    base['Cumulative_Event']=base['Event'].cumsum()
    base['Population_%']=np.round((base['Total_cx']/base['Total_cx'].sum()),2)
    base['Cumulative_Non_Event_%']=np.round((base['Cumulative_Non_Event']/base['Non_Event'].sum()),2)
    base['Cumulative_Event_%']=np.round((base['Cumulative_Event']/base['Event'].sum()),2)
    base['KS%']=np.round(abs(base['Cumulative_Non_Event_%']-base['Cumulative_Event_%']),2)
    base['Event_rate_%']=np.round((base['Event']/base['Total_cx']),2)
    base['PPMT_rate_%']=np.round(base['PPMT']/base['Total_cx'],2)
    # Per band: mean of this and the previous cumulative event share, times the
    # increase in cumulative non-event share. The column sum is reported as Roc_Auc% below.
    base["Gini%"] =np.round((((base["Cumulative_Event_%"]+(base["Cumulative_Event_%"]).shift(1).fillna(0))/2) \
    *((base["Cumulative_Non_Event_%"])-(base["Cumulative_Non_Event_%"]).shift(1).fillna(0))),2)
    
    base=base[['Model_name','Cohort Time','sample_type','scoreband','Total_cx','Population_%','PPMT_rate_%','Event_rate_%','PPMT','Event','Non_Event','Cumulative_Non_Event','Cumulative_Event','Cumulative_Non_Event_%','Cumulative_Event_%','KS%',"Gini%","score_bin_range"]]

    #   overall ks_gini_calculation
    model_metric=dict()
    model_metric['Model_Name']=model_name
    model_metric['Cohort']=current_month
    model_metric['Sample_Type']=sample_type
    model_metric['Total_cx']=count['Total_cx']
    model_metric['Paybehav_avail']=count['Cx_payment_available']
    model_metric['Score_avail']=count['CX_scores_available']
    model_metric['both_paybehav_&_score_avail']=count['CX_scores_&_payment_available'] 
    model_metric['nongrey_paybhev_&_score_avail']=count['Cx_#_score&payment_avail_excl_grey']
    model_metric['drop_in_cx']= np.round((1-(count['CX_scores_&_payment_available']/count['Total_cx'])),2)
    # Overall Gini = 2 * (sum of the per-band Gini% column) - 1
    model_metric['Gini%']=np.round((2*((base['Gini%'].sum()))-1),2)
    # Overall KS = largest band-level KS%
    model_metric['KS%']=np.round(max(base['KS%']),2)
    model_metric['Roc_Auc%']=np.round((base['Gini%'].sum()),2)
    model_metric['PSI']=psi_value(expected,base['Total_cx'])
    model_metric['Event Rate%']=(base['Event'].sum()/base['Total_cx'].sum())
    model_metric['PPMT Rate%']=(base['PPMT'].sum()/base['Total_cx'].sum())
    model_metric['Event']=base.Event.sum() 
    # PSI_class: 0 when PSI < 0.1, 2 when PSI >= 0.2, otherwise 1
    model_metric['PSI_class']=np.where(model_metric['PSI']<0.1,0,np.where(model_metric['PSI']>=0.2,2,1))
    model_metric['Drop_in_payment_availability%']= np.round((1-(count['Cx_payment_available']/count['Total_cx'])),2)
    model_metric['Drop_in_score_availability%']=np.round((1-(count['CX_scores_available']/count['Total_cx'])),2)
    model_metric['Drop_excl_grey%']=np.round((1-(count['Cx_#_score&payment_avail_excl_grey']/count['CX_scores_&_payment_available'])),2)
    
     
    # Single-row frame holding the overall metrics
    model_df=pd.DataFrame(model_metric,index=range(1))

   
     
    return base,model_df


In [None]:
def woe_bins_pop(df,cols_list):
    """Per feature in ``cols_list``, give each bin's share of rows and share of events.

    Returns one row per (feature, bin) with columns ``Total`` (rows in the bin
    divided by all rows), ``Event_rate`` (events in the bin divided by all
    events), ``var`` (feature name) and ``woe`` (the bin value, copied from the
    index).
    """
    n_rows = df.shape[0]
    n_events = df['event'].sum()

    data = pd.DataFrame()
    for feature in cols_list:
        per_bin = df[[feature, 'event']].groupby(feature)
        summary = pd.DataFrame(per_bin.agg('count') / n_rows)
        summary['event_rate'] = per_bin.agg('sum') / n_events
        summary.columns = ['Total', 'Event_rate']
        summary['var'] = feature
        data = pd.concat([data, summary], axis=0)

    data['woe'] = data.index
    return data

In [2]:
def get_psi(data,expected='expected', actual='actual'):
    ''' Compute the population stability index from band-level counts.

        data     -> DataFrame containing band, dev_count & current_count
        expected -> dev_count column name
        actual   -> current_count column name

        Returns (psi_df, psi): a copy of data with '%expected', '%actual' and
        'psi' columns, and the summed index. Returns -1 when data is not a
        non-empty DataFrame. The two share columns are also added to the frame
        that was passed in.
    '''
    usable = isinstance(data, pd.DataFrame) and not data.empty
    if not usable:
        return -1

    # Written to the caller's frame on purpose: that is what the function has always done.
    data['%expected'] = data[expected] / data[expected].sum()
    data['%actual'] = data[actual] / data[actual].sum()

    psi_df = data.copy()
    share_gap = psi_df['%expected'] - psi_df['%actual']
    log_ratio = np.log(psi_df['%expected'] / psi_df['%actual'])
    psi_df['psi'] = share_gap * log_ratio
    return psi_df, psi_df['psi'].sum()
def visualize_psi(df,psi,pair="Train - Test"):
    """Plot expected vs actual population share per band as grouped bars.

    df : frame with 'band', '%expected' and '%actual' columns
    psi : value shown in the chart title
    pair : label for the two samples being compared, shown in the title
    """
    share_columns = ['%expected', '%actual']
    long_form = pd.melt(df, id_vars=['band'], value_vars=share_columns, var_name='', value_name='Pop%')
    # shares become percentages with one decimal
    long_form['Pop%'] = [round(share, 1) for share in long_form['Pop%'] * 100]

    plt.figure(1, figsize=(8, 5))
    sns.set_style("whitegrid", {"grid.color": ".7", "grid.linestyle": ":"})
    sns.barplot(data=long_form, y='Pop%', x='band', hue='', palette='Paired')

    plt.xticks(rotation=45)
    plt.legend(loc='upper left')
    plt.title(f"{pair} Population Stability Index PSI - {psi}")
    plt.xlabel('')
    plt.ylabel('Population%')
    plt.tight_layout()
    plt.show()

In [2]:
def woe_bins_pop(df,cols_list):
    """Per feature in ``cols_list``, give each bin's share of all rows.

    Returns one row per (feature, bin) with columns ``Total`` (rows in the bin
    divided by all rows), ``var`` (feature name) and ``woe`` (the bin value,
    copied from the index).
    """
    n_rows = df.shape[0]

    data = pd.DataFrame()
    for feature in cols_list:
        bin_counts = df[[feature, 'event']].groupby(feature).agg('count')
        shares = pd.DataFrame(bin_counts / n_rows)
        shares.columns = ['Total']
        shares['var'] = feature
        data = pd.concat([data, shares], axis=0)

    data['woe'] = data.index
    return data

In [3]:
def MOM_woe_bins_pop(df,woe,months=None):
    """Month-on-month population share (in %) of every WOE bin of every feature.

    Args:
        df: frame with a ``cid`` column plus one WOE-valued column per feature.
        woe: WOE table with ``VAR_NAME``, ``WOE`` and ``Cuts`` columns, used to
            attach the bin label (``Cuts``) to each WOE value.
        months: ordered list of frames, each with a ``cid`` column naming the
            customers of one month. When omitted, the notebook-level frames
            ``nov, dec, jan, feb, mar, apr, may, june`` are used, as before.

    Returns:
        DataFrame with columns ``var``, ``Cuts``, ``WOE`` and one ``month_<k>``
        column per entry of ``months``.
    """
    if months is None:
        # Backward-compatible default: the eight month frames kept as notebook globals.
        months = [nov,dec,jan,feb,mar,apr,may,june]
    month_cols = ['month_'+str(k) for k in range(len(months))]

    total_var_pop=pd.DataFrame()
    col_list=df.columns.to_list()
    col_list.remove('cid')
    for var in col_list:
        var_data=df[['cid',var]]
        var_pop=pd.DataFrame()

        for month_col,month_df in zip(month_cols,months):
            a=var_data[var_data.cid.isin(month_df.cid.to_list())]
            # share (in %) of this month's customers falling in each bin
            df1=pd.DataFrame(a[[var,'cid']].groupby(var).agg('count')/a.shape[0]*100)
            df1.columns=[month_col]
            var_pop=pd.concat([var_pop,df1],axis=1)
        # The index holds the WOE values; exposed once as a column for the merge below.
        var_pop['WOE']=var_pop.index
        bins=woe[woe.VAR_NAME==var].sort_values('WOE',ascending=True)[['WOE','Cuts']]
        var_pop=pd.merge(var_pop,bins,on='WOE',how='left')
        var_pop['var']=var
        total_var_pop=pd.concat([total_var_pop,var_pop],axis=0)
    return total_var_pop[['var','Cuts','WOE']+month_cols]

In [None]:
def correlation(d,poscutoff=0.65,negcutoff=-0.65):
    """List every ordered pair of columns whose correlation is beyond a cutoff.

    d : dataframe of numeric features
    poscutoff : pairs with correlation strictly above this value are reported
    negcutoff : pairs with correlation strictly below this value are reported

    Returns a dataframe with columns v1, v2 and corr. Each qualifying pair
    appears twice, once in each order.
    """
    flagged = []
    for left in d.columns:
        others = d.drop(labels=left, axis=1).columns
        for right in others:
            value = d[left].corr(d[right])
            if value > poscutoff or value < negcutoff:
                flagged.append((left, right, value))

    return pd.DataFrame({
        'v1': [pair[0] for pair in flagged],
        'v2': [pair[1] for pair in flagged],
        'corr': [pair[2] for pair in flagged],
    })

In [None]:
def handling_correlation_by_IV(df,iv):
    """Pick, for every correlated pair, the feature with the lower IV for removal.

    df : correlation dataframe with columns v1 and v2 (one row per pair)
    iv : IV details with columns VAR_NAME and IV; when a feature has several
         rows, the first one is used

    Returns the list of features to drop, in the order they were first chosen.
    When both IVs are equal, the first feature of the pair (v1) is dropped.
    """
    # One IV per feature, looked up from the iv argument.
    iv_lookup = iv.drop_duplicates('VAR_NAME').set_index('VAR_NAME')['IV']

    cols_to_drop=[]
    for first,second in zip(df['v1'],df['v2']):
        a=float(iv_lookup[first])
        b=float(iv_lookup[second])
        weaker = second if a>b else first
        if weaker not in cols_to_drop:
            cols_to_drop.append(weaker)
    print('Number of features to drop :-',len(cols_to_drop))
    return cols_to_drop

In [None]:
def handling_correlation_by_IV_hrlper(df=None,iv=None):
    """Pick, for every correlated pair, the feature with the lower IV for removal.

    df : iterable of (feature_1, feature_2) pairs; when omitted, the
         notebook-level ``corel_dict['Correl_vars']`` is used
    iv : IV details with columns VAR_NAME and IV; when omitted, the
         notebook-level ``iv_details`` is used. When a feature has several
         rows, the first one is used

    Returns the list of features to drop, in the order they were first chosen.
    When both IVs are equal, the first feature of the pair is dropped.
    """
    # Both parameters need defaults: a parameter without a default may not
    # follow one that has a default. The fallbacks are resolved at call time.
    if df is None:
        df = corel_dict['Correl_vars']
    if iv is None:
        iv = iv_details

    iv_lookup = iv.drop_duplicates('VAR_NAME').set_index('VAR_NAME')['IV']

    cols_to_drop=[]
    for pair in df:
        a=float(iv_lookup[pair[0]])
        b=float(iv_lookup[pair[1]])
        weaker = pair[1] if a>b else pair[0]
        if weaker not in cols_to_drop:
            cols_to_drop.append(weaker)
    return cols_to_drop