In [None]:
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)

In [None]:
def one_hot_encoder(df_prev, nan_as_category = True):
    """One-hot encode every object-dtype column of a frame.

    Parameters
    ----------
    df_prev : pandas.DataFrame
        Frame to encode; it is not modified, a new frame is returned.
    nan_as_category : bool, default True
        Passed to ``pd.get_dummies`` as ``dummy_na``: when true, each encoded
        column also gets an indicator column for missing values.

    Returns
    -------
    tuple
        ``(encoded_frame, added_columns)`` where ``added_columns`` lists, in
        frame order, the column names that were not present in the input.
    """
    columns_before = df_prev.columns.tolist()

    # Only columns stored with the generic 'object' dtype are encoded.
    object_columns = []
    for name in df_prev.columns:
        if df_prev[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(df_prev, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

In [None]:
# Load the previous-applications table into the module-level name `df_prev`,
# which the following cells read and rebind.
df_prev = pd.read_csv('../input/home-credit-default-risk/previous_application.csv')
def rare_encoder(dataframe, rare_perc):
    """Collapse infrequent labels of object-dtype columns into ``'Rare'``.

    For every object-dtype column, each label whose share of rows
    (``value_counts() / len(dataframe)``) is strictly below ``rare_perc`` is
    replaced by the string ``'Rare'``. Missing values are not counted as a
    label and are left as they are.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is copied and never modified.
    rare_perc : float
        Frequency threshold as a fraction of rows (e.g. ``0.01`` for 1%).

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` with rare labels replaced.
    """
    # Operate on the argument, not on a module-level frame, so the function
    # works for any DataFrame and does not depend on hidden notebook state.
    temp_df = dataframe.copy()
    n_rows = len(temp_df)

    for var in temp_df.columns:
        if temp_df[var].dtypes != 'O':
            continue
        frequencies = temp_df[var].value_counts() / n_rows
        rare_labels = frequencies[frequencies < rare_perc].index
        if len(rare_labels) == 0:
            # Nothing to collapse in this column; leave it untouched.
            continue
        temp_df[var] = np.where(temp_df[var].isin(rare_labels), 'Rare', temp_df[var])
    return temp_df

# Rebind `df_prev` to the frame returned by rare_encoder, using 0.01 as the
# rare-label frequency threshold.
df_prev=rare_encoder(df_prev, 0.01)

In [None]:
 # Preview the first rows whose NAME_TYPE_SUITE value equals 'Rare'.
 df_prev[df_prev["NAME_TYPE_SUITE"]=='Rare'].head()

In [None]:
def _flatten_agg_columns(columns, prefix):
    """Turn (column, statistic) pairs into '<prefix><column>_<STATISTIC>' names."""
    return pd.Index([prefix + col + "_" + stat.upper() for col, stat in columns.tolist()])


def _aggregate_contract_status(prev, status, prefix, aggregations):
    """Aggregate per SK_ID_CURR only the rows whose one-hot status flag is set."""
    subset = prev[prev['NAME_CONTRACT_STATUS_' + status] == 1]
    status_agg = subset.groupby('SK_ID_CURR').agg(aggregations)
    status_agg.columns = _flatten_agg_columns(status_agg.columns, prefix)
    return status_agg


def previous_applications(num_rows = None, nan_as_category = True):
    """Build per-client aggregate features from the previous-applications table.

    Reads the module-level ``df_prev`` frame (it is copied, never modified),
    derives ratio/date features, one-hot encodes the object columns with
    ``one_hot_encoder`` and aggregates everything per ``SK_ID_CURR``.

    Parameters
    ----------
    num_rows : int or None, default None
        When given, only the first ``num_rows`` rows of ``df_prev`` are used.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``.

    Returns
    -------
    pandas.DataFrame
        One row per ``SK_ID_CURR`` with ``PREV_*`` columns (all applications)
        plus left-joined ``APPROVED_*`` and ``REFUSED_*`` numeric aggregates.
    """
    # Use a differently named local copy. Rebinding or deleting the name
    # `df_prev` inside this function would make it a local variable, and the
    # very first read would then raise UnboundLocalError. Copying also keeps
    # the cell idempotent: the shared frame is not altered between runs.
    prev = df_prev if num_rows is None else df_prev.head(num_rows)
    prev = prev.copy()

    # 365243 in the day-offset columns is treated as missing. Assign the result
    # back instead of calling replace(inplace=True) on a column selection,
    # which is not guaranteed to write through to the frame.
    day_columns = [
        'DAYS_FIRST_DRAWING',
        'DAYS_FIRST_DUE',
        'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE',
        'DAYS_TERMINATION',
    ]
    prev[day_columns] = prev[day_columns].replace(365243, np.nan)

    # Ratio features.
    prev['NEW_APP_CREDIT_RATE'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    # 1 when the requested amount does not exceed the granted credit, else 0
    # (a missing ratio compares False and therefore yields 0). The flag stays
    # numeric so that it can be aggregated below; casting it to object would
    # let the one-hot encoder consume the column and the aggregation would fail.
    prev['NEW_APP_CREDIT_RATE_RATIO'] = (prev['NEW_APP_CREDIT_RATE'] <= 1).astype(int)
    prev['NEW_AMT_PAYMENT_RATE'] = prev['AMT_CREDIT'] / prev['AMT_ANNUITY']
    prev['NEW_APP_GOODS_RATE'] = prev['AMT_APPLICATION'] / prev['AMT_GOODS_PRICE']
    prev['NEW_CREDIT_GOODS_RATE'] = prev['AMT_CREDIT'] / prev['AMT_GOODS_PRICE']

    # Date-offset features.
    prev['NEW_RETURN_DAY'] = prev['DAYS_DECISION'] + prev['CNT_PAYMENT'] * 30
    prev['NEW_DAYS_TERMINATION_DIFF'] = prev['DAYS_TERMINATION'] - prev['NEW_RETURN_DAY']
    prev['NEW_DAYS_DUE_DIFF'] = prev['DAYS_LAST_DUE_1ST_VERSION'] - prev['DAYS_FIRST_DUE']
    prev['NEW_END_DIFF'] = prev['DAYS_TERMINATION'] - prev['DAYS_LAST_DUE']

    # Payment-count buckets (0, 12], (12, 60], (60, 120]; values outside the
    # bins become missing. Cast to object so the bucket is one-hot encoded.
    prev['NEW_CNT_PAYMENT'] = pd.cut(
        x=prev['CNT_PAYMENT'], bins=[0, 12, 60, 120], labels=["Kısa", "Orta", "Uzun"]
    ).astype("O")

    # Collapse the weekday into WEEKEND / WEEKDAY (anything that is not
    # SATURDAY or SUNDAY, including missing values, becomes WEEKDAY).
    weekend = ["SATURDAY", "SUNDAY"]
    prev['WEEKDAY_APPR_PROCESS_START'] = (
        prev['WEEKDAY_APPR_PROCESS_START']
        .isin(weekend)
        .map({True: "WEEKEND", False: "WEEKDAY"})
    )

    # Object dtype marks these flags as categorical for the encoder.
    prev['NFLAG_LAST_APPL_IN_DAY'] = prev['NFLAG_LAST_APPL_IN_DAY'].astype("O")
    prev['FLAG_LAST_APPL_PER_CONTRACT'] = prev['FLAG_LAST_APPL_PER_CONTRACT'].astype("O")

    prev, cat_cols = one_hot_encoder(prev, nan_as_category=nan_as_category)

    # Aggregation for numeric features
    num_aggregations = {
        'SK_ID_PREV': ['count'],
        'AMT_ANNUITY': ['min', 'max', 'median', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean', 'median'],
        'AMT_CREDIT': ['min', 'max', 'mean', 'median'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean', 'median'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean', 'median'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean', 'median'],
        'DAYS_DECISION': ['min', 'max', 'mean', 'median'],
        'NEW_APP_CREDIT_RATE': ['min', 'max', 'mean', 'var'],
        'NEW_AMT_PAYMENT_RATE': ['min', 'max', 'mean'],
        'NEW_APP_GOODS_RATE': ['min', 'max', 'mean'],
        'NEW_CREDIT_GOODS_RATE': ['min', 'max', 'mean'],
        'NEW_RETURN_DAY': ['min', 'max', 'mean', 'var'],
        'NEW_DAYS_TERMINATION_DIFF': ['min', 'max', 'mean'],
        'NEW_END_DIFF': ['min', 'max', 'mean'],
        'NEW_APP_CREDIT_RATE_RATIO': ['min', 'max', 'mean'],
        'NEW_DAYS_DUE_DIFF': ['min', 'max', 'mean'],
    }

    # Aggregation for categorical features: the mean of each indicator column
    # is the share of a client's applications carrying that label.
    cat_aggregations = {cat: ['mean'] for cat in cat_cols}

    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = _flatten_agg_columns(prev_agg.columns, 'PREV_')

    # Approved / Refused applications - numeric aggregates only, left-joined so
    # clients without such applications keep their row with missing values.
    for status, prefix in (('Approved', 'APPROVED_'), ('Refused', 'REFUSED_')):
        status_agg = _aggregate_contract_status(prev, status, prefix, num_aggregations)
        prev_agg = prev_agg.join(status_agg, how='left', on='SK_ID_CURR')

    del prev, status_agg
    gc.collect()
    return prev_agg