In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import warnings
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
#import lightgbm as lgb
import gc
import sys
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')

In [None]:
#Función cuenta variables categoricas
def count_categoricas(df, group_var, df_name):
    """Calcula conteos y conteos normalizados para cada observación
    de `group_var` de cada categoría única en cada variable categórica
    
    Parámetros
    --------
    df : dataframe 
         El marco de datos para calcular el valor cuenta.
        
    group_var : string
        La variable por la cual agrupar el marco de datos. Para cada unico
        valor de esta variable, el marco de datos final tendrá una fila
        
    df_name : string
        Variable agregada al frente de los nombres de columna para realizar un seguimiento de las columnas

    
    Return
    --------
    categorical : dataframe
        Un marco de datos con recuentos y recuentos normalizados de cada categoría única en cada variable categórica
        con una fila por cada valor único de `group_var`
        
    """
    
    #Seleccion de las columnas categóricas
    categorica = pd.get_dummies(df.select_dtypes('object'))

    # Pone la de identificación en la columna
    categorica[group_var] = df[group_var]

    # Agrupa por el grupo var y calcula la suma y la media
    categorica = categorica.groupby(group_var).agg(['sum', 'mean'])
    
    column_names = []
    
    # Itera a través de las columnas en el nivel 0
    for var in categorica.columns.levels[0]:
        # Iterar a través de las estadísticas en el nivel 1
        for stat in ['count', 'count_norm']:
            # Genera un nuevo nombre de columna
            column_names.append('%s_%s_%s' % (df_name, var, stat))
    
    categorica.columns = column_names
    
    return categorica

In [None]:
#Función cuenta variables numéricas
def agg_numericas(df, group_var, df_name):
    """Agrega los valores numéricos en un marco de datos. Esto 
    se utilizará para crear características para cada instancia de la variable de agrupación.
    
    Parameters
    --------
        df (dataframe): 
            el marco de datos para calcular las estadísticas
        group_var (string): 
            la variable por la cual agrupar df
        df_name (string):            
            la variable utilizada para renombrar las columnas
        
    Return
    --------
        agg (dataframe): 
            un marco de datos con las estadísticas agregadas para
            Todas las columnas numéricas. Cada instancia de la variable de agrupación tendrá
            las estadísticas (media, min, max, suma; actualmente admitidas) calculadas.
            Las columnas también se renombran para realizar un seguimiento de las características creadas.
    
    """
    # Elimina variables de identificación que no sean variables de agrupación
    for col in df:
        if col != group_var and 'SK_ID' in col:
            df = df.drop(columns = col)
            
    group_ids = df[group_var]
    numeric_df = df.select_dtypes('number')
    numeric_df[group_var] = group_ids

    # Agrupa por la variable especificada y calcula las estadísticas
    agg = numeric_df.groupby(group_var).agg(['count', 'mean']).reset_index()

   #Crea nuevos nombres de columna
    columns = [group_var]

   #Itera a través de los nombres de las variables
    for var in agg.columns.levels[0]:
        # Salta la variable de agrupación
        if var != group_var:
            #Itera a través de los nombres de estadísticas
            for stat in agg.columns.levels[1][:-1]:
                # Hace un nuevo nombre de columna para la variable y estadística
                columns.append('%s_%s_%s' % (df_name, var, stat))

    agg.columns = columns
    return agg

In [None]:
#Funcion para cambiar nombres en el df
# Lista de nombres de columna
columns = ['SK_ID_CURR']

#Itera a través de los nombres de las variables
for var in Bureau_agg.columns.levels[0]:
    # Salta el nombre de identificación
    if var != 'SK_ID_CURR':
        
        # Iterate through the stat names
            # Itera a través de los nombres de estadísticas
            columns.append('bureau_%s_%s' % (var, stat))

In [None]:
def agg_numeric(df, parent_var, df_name):
    """
    Agrupa y agrega los valores numéricos en un marco de datos hijo
    por la variable padre.
    
    Parameters
    --------
        df (dataframe): 
            el marco de datos hijo para calcular las estadísticas en
        parent_var (string): 
            La variable principal utilizada para agrupar y agregar
        df_name (string): 
            la variable utilizada para renombrar las columnas
        
    Return
    --------
        agg (dataframe): 
            un marco de datos con las estadísticas agregadas por el `parent_var` para
            Todas las columnas numéricas. Cada observación de la variable principal tendrá
            una fila en el marco de datos con la variable principal como índice.
            Las columnas también se renombran usando el `df_name`. Columnas con todos los duplicados.
            Se eliminan los valores.
    
    """
      
    # Elimina variables de identificación que no sean variables de agrupación
    for col in df:
        if col != parent_var and 'SK_ID' in col:
            df = df.drop(columns = col)
            
    # Solo toma las variables numéricas
    parent_ids = df[parent_var].copy()
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[parent_var] = parent_ids
    
    # Agrupa por la variable especificada y calcula las estadísticas
    agg = numeric_df.groupby(parent_var).agg(['count', 'mean'])

    # Necesita crear nuevos nombres de columna
    columns = []

    # Itera a través de los nombres de las variables
    for var in agg.columns.levels[0]:
        if var != parent_var:
            # Itera a través de los nombres de estadísticas
            for stat in agg.columns.levels[1]:
                # Genera un nuevo nombre de columna para la variable y estadística
                columns.append('%s_%s_%s' % (df_name, var, stat))
    
    agg.columns = columns
    
    # Elimina las columnas con todos los valores redundantes
    _, idx = np.unique(agg, axis = 1, return_index=True)
    agg = agg.iloc[:, idx]
    
    return agg

In [None]:
def agg_categorical(df, parent_var, df_name):
    """
    Agrega las características categóricas en un marco de datos hijo
    para cada observación de la variable principal.
    
    Parameters
    --------
    df : dataframe 
        El marco de datos para calcular el valor cuenta.
        
    parent_var : string
        La variable por la cual agrupar y agregar el marco de datos. Para cada unico
        valor de esta variable, el marco de datos final tendrá una fila
        
    df_name : string
       Variable agregada al frente de los nombres de columna para realizar un seguimiento de las columnas

    
    Return
    --------
    categorical : dataframe
        Un marco de datos con estadísticas agregadas para cada observación de parent_var
        Las columnas también se renombran y las columnas con valores duplicados se eliminan.
        
    """
       
    # Selecciona las columnas categóricas
    categorical = pd.get_dummies(df.select_dtypes('category'))

    # Asegura poner la identificación de identificación en la columna
    categorical[parent_var] = df[parent_var]

    # Agrupa por el grupo var y calcula la suma y la media
    categorical = categorical.groupby(parent_var).agg(['count', 'mean'])
    
    column_names = []
    
    # Iterar a través de las columnas en el nivel 0
    for var in categorical.columns.levels[0]:
        # Iterate through the stats in level 1
        for stat in ['count', 'mean']:
            # Iterar a través de las estadísticas en el nivel 1
            column_names.append('%s_%s_%s' % (df_name, var, stat))
    
    categorical.columns = column_names
        
    # Elimina columnas duplicadas por valores
    _, idx = np.unique(categorical, axis = 1, return_index = True)
    categorical = categorical.iloc[:, idx]
    
    return categorical

In [None]:
def aggregate_client(df, group_vars, df_names):
    """Agregar un marco de datos con datos a nivel de préstamo
    a nivel del cliente
    
    Args:
        df (dataframe): datos a nivel de préstamo
        group_vars (lista de dos cadenas): agrupando variables para el préstamo
        y luego el cliente (ejemplo ['SK_ID_PREV', 'SK_ID_CURR'])
        nombres (lista de dos cadenas): nombres para llamar a las columnas resultantes
        (ejemplo ['efectivo', 'cliente'])
        
    Returns:
        df_client (dataframe): estadísticas numéricas agregadas a nivel del cliente.
       Cada cliente tendrá una sola fila con todos los datos numéricos agregados
    """
    
    # Agrega las columnas numéricas
    df_agg = agg_numeric(df, parent_var = group_vars[0], df_name = df_names[0])
    
    # Si hay variables categóricas
    if any(df.dtypes == 'category'):
    
        # Cuenta las columnas categóricas
        df_counts = agg_categorical(df, parent_var = group_vars[0], df_name = df_names[0])

        # Fusiona la numérica y categórica
        df_by_loan = df_counts.merge(df_agg, on = group_vars[0], how = 'outer')

        gc.enable()
        del df_agg, df_counts
        gc.collect()

        # Combina para obtener la identificación del cliente en el marco de datos
        df_by_loan = df_by_loan.merge(df[[group_vars[0], group_vars[1]]], on = group_vars[0], how = 'left')

        # Elimina la identificación del préstamo
        df_by_loan = df_by_loan.drop(columns = [group_vars[0]])

        # Agrega estadísticas numéricas por columna
        df_by_client = agg_numeric(df_by_loan, parent_var = group_vars[1], df_name = df_names[1])

        
    # No hay variables categóricas
    else:
        # Combina para obtener la identificación del cliente en el marco de datos
        df_by_loan = df_agg.merge(df[[group_vars[0], group_vars[1]]], on = group_vars[0], how = 'left')
        
        gc.enable()
        del df_agg
        gc.collect()
        
        # Elimina la identificación del préstamo
        df_by_loan = df_by_loan.drop(columns = [group_vars[0]])
                
        # Agrega estadísticas numéricas por columna
        df_by_client = agg_numeric(df_by_loan, parent_var = group_vars[1], df_name = df_names[1])
        
    # Gestión de la memoria
    gc.enable()
    del df, df_by_loan
    gc.collect()

    return df_by_client

In [41]:
#1- Cargo fichero bureau_balance.csv y veo cabecera 
BureauBalance = pd.read_csv('/Users/LENOVO/Downloads/bureau_balance.csv', delimiter= ',' , header=0)
#BureauBalance.head()
BureauBalance

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C
...,...,...,...
27299920,5041336,-47,X
27299921,5041336,-48,X
27299922,5041336,-49,X
27299923,5041336,-50,X


In [42]:
BureauBalance.dtypes.value_counts()

int64     2
object    1
dtype: int64

In [45]:
#Recuento de cada tipo de estado de las variables CATEGORICAS para cada préstamo anterior por campo 'SK_ID_BUREAU'
BureauBalance_counts = count_categoricas(BureauBalance, group_var = 'SK_ID_BUREAU', df_name = 'bureau_balance')
#BureauBalance_counts.head()
BureauBalance_counts

Unnamed: 0_level_0,bureau_balance_STATUS_0_count,bureau_balance_STATUS_0_count_norm,bureau_balance_STATUS_1_count,bureau_balance_STATUS_1_count_norm,bureau_balance_STATUS_2_count,bureau_balance_STATUS_2_count_norm,bureau_balance_STATUS_3_count,bureau_balance_STATUS_3_count_norm,bureau_balance_STATUS_4_count,bureau_balance_STATUS_4_count_norm,bureau_balance_STATUS_5_count,bureau_balance_STATUS_5_count_norm,bureau_balance_STATUS_C_count,bureau_balance_STATUS_C_count_norm,bureau_balance_STATUS_X_count,bureau_balance_STATUS_X_count_norm
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5001709,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,86,0.886598,11,0.113402
5001710,5,0.060241,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,48,0.578313,30,0.361446
5001711,3,0.750000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.000000,1,0.250000
5001712,10,0.526316,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,9,0.473684,0,0.000000
5001713,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.000000,22,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6842884,9,0.187500,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,20,0.416667,19,0.395833
6842885,12,0.500000,0,0.000000,0,0.0,0,0.0,0,0.0,12,0.5,0,0.000000,0,0.000000
6842886,8,0.242424,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,25,0.757576,0,0.000000
6842887,6,0.162162,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,31,0.837838,0,0.000000


In [46]:
# Calculo estadísticas de recuento de variables NUMERICAS para cada 'SK_ID_BUREAU' 
BureauBalance_agg = agg_numericas(BureauBalance, group_var = 'SK_ID_BUREAU', df_name = 'bureau_balance')
#dfBureau_balance_agg.head()
BureauBalance_agg

Unnamed: 0,SK_ID_BUREAU,bureau_balance_MONTHS_BALANCE_count,bureau_balance_MONTHS_BALANCE_mean
0,5001709,97,-48.0
1,5001710,83,-41.0
2,5001711,4,-1.5
3,5001712,19,-9.0
4,5001713,22,-10.5
...,...,...,...
817390,6842884,48,-23.5
817391,6842885,24,-11.5
817392,6842886,33,-16.0
817393,6842887,37,-18.0


In [47]:
# Dataframe agrupando BureauBalance_agg con BureauBalance_counts por préstamo anterior
BureauBalance_by_loan = BureauBalance_agg.merge(BureauBalance_counts, right_index = True, left_on = 'SK_ID_BUREAU', how = 'outer')
BureauBalance_by_loan

Unnamed: 0,SK_ID_BUREAU,bureau_balance_MONTHS_BALANCE_count,bureau_balance_MONTHS_BALANCE_mean,bureau_balance_STATUS_0_count,bureau_balance_STATUS_0_count_norm,bureau_balance_STATUS_1_count,bureau_balance_STATUS_1_count_norm,bureau_balance_STATUS_2_count,bureau_balance_STATUS_2_count_norm,bureau_balance_STATUS_3_count,bureau_balance_STATUS_3_count_norm,bureau_balance_STATUS_4_count,bureau_balance_STATUS_4_count_norm,bureau_balance_STATUS_5_count,bureau_balance_STATUS_5_count_norm,bureau_balance_STATUS_C_count,bureau_balance_STATUS_C_count_norm,bureau_balance_STATUS_X_count,bureau_balance_STATUS_X_count_norm
0,5001709,97,-48.0,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,86,0.886598,11,0.113402
1,5001710,83,-41.0,5,0.060241,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,48,0.578313,30,0.361446
2,5001711,4,-1.5,3,0.750000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.000000,1,0.250000
3,5001712,19,-9.0,10,0.526316,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,9,0.473684,0,0.000000
4,5001713,22,-10.5,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.000000,22,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
817390,6842884,48,-23.5,9,0.187500,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,20,0.416667,19,0.395833
817391,6842885,24,-11.5,12,0.500000,0,0.000000,0,0.0,0,0.0,0,0.0,12,0.5,0,0.000000,0,0.000000
817392,6842886,33,-16.0,8,0.242424,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,25,0.757576,0,0.000000
817393,6842887,37,-18.0,6,0.162162,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,31,0.837838,0,0.000000


In [48]:
#2- Cargo fichero bureau.csv
Bureau = pd.read_csv('/Users/LENOVO/Downloads/bureau.csv', delimiter= ',' , header=0)

In [49]:
# Fusion para incluir la SK_ID_CURR
BureauBureauBalance_by_loan = BureauBalance_by_loan.merge(Bureau[['SK_ID_BUREAU', 'SK_ID_CURR']], on = 'SK_ID_BUREAU', how = 'left')
#BureauBureauBalance_by_loan
BureauBureauBalance_by_loan.head()

Unnamed: 0,SK_ID_BUREAU,bureau_balance_MONTHS_BALANCE_count,bureau_balance_MONTHS_BALANCE_mean,bureau_balance_STATUS_0_count,bureau_balance_STATUS_0_count_norm,bureau_balance_STATUS_1_count,bureau_balance_STATUS_1_count_norm,bureau_balance_STATUS_2_count,bureau_balance_STATUS_2_count_norm,bureau_balance_STATUS_3_count,bureau_balance_STATUS_3_count_norm,bureau_balance_STATUS_4_count,bureau_balance_STATUS_4_count_norm,bureau_balance_STATUS_5_count,bureau_balance_STATUS_5_count_norm,bureau_balance_STATUS_C_count,bureau_balance_STATUS_C_count_norm,bureau_balance_STATUS_X_count,bureau_balance_STATUS_X_count_norm,SK_ID_CURR
0,5001709,97,-48.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,86,0.886598,11,0.113402,
1,5001710,83,-41.0,5,0.060241,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,48,0.578313,30,0.361446,162368.0
2,5001711,4,-1.5,3,0.75,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.25,162368.0
3,5001712,19,-9.0,10,0.526316,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,9,0.473684,0,0.0,162368.0
4,5001713,22,-10.5,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,22,1.0,150635.0


In [50]:
##***** Agrega las estadísticas de prestamos por 'SK_ID_CURR'. Elimina columna 'SK_ID_BUREAU'****
BureauBureauBalance_by_client = agg_numericas(BureauBureauBalance_by_loan.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'client')
#dfBureau_balance_by_client.head()
BureauBureauBalance_by_client

Unnamed: 0,SK_ID_CURR,client_bureau_balance_MONTHS_BALANCE_count_count,client_bureau_balance_MONTHS_BALANCE_count_mean,client_bureau_balance_MONTHS_BALANCE_mean_count,client_bureau_balance_MONTHS_BALANCE_mean_mean,client_bureau_balance_STATUS_0_count_count,client_bureau_balance_STATUS_0_count_mean,client_bureau_balance_STATUS_0_count_norm_count,client_bureau_balance_STATUS_0_count_norm_mean,client_bureau_balance_STATUS_1_count_count,...,client_bureau_balance_STATUS_5_count_norm_count,client_bureau_balance_STATUS_5_count_norm_mean,client_bureau_balance_STATUS_C_count_count,client_bureau_balance_STATUS_C_count_mean,client_bureau_balance_STATUS_C_count_norm_count,client_bureau_balance_STATUS_C_count_norm_mean,client_bureau_balance_STATUS_X_count_count,client_bureau_balance_STATUS_X_count_mean,client_bureau_balance_STATUS_X_count_norm_count,client_bureau_balance_STATUS_X_count_norm_mean
0,100001.0,7,24.571429,7,-11.785714,7,4.428571,7,0.336651,7,...,7,0.0,7,15.714286,7,0.441240,7,4.285714,7,0.214590
1,100002.0,8,13.750000,8,-21.875000,8,5.625000,8,0.406960,8,...,8,0.0,8,2.875000,8,0.175426,8,1.875000,8,0.161932
2,100005.0,3,7.000000,3,-3.000000,3,4.666667,3,0.735043,3,...,3,0.0,3,1.666667,3,0.128205,3,0.666667,3,0.136752
3,100010.0,2,36.000000,2,-46.000000,2,10.000000,2,0.277778,2,...,2,0.0,2,26.000000,2,0.722222,2,0.000000,2,0.000000
4,100013.0,4,57.500000,4,-28.250000,4,19.750000,4,0.320718,4,...,4,0.0,4,25.750000,4,0.397036,4,10.250000,4,0.254545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134537,456247.0,11,29.090909,11,-19.863636,11,6.000000,11,0.325528,11,...,11,0.0,11,19.909091,11,0.505634,11,3.181818,11,0.168838
134538,456250.0,3,29.000000,3,-14.000000,3,4.000000,3,0.130259,3,...,3,0.0,3,8.333333,3,0.252525,3,16.666667,3,0.617216
134539,456253.0,4,29.250000,4,-14.125000,4,11.750000,4,0.404906,4,...,4,0.0,4,14.250000,4,0.459677,4,3.250000,4,0.135417
134540,456254.0,1,37.000000,1,-18.000000,1,8.000000,1,0.216216,1,...,1,0.0,1,29.000000,1,0.783784,1,0.000000,1,0.000000


In [51]:
#BureauBalance.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

In [52]:
#BureauBalance_counts.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

In [53]:
#BureauBalance_agg.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

In [54]:
#BureauBureauBalance_by_loan.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

In [55]:
#BureauBureauBalance_by_client.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

In [56]:
#Bureau.dtypes.value_counts()

In [57]:
#2- Cargo fichero bureau.csv y veo cabecera
#Bureau = pd.read_csv('/Users/LENOVO/Downloads/bureau.csv', delimiter= ',' , header=0)
#Bureau.head()
Bureau

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.00,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.00,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.50,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.00,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.00,,,0.0,Consumer credit,-21,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1716423,259355,5057750,Active,currency 1,-44,0,-30.0,,0.0,0,11250.00,11250.0,0.0,0.0,Microloan,-19,
1716424,100044,5057754,Closed,currency 1,-2648,0,-2433.0,-2493.0,5476.5,0,38130.84,0.0,0.0,0.0,Consumer credit,-2493,
1716425,100044,5057762,Closed,currency 1,-1809,0,-1628.0,-970.0,,0,15570.00,,,0.0,Consumer credit,-967,
1716426,246829,5057770,Closed,currency 1,-1878,0,-1513.0,-1513.0,,0,36000.00,0.0,0.0,0.0,Consumer credit,-1508,


In [58]:
#Se cuentan los prestamos anteriores por campo SK_ID_BUREAU y se agrupan por 'SK_ID_BUREAU'
Previous_loan_counts = Bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count().rename(columns = {'SK_ID_BUREAU': 'previous_loan_counts'})
Previous_loan_counts

Unnamed: 0,SK_ID_CURR,previous_loan_counts
0,100001,7
1,100002,8
2,100003,4
3,100004,2
4,100005,3
...,...,...
305806,456249,13
305807,456250,3
305808,456253,4
305809,456254,1


In [59]:
#Genera recuento y estadisticas de las variables para cada préstamo anterior eliminando columna 'SK_ID_BUREAU'y
#agrupando por campo 'SK_ID_CURR'
Bureau_agg = Bureau.drop(columns = ['SK_ID_BUREAU']).groupby('SK_ID_CURR',
as_index = False).agg(['count', 'mean']).reset_index() 
#Bureau_agg.head()
Bureau_agg

Unnamed: 0_level_0,SK_ID_CURR,DAYS_CREDIT,DAYS_CREDIT,CREDIT_DAY_OVERDUE,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,...,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,AMT_CREDIT_SUM_OVERDUE,DAYS_CREDIT_UPDATE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,AMT_ANNUITY
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,count,mean,count,mean,count,mean,count,...,count,mean,count,mean,count,mean,count,mean,count,mean
0,100001,7,-735.000000,7,0.0,7,82.428571,4,-825.500000,0,...,7,85240.928571,6,0.00000,7,0.0,7,-93.142857,7,3545.357143
1,100002,8,-874.000000,8,0.0,6,-349.000000,6,-697.500000,5,...,5,49156.200000,4,7997.14125,8,0.0,8,-499.875000,7,0.000000
2,100003,4,-1400.750000,4,0.0,4,-544.500000,3,-1097.333333,4,...,4,0.000000,4,202500.00000,4,0.0,4,-816.000000,0,
3,100004,2,-867.000000,2,0.0,2,-488.500000,2,-532.500000,1,...,2,0.000000,2,0.00000,2,0.0,2,-532.000000,0,
4,100005,3,-190.666667,3,0.0,3,439.333333,1,-123.000000,1,...,3,189469.500000,3,0.00000,3,0.0,3,-54.333333,3,1420.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305806,456249,13,-1667.076923,13,0.0,12,-1232.333333,12,-1364.750000,5,...,10,16307.100000,7,0.00000,13,0.0,13,-1064.538462,0,
305807,456250,3,-862.000000,3,0.0,3,1288.333333,1,-760.000000,2,...,3,744013.365000,3,19422.79500,3,0.0,3,-60.333333,3,154567.965000
305808,456253,4,-867.500000,4,0.0,4,280.500000,2,-794.000000,0,...,4,448958.250000,4,0.00000,4,0.0,4,-253.250000,3,58369.500000
305809,456254,1,-1104.000000,1,0.0,1,-859.000000,1,-859.000000,0,...,1,0.000000,0,,1,0.0,1,-401.000000,1,0.000000


In [61]:
#**** Asigna la lista de nombres de columnas como los nombres de columna del marco de datos***
Bureau_agg.columns = columns
#dfBureau_agg.head()
Bureau_agg

Unnamed: 0,SK_ID_CURR,bureau_DAYS_CREDIT_count,bureau_DAYS_CREDIT_mean,bureau_CREDIT_DAY_OVERDUE_count,bureau_CREDIT_DAY_OVERDUE_mean,bureau_DAYS_CREDIT_ENDDATE_count,bureau_DAYS_CREDIT_ENDDATE_mean,bureau_DAYS_ENDDATE_FACT_count,bureau_DAYS_ENDDATE_FACT_mean,bureau_AMT_CREDIT_MAX_OVERDUE_count,...,bureau_AMT_CREDIT_SUM_DEBT_count,bureau_AMT_CREDIT_SUM_DEBT_mean,bureau_AMT_CREDIT_SUM_LIMIT_count,bureau_AMT_CREDIT_SUM_LIMIT_mean,bureau_AMT_CREDIT_SUM_OVERDUE_count,bureau_AMT_CREDIT_SUM_OVERDUE_mean,bureau_DAYS_CREDIT_UPDATE_count,bureau_DAYS_CREDIT_UPDATE_mean,bureau_AMT_ANNUITY_count,bureau_AMT_ANNUITY_mean
0,100001,7,-735.000000,7,0.0,7,82.428571,4,-825.500000,0,...,7,85240.928571,6,0.00000,7,0.0,7,-93.142857,7,3545.357143
1,100002,8,-874.000000,8,0.0,6,-349.000000,6,-697.500000,5,...,5,49156.200000,4,7997.14125,8,0.0,8,-499.875000,7,0.000000
2,100003,4,-1400.750000,4,0.0,4,-544.500000,3,-1097.333333,4,...,4,0.000000,4,202500.00000,4,0.0,4,-816.000000,0,
3,100004,2,-867.000000,2,0.0,2,-488.500000,2,-532.500000,1,...,2,0.000000,2,0.00000,2,0.0,2,-532.000000,0,
4,100005,3,-190.666667,3,0.0,3,439.333333,1,-123.000000,1,...,3,189469.500000,3,0.00000,3,0.0,3,-54.333333,3,1420.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305806,456249,13,-1667.076923,13,0.0,12,-1232.333333,12,-1364.750000,5,...,10,16307.100000,7,0.00000,13,0.0,13,-1064.538462,0,
305807,456250,3,-862.000000,3,0.0,3,1288.333333,1,-760.000000,2,...,3,744013.365000,3,19422.79500,3,0.0,3,-60.333333,3,154567.965000
305808,456253,4,-867.500000,4,0.0,4,280.500000,2,-794.000000,0,...,4,448958.250000,4,0.00000,4,0.0,4,-253.250000,3,58369.500000
305809,456254,1,-1104.000000,1,0.0,1,-859.000000,1,-859.000000,0,...,1,0.000000,0,,1,0.0,1,-401.000000,1,0.000000


In [65]:
#3-Cargo fichero installments_payments.csv y veo cabecera
installments = pd.read_csv('/Users/LENOVO/Downloads/installments_payments.csv', delimiter= ',' , header=0)
#installments.head()
installments

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.360,6948.360
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.000,25425.000
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.130,24350.130
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.040,2160.585
...,...,...,...,...,...,...,...,...
13605396,2186857,428057,0.0,66,-1624.0,,67.500,
13605397,1310347,414406,0.0,47,-1539.0,,67.500,
13605398,1308766,402199,0.0,43,-7.0,,43737.435,
13605399,1062206,409297,0.0,43,-1986.0,,67.500,


In [66]:
installments.dtypes.value_counts()

float64    5
int64      3
dtype: int64

In [67]:
##***** Agrega las estadísticas de prestamos por 'SK_ID_CURR'. Elimina columna 'SK_ID_BUREAU'****
installments_by_client = aggregate_client(installments, group_vars = ['SK_ID_PREV', 'SK_ID_CURR'], df_names = ['installments', 'client'])
#installments_by_client.head()
installments_by_client

Unnamed: 0_level_0,client_installments_DAYS_ENTRY_PAYMENT_mean_mean,client_installments_DAYS_INSTALMENT_mean_mean,client_installments_NUM_INSTALMENT_VERSION_mean_mean,client_installments_NUM_INSTALMENT_NUMBER_mean_mean,client_installments_DAYS_ENTRY_PAYMENT_count_mean,client_installments_NUM_INSTALMENT_VERSION_count_mean,client_installments_DAYS_ENTRY_PAYMENT_mean_count,client_installments_DAYS_INSTALMENT_mean_count,client_installments_AMT_PAYMENT_mean_mean,client_installments_AMT_INSTALMENT_mean_mean
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100001,-2195.000000,-2187.714286,1.142857,2.714286,3.571429,3.571429,7,7,5885.132143,5885.132143
100002,-315.421053,-295.000000,1.052632,10.000000,19.000000,19.000000,19,19,11559.247105,11559.247105
100003,-1385.320000,-1378.160000,1.040000,5.080000,9.160000,9.160000,25,25,64754.586000,64754.586000
100004,-761.666667,-754.000000,1.333333,2.000000,3.000000,3.000000,3,3,7096.155000,7096.155000
100005,-609.555556,-586.000000,1.111111,5.000000,9.000000,9.000000,9,9,6240.205000,6240.205000
...,...,...,...,...,...,...,...,...,...,...
456251,-156.285714,-120.000000,1.142857,4.000000,7.000000,7.000000,7,7,7492.924286,7492.924286
456252,-2393.833333,-2391.000000,1.000000,3.500000,6.000000,6.000000,6,6,10069.867500,10069.867500
456253,-2387.428571,-2372.928571,1.000000,4.785714,5.000000,5.000000,14,14,4115.915357,4399.707857
456254,-161.263158,-142.263158,1.000000,5.263158,9.526316,9.526316,19,19,10239.832895,10239.832895


In [68]:
#4-Cargo fichero POS_CASH_balance.csv y veo cabecera
cash = pd.read_csv('/Users/LENOVO/Downloads/POS_CASH_balance.csv', delimiter= ',' , header=0)
#cash.head()
cash

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0
...,...,...,...,...,...,...,...,...
10001353,2448283,226558,-20,6.0,0.0,Active,843,0
10001354,1717234,141565,-19,12.0,0.0,Active,602,0
10001355,1283126,315695,-21,10.0,0.0,Active,609,0
10001356,1082516,450255,-22,12.0,0.0,Active,614,0


In [69]:
cash.dtypes.value_counts()

int64      5
float64    2
object     1
dtype: int64

In [70]:
##***** Agrega las estadísticas de cash por 'SK_ID_CURR'.****
cash_by_client = aggregate_client(cash, group_vars = ['SK_ID_PREV', 'SK_ID_CURR'], df_names = ['cash', 'client'])
#cash_by_client.head()
cash_by_client

Unnamed: 0_level_0,client_cash_MONTHS_BALANCE_mean_mean,client_cash_SK_DPD_DEF_mean_mean,client_cash_SK_DPD_mean_mean,client_cash_CNT_INSTALMENT_FUTURE_mean_mean,client_cash_CNT_INSTALMENT_mean_mean,client_cash_CNT_INSTALMENT_count_mean,client_cash_CNT_INSTALMENT_FUTURE_count_mean,client_cash_MONTHS_BALANCE_count_mean,client_cash_CNT_INSTALMENT_FUTURE_mean_count,client_cash_MONTHS_BALANCE_mean_count
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100001,-72.555556,0.777778,0.777778,1.444444,4.000000,4.555556,4.555556,4.555556,9,9
100002,-10.000000,0.000000,0.000000,15.000000,24.000000,19.000000,19.000000,19.000000,19,19
100003,-43.785714,0.000000,0.000000,5.785714,10.107143,9.714286,9.714286,9.714286,28,28
100004,-25.500000,0.000000,0.000000,2.250000,3.750000,4.000000,4.000000,4.000000,4,4
100005,-20.000000,0.000000,0.000000,7.200000,11.700000,10.000000,10.000000,11.000000,11,11
...,...,...,...,...,...,...,...,...,...,...
456251,-5.000000,0.000000,0.000000,4.375000,7.875000,8.000000,8.000000,9.000000,9,9
456252,-79.000000,0.000000,0.000000,3.000000,6.000000,7.000000,7.000000,7.000000,7,7
456253,-79.235294,0.294118,0.294118,2.000000,6.705882,5.941176,5.941176,5.941176,17,17
456254,-5.550000,0.000000,0.000000,10.350000,14.900000,10.100000,10.100000,10.100000,20,20


In [71]:
#5-Cargo fichero credit_card_balance.csv y veo cabecera
credit = pd.read_csv('/Users/LENOVO/Downloads/credit_card_balance.csv', delimiter= ',' , header=0)
#credit.head()
credit

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.970,135000,0.0,877.5,0.0,877.5,1700.325,...,0.000,0.000,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.000,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.000,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.110,225000,2250.0,2250.0,0.0,0.0,11795.760,...,233048.970,233048.970,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.890,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3840307,1036507,328243,-9,0.000,45000,,0.0,,,0.000,...,0.000,0.000,,0,,,0.0,Active,0,0
3840308,1714892,347207,-9,0.000,45000,0.0,0.0,0.0,0.0,0.000,...,0.000,0.000,0.0,0,0.0,0.0,23.0,Active,0,0
3840309,1302323,215757,-9,275784.975,585000,270000.0,270000.0,0.0,0.0,2250.000,...,273093.975,273093.975,2.0,2,0.0,0.0,18.0,Active,0,0
3840310,1624872,430337,-10,0.000,450000,,0.0,,,0.000,...,0.000,0.000,,0,,,0.0,Active,0,0


In [72]:
credit.dtypes.value_counts()

float64    15
int64       7
object      1
dtype: int64

In [73]:
##***** Agrega las estadísticas de credit_card_balance por 'SK_ID_CURR'.
credit_by_client = aggregate_client(credit, group_vars = ['SK_ID_PREV', 'SK_ID_CURR'], df_names = ['credit', 'client'])
#credit_by_client.head()
credit_by_client

Unnamed: 0_level_0,client_credit_MONTHS_BALANCE_mean_mean,client_credit_SK_DPD_DEF_mean_mean,client_credit_SK_DPD_mean_mean,client_credit_CNT_DRAWINGS_CURRENT_mean_mean,client_credit_CNT_INSTALMENT_MATURE_CUM_mean_mean,client_credit_AMT_DRAWINGS_ATM_CURRENT_count_mean,client_credit_AMT_PAYMENT_CURRENT_count_mean,client_credit_AMT_PAYMENT_CURRENT_mean_count,client_credit_CNT_DRAWINGS_OTHER_CURRENT_mean_count,client_credit_AMT_DRAWINGS_CURRENT_mean_mean,...,client_credit_MONTHS_BALANCE_count_mean,client_credit_MONTHS_BALANCE_mean_count,client_credit_AMT_CREDIT_LIMIT_ACTUAL_mean_mean,client_credit_CNT_DRAWINGS_OTHER_CURRENT_mean_mean,client_credit_AMT_DRAWINGS_OTHER_CURRENT_mean_mean,client_credit_CNT_DRAWINGS_POS_CURRENT_mean_mean,client_credit_AMT_DRAWINGS_POS_CURRENT_mean_mean,client_credit_CNT_DRAWINGS_ATM_CURRENT_mean_mean,client_credit_AMT_DRAWINGS_ATM_CURRENT_mean_mean,client_credit_AMT_PAYMENT_CURRENT_mean_mean
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100006,-3.5,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0,0,0.000000,...,6.0,6,270000.000000,,,,,,,
100011,-38.5,0.000000,0.000000,0.054054,25.767123,74.0,74.0,74,74,2432.432432,...,74.0,74,164189.189189,0.0,0.0,0.000000,0.000000,0.054054,2432.432432,4843.064189
100013,-48.5,0.010417,0.010417,0.239583,18.719101,90.0,96.0,96,96,5953.125000,...,96.0,96,131718.750000,0.0,0.0,0.000000,0.000000,0.255556,6350.000000,7168.346250
100021,-10.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0,0,0.000000,...,17.0,17,675000.000000,,,,,,,
100023,-7.5,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0,0,0.000000,...,8.0,8,135000.000000,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456244,-21.0,0.000000,0.000000,1.365854,13.600000,41.0,41.0,41,41,26842.388049,...,41.0,41,296341.463415,0.0,0.0,0.317073,2363.015854,1.048780,24475.609756,32720.544878
456246,-5.5,0.000000,0.000000,2.500000,3.500000,8.0,7.0,8,8,15199.256250,...,8.0,8,135000.000000,0.0,0.0,2.500000,15199.256250,0.000000,0.000000,18778.275000
456247,-49.0,0.021053,0.031579,0.147368,26.494737,95.0,95.0,95,95,2149.506474,...,95.0,95,144000.000000,0.0,0.0,0.031579,13.190684,0.115789,2136.315789,4883.755263
456248,-13.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0,0,0.000000,...,23.0,23,900000.000000,,,,,,,


In [74]:
#6-Cargo fichero previous_application.csv y veo cabecera
previous = pd.read_csv('/Users/LENOVO/Downloads/previous_application.csv', delimiter= ',' , header=0)
#previous.head()
previous

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.430,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1670209,2300464,352015,Consumer loans,14704.290,267295.5,311400.0,0.0,267295.5,WEDNESDAY,12,...,Furniture,30.0,low_normal,POS industry with interest,365243.0,-508.0,362.0,-358.0,-351.0,0.0
1670210,2357031,334635,Consumer loans,6622.020,87750.0,64291.5,29250.0,87750.0,TUESDAY,15,...,Furniture,12.0,middle,POS industry with interest,365243.0,-1604.0,-1274.0,-1304.0,-1297.0,0.0
1670211,2659632,249544,Consumer loans,11520.855,105237.0,102523.5,10525.5,105237.0,MONDAY,12,...,Consumer electronics,10.0,low_normal,POS household with interest,365243.0,-1457.0,-1187.0,-1187.0,-1181.0,0.0
1670212,2785582,400317,Cash loans,18821.520,180000.0,191880.0,,180000.0,WEDNESDAY,9,...,XNA,12.0,low_normal,Cash X-Sell: low,365243.0,-1155.0,-825.0,-825.0,-817.0,1.0


In [75]:
previous.dtypes.value_counts()

object     16
float64    15
int64       6
dtype: int64

In [76]:
# Calculate aggregate statistics for each numeric column
previous_agg = agg_numeric(previous, 'SK_ID_CURR', 'previous')
#previous_agg.head()
previous_agg

Unnamed: 0_level_0,previous_DAYS_DECISION_mean,previous_DAYS_FIRST_DUE_mean,previous_DAYS_LAST_DUE_mean,previous_DAYS_TERMINATION_mean,previous_DAYS_LAST_DUE_1ST_VERSION_mean,previous_RATE_INTEREST_PRIMARY_count,previous_NFLAG_INSURED_ON_APPROVAL_mean,previous_RATE_DOWN_PAYMENT_mean,previous_NFLAG_LAST_APPL_IN_DAY_mean,previous_AMT_DOWN_PAYMENT_count,...,previous_HOUR_APPR_PROCESS_START_mean,previous_SELLERPLACE_AREA_mean,previous_AMT_DOWN_PAYMENT_mean,previous_AMT_ANNUITY_mean,previous_AMT_CREDIT_mean,previous_AMT_APPLICATION_mean,previous_AMT_GOODS_PRICE_mean,previous_DAYS_FIRST_DRAWING_mean,previous_RATE_INTEREST_PRIMARY_mean,previous_RATE_INTEREST_PRIVILEGED_mean
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,-1740.000,-1709.000000,-1619.000000,-1612.000000,-1499.000000,0,0.000000,0.104326,1.0,1,...,13.000000,23.000,2520.00,3951.000000,23787.00,24835.500,24835.500,365243.0,,
100002,-606.000,-565.000000,-25.000000,-17.000000,125.000000,0,0.000000,0.000000,1.0,1,...,9.000000,500.000,0.00,9251.775000,179055.00,179055.000,179055.000,365243.0,,
100003,-1305.000,-1274.333333,-1054.333333,-1047.333333,-1004.333333,0,0.666667,0.050030,1.0,2,...,14.666667,533.000,3442.50,56553.990000,484191.00,435436.500,435436.500,365243.0,,
100004,-815.000,-784.000000,-724.000000,-714.000000,-694.000000,0,0.000000,0.212008,1.0,1,...,5.000000,30.000,4860.00,5357.250000,20106.00,24282.000,24282.000,365243.0,,
100005,-536.000,-706.000000,-466.000000,-460.000000,-376.000000,0,0.000000,0.108964,1.0,1,...,10.500000,18.000,4464.00,4813.200000,20076.75,22308.750,44617.500,365243.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456251,-273.000,-210.000000,-30.000000,-25.000000,0.000000,0,0.000000,0.000000,1.0,1,...,17.000000,30.000,0.00,6605.910000,40455.00,40455.000,40455.000,365243.0,,
456252,-2497.000,-2466.000000,-2316.000000,-2311.000000,-2316.000000,0,1.000000,0.062443,1.0,1,...,10.000000,190.000,3456.00,10074.465000,56821.50,57595.500,57595.500,365243.0,,
456253,-2380.000,-2339.000000,-2219.000000,-2212.500000,-2219.000000,0,0.500000,0.214316,1.0,2,...,11.500000,22.000,4403.25,4770.405000,20625.75,24162.750,24162.750,365243.0,,
456254,-299.500,-269.000000,365243.000000,365243.000000,151.000000,0,0.500000,0.000000,1.0,2,...,15.000000,1578.000,0.00,10681.132500,134439.75,121317.750,121317.750,365243.0,,


In [77]:
# Calculate value counts for each categorical column
previous_counts = count_categoricas(previous, 'SK_ID_CURR', 'previous')
#previous_counts.head()
previous_counts

Unnamed: 0_level_0,previous_NAME_CONTRACT_TYPE_Cash loans_count,previous_NAME_CONTRACT_TYPE_Cash loans_count_norm,previous_NAME_CONTRACT_TYPE_Consumer loans_count,previous_NAME_CONTRACT_TYPE_Consumer loans_count_norm,previous_NAME_CONTRACT_TYPE_Revolving loans_count,previous_NAME_CONTRACT_TYPE_Revolving loans_count_norm,previous_NAME_CONTRACT_TYPE_XNA_count,previous_NAME_CONTRACT_TYPE_XNA_count_norm,previous_WEEKDAY_APPR_PROCESS_START_FRIDAY_count,previous_WEEKDAY_APPR_PROCESS_START_FRIDAY_count_norm,...,previous_PRODUCT_COMBINATION_POS industry without interest_count,previous_PRODUCT_COMBINATION_POS industry without interest_count_norm,previous_PRODUCT_COMBINATION_POS mobile with interest_count,previous_PRODUCT_COMBINATION_POS mobile with interest_count_norm,previous_PRODUCT_COMBINATION_POS mobile without interest_count,previous_PRODUCT_COMBINATION_POS mobile without interest_count_norm,previous_PRODUCT_COMBINATION_POS other with interest_count,previous_PRODUCT_COMBINATION_POS other with interest_count_norm,previous_PRODUCT_COMBINATION_POS others without interest_count,previous_PRODUCT_COMBINATION_POS others without interest_count_norm
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,0,0.000000,1,1.000000,0,0.000,0,0.0,1,1.000000,...,0,0.0,1,1.00,0,0.0,0,0.0,0,0.0
100002,0,0.000000,1,1.000000,0,0.000,0,0.0,0,0.000000,...,0,0.0,0,0.00,0,0.0,1,1.0,0,0.0
100003,1,0.333333,2,0.666667,0,0.000,0,0.0,1,0.333333,...,0,0.0,0,0.00,0,0.0,0,0.0,0,0.0
100004,0,0.000000,1,1.000000,0,0.000,0,0.0,1,1.000000,...,0,0.0,0,0.00,1,1.0,0,0.0,0,0.0
100005,1,0.500000,1,0.500000,0,0.000,0,0.0,1,0.500000,...,0,0.0,1,0.50,0,0.0,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456251,0,0.000000,1,1.000000,0,0.000,0,0.0,0,0.000000,...,0,0.0,1,1.00,0,0.0,0,0.0,0,0.0
456252,0,0.000000,1,1.000000,0,0.000,0,0.0,0,0.000000,...,0,0.0,0,0.00,0,0.0,0,0.0,0,0.0
456253,0,0.000000,2,1.000000,0,0.000,0,0.0,0,0.000000,...,0,0.0,2,1.00,0,0.0,0,0.0,0,0.0
456254,0,0.000000,2,1.000000,0,0.000,0,0.0,0,0.000000,...,0,0.0,1,0.50,0,0.0,0,0.0,0,0.0


In [78]:
# Dataframe agrupando BureauBalance_agg con BureauBalance_counts por préstamo anterior
previous_application_agg = previous_agg.merge(previous_counts, right_index = True, left_on = 'SK_ID_CURR', how = 'outer')
previous_application_agg

Unnamed: 0_level_0,previous_DAYS_DECISION_mean,previous_DAYS_FIRST_DUE_mean,previous_DAYS_LAST_DUE_mean,previous_DAYS_TERMINATION_mean,previous_DAYS_LAST_DUE_1ST_VERSION_mean,previous_RATE_INTEREST_PRIMARY_count,previous_NFLAG_INSURED_ON_APPROVAL_mean,previous_RATE_DOWN_PAYMENT_mean,previous_NFLAG_LAST_APPL_IN_DAY_mean,previous_AMT_DOWN_PAYMENT_count,...,previous_PRODUCT_COMBINATION_POS industry without interest_count,previous_PRODUCT_COMBINATION_POS industry without interest_count_norm,previous_PRODUCT_COMBINATION_POS mobile with interest_count,previous_PRODUCT_COMBINATION_POS mobile with interest_count_norm,previous_PRODUCT_COMBINATION_POS mobile without interest_count,previous_PRODUCT_COMBINATION_POS mobile without interest_count_norm,previous_PRODUCT_COMBINATION_POS other with interest_count,previous_PRODUCT_COMBINATION_POS other with interest_count_norm,previous_PRODUCT_COMBINATION_POS others without interest_count,previous_PRODUCT_COMBINATION_POS others without interest_count_norm
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,-1740.000,-1709.000000,-1619.000000,-1612.000000,-1499.000000,0,0.000000,0.104326,1.0,1,...,0,0.0,1,1.00,0,0.0,0,0.0,0,0.0
100002,-606.000,-565.000000,-25.000000,-17.000000,125.000000,0,0.000000,0.000000,1.0,1,...,0,0.0,0,0.00,0,0.0,1,1.0,0,0.0
100003,-1305.000,-1274.333333,-1054.333333,-1047.333333,-1004.333333,0,0.666667,0.050030,1.0,2,...,0,0.0,0,0.00,0,0.0,0,0.0,0,0.0
100004,-815.000,-784.000000,-724.000000,-714.000000,-694.000000,0,0.000000,0.212008,1.0,1,...,0,0.0,0,0.00,1,1.0,0,0.0,0,0.0
100005,-536.000,-706.000000,-466.000000,-460.000000,-376.000000,0,0.000000,0.108964,1.0,1,...,0,0.0,1,0.50,0,0.0,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456251,-273.000,-210.000000,-30.000000,-25.000000,0.000000,0,0.000000,0.000000,1.0,1,...,0,0.0,1,1.00,0,0.0,0,0.0,0,0.0
456252,-2497.000,-2466.000000,-2316.000000,-2311.000000,-2316.000000,0,1.000000,0.062443,1.0,1,...,0,0.0,0,0.00,0,0.0,0,0.0,0,0.0
456253,-2380.000,-2339.000000,-2219.000000,-2212.500000,-2219.000000,0,0.500000,0.214316,1.0,2,...,0,0.0,2,1.00,0,0.0,0,0.0,0,0.0
456254,-299.500,-269.000000,365243.000000,365243.000000,151.000000,0,0.500000,0.000000,1.0,2,...,0,0.0,1,0.50,0,0.0,0,0.0,0,0.0


In [79]:
#7-Cargo fichero application_train.csv y veo cabecera
Train = pd.read_csv('/Users/LENOVO/Downloads/application_train.csv', delimiter= ',' , header=0)
Train

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
Train.dtypes.value_counts()

float64    65
int64      41
object     16
dtype: int64

In [81]:
#8-Cargo fichero application_test.csv y veo cabecera
Test = pd.read_csv('/Users/LENOVO/Downloads/application_test.csv', delimiter= ',' , header=0)
Test

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,Cash loans,F,N,Y,0,121500.0,412560.0,17473.5,270000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
48740,456222,Cash loans,F,N,N,2,157500.0,622413.0,31909.5,495000.0,...,0,0,0,0,,,,,,
48741,456223,Cash loans,F,Y,Y,1,202500.0,315000.0,33205.5,315000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0
48742,456224,Cash loans,M,N,N,0,225000.0,450000.0,25128.0,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


In [82]:
Test.dtypes.value_counts()

float64    65
int64      40
object     16
dtype: int64

In [83]:
# Create a label encoder object
le = preprocessing.LabelEncoder()
le_count = 0

In [84]:
# Iterate through the columns
for col in Train:
    if Train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(Train[col].unique())) <= 2:
            # Train on the training data
            le.fit(Train[col])
            # Transform both training and testing data
            Train[col] = le.transform(Train[col])
            Test[col] = le.transform(Test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            
print('%d columns were label encoded.' % le_count)

3 columns were label encoded.


In [85]:
# one-hot encoding of categorical variables
Train = pd.get_dummies(Train)
Test  = pd.get_dummies(Test )

print('Training Features shape: ', Train.shape)
print('Testing Features shape: ', Test.shape)

Training Features shape:  (307511, 243)
Testing Features shape:  (48744, 239)


In [86]:
Train_labels = Train['TARGET']

# Align the training and testing data, keep only columns present in both dataframes
Train, Test = Train.align(Test, join = 'inner', axis = 1)

# Add the target back in
Train['TARGET'] = Train_labels

print('Training Features shape: ', Train.shape)
print('Testing Features shape: ', Test.shape)

Training Features shape:  (307511, 240)
Testing Features shape:  (48744, 239)


In [87]:
# Join to the training dataframe
#train = pd.read_csv('../input/application_train.csv')
Train = Train.merge(BureauBureauBalance_by_client, on = 'SK_ID_CURR', how = 'left')

# Fill the missing values with 0 
#train['SK_ID_CURR'] = train['SK_ID_CURR'].fillna(0)#
#train.head()

In [88]:
Train

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,client_bureau_balance_STATUS_5_count_norm_count,client_bureau_balance_STATUS_5_count_norm_mean,client_bureau_balance_STATUS_C_count_count,client_bureau_balance_STATUS_C_count_mean,client_bureau_balance_STATUS_C_count_norm_count,client_bureau_balance_STATUS_C_count_norm_mean,client_bureau_balance_STATUS_X_count_count,client_bureau_balance_STATUS_X_count_mean,client_bureau_balance_STATUS_X_count_norm_count,client_bureau_balance_STATUS_X_count_norm_mean
0,100002,0,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,8.0,0.0,8.0,2.875000,8.0,0.175426,8.0,1.875000,8.0,0.161932
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,,,,,,,,,,
2,100004,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,,,,,,,,,,
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,,,,,,,,,,
4,100007,0,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,0,0,0,157500.0,254700.0,27558.0,225000.0,0.032561,...,,,,,,,,,,
307507,456252,0,0,1,0,72000.0,269550.0,12001.5,225000.0,0.025164,...,,,,,,,,,,
307508,456253,0,0,1,0,153000.0,677664.0,29979.0,585000.0,0.005002,...,4.0,0.0,4.0,14.250000,4.0,0.459677,4.0,3.250000,4.0,0.135417
307509,456254,0,0,1,0,171000.0,370107.0,20205.0,319500.0,0.005313,...,1.0,0.0,1.0,29.000000,1.0,0.783784,1.0,0.000000,1.0,0.000000


In [89]:
Train.dtypes.value_counts()

uint8      131
float64    101
int64       41
int32        3
dtype: int64

In [90]:
#Train.select_dtypes('float64').apply(pd.Series.nunique, axis = 0)

In [91]:
#correlations = Train.corr()['TARGET'].sort_values()
#correlations

In [92]:
# Join to the training dataframe
#train = pd.read_csv('../input/application_train.csv')
Train = Train.merge(Bureau_agg, on = 'SK_ID_CURR', how = 'left')
Train

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,bureau_AMT_CREDIT_SUM_DEBT_count,bureau_AMT_CREDIT_SUM_DEBT_mean,bureau_AMT_CREDIT_SUM_LIMIT_count,bureau_AMT_CREDIT_SUM_LIMIT_mean,bureau_AMT_CREDIT_SUM_OVERDUE_count,bureau_AMT_CREDIT_SUM_OVERDUE_mean,bureau_DAYS_CREDIT_UPDATE_count,bureau_DAYS_CREDIT_UPDATE_mean,bureau_AMT_ANNUITY_count,bureau_AMT_ANNUITY_mean
0,100002,0,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,5.0,49156.20000,4.0,7997.14125,8.0,0.0,8.0,-499.875000,7.0,0.0
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,4.0,0.00000,4.0,202500.00000,4.0,0.0,4.0,-816.000000,0.0,
2,100004,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,2.0,0.00000,2.0,0.00000,2.0,0.0,2.0,-532.000000,0.0,
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,,,,,,,,,,
4,100007,0,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,1.0,0.00000,1.0,0.00000,1.0,0.0,1.0,-783.000000,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,0,0,0,157500.0,254700.0,27558.0,225000.0,0.032561,...,,,,,,,,,,
307507,456252,0,0,1,0,72000.0,269550.0,12001.5,225000.0,0.025164,...,,,,,,,,,,
307508,456253,0,0,1,0,153000.0,677664.0,29979.0,585000.0,0.005002,...,4.0,448958.25000,4.0,0.00000,4.0,0.0,4.0,-253.250000,3.0,58369.5
307509,456254,0,0,1,0,171000.0,370107.0,20205.0,319500.0,0.005313,...,1.0,0.00000,0.0,,1.0,0.0,1.0,-401.000000,1.0,0.0


In [93]:
#Train.dtypes.value_counts()

In [94]:
#Train.select_dtypes('float64').apply(pd.Series.nunique, axis = 0)

In [95]:
#correlations = Train.corr()['TARGET'].sort_values()
#correlations

In [96]:
# Join to the training dataframe
#train = pd.read_csv('../input/application_train.csv')
Train = Train.merge(installments_by_client, on = 'SK_ID_CURR', how = 'left')
Train

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,client_installments_DAYS_ENTRY_PAYMENT_mean_mean,client_installments_DAYS_INSTALMENT_mean_mean,client_installments_NUM_INSTALMENT_VERSION_mean_mean,client_installments_NUM_INSTALMENT_NUMBER_mean_mean,client_installments_DAYS_ENTRY_PAYMENT_count_mean,client_installments_NUM_INSTALMENT_VERSION_count_mean,client_installments_DAYS_ENTRY_PAYMENT_mean_count,client_installments_DAYS_INSTALMENT_mean_count,client_installments_AMT_PAYMENT_mean_mean,client_installments_AMT_INSTALMENT_mean_mean
0,100002,0,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,-315.421053,-295.000000,1.052632,10.000000,19.000000,19.000000,19.0,19.0,11559.247105,11559.247105
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,-1385.320000,-1378.160000,1.040000,5.080000,9.160000,9.160000,25.0,25.0,64754.586000,64754.586000
2,100004,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,-761.666667,-754.000000,1.333333,2.000000,3.000000,3.000000,3.0,3.0,7096.155000,7096.155000
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,-271.625000,-252.250000,1.125000,4.437500,7.875000,7.875000,16.0,16.0,62947.088438,62947.088438
4,100007,0,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,-1032.242424,-1028.606061,1.166667,7.045455,13.606061,13.606061,66.0,66.0,12214.060227,12666.444545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,0,0,0,157500.0,254700.0,27558.0,225000.0,0.032561,...,-156.285714,-120.000000,1.142857,4.000000,7.000000,7.000000,7.0,7.0,7492.924286,7492.924286
307507,456252,0,0,1,0,72000.0,269550.0,12001.5,225000.0,0.025164,...,-2393.833333,-2391.000000,1.000000,3.500000,6.000000,6.000000,6.0,6.0,10069.867500,10069.867500
307508,456253,0,0,1,0,153000.0,677664.0,29979.0,585000.0,0.005002,...,-2387.428571,-2372.928571,1.000000,4.785714,5.000000,5.000000,14.0,14.0,4115.915357,4399.707857
307509,456254,0,0,1,0,171000.0,370107.0,20205.0,319500.0,0.005313,...,-161.263158,-142.263158,1.000000,5.263158,9.526316,9.526316,19.0,19.0,10239.832895,10239.832895


In [97]:
#Train.dtypes.value_counts()

In [98]:
#Train.select_dtypes('float64').apply(pd.Series.nunique, axis = 0)

In [99]:
#correlations = Train.corr()['TARGET'].sort_values()
#correlations

In [100]:
# Join to the training dataframe
#train = pd.read_csv('../input/application_train.csv')
Train = Train.merge(cash_by_client, on = 'SK_ID_CURR', how = 'left')
Train

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,client_cash_MONTHS_BALANCE_mean_mean,client_cash_SK_DPD_DEF_mean_mean,client_cash_SK_DPD_mean_mean,client_cash_CNT_INSTALMENT_FUTURE_mean_mean,client_cash_CNT_INSTALMENT_mean_mean,client_cash_CNT_INSTALMENT_count_mean,client_cash_CNT_INSTALMENT_FUTURE_count_mean,client_cash_MONTHS_BALANCE_count_mean,client_cash_CNT_INSTALMENT_FUTURE_mean_count,client_cash_MONTHS_BALANCE_mean_count
0,100002,0,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,-10.000000,0.000000,0.000000,15.000000,24.000000,19.000000,19.000000,19.000000,19.0,19.0
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,-43.785714,0.000000,0.000000,5.785714,10.107143,9.714286,9.714286,9.714286,28.0,28.0
2,100004,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,-25.500000,0.000000,0.000000,2.250000,3.750000,4.000000,4.000000,4.000000,4.0,4.0
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,-9.619048,0.000000,0.000000,8.578231,11.904762,7.857143,7.857143,8.238095,21.0,21.0
4,100007,0,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,-33.636364,0.000000,0.000000,8.969697,15.333333,13.727273,13.727273,13.727273,66.0,66.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,0,0,0,157500.0,254700.0,27558.0,225000.0,0.032561,...,-5.000000,0.000000,0.000000,4.375000,7.875000,8.000000,8.000000,9.000000,9.0,9.0
307507,456252,0,0,1,0,72000.0,269550.0,12001.5,225000.0,0.025164,...,-79.000000,0.000000,0.000000,3.000000,6.000000,7.000000,7.000000,7.000000,7.0,7.0
307508,456253,0,0,1,0,153000.0,677664.0,29979.0,585000.0,0.005002,...,-79.235294,0.294118,0.294118,2.000000,6.705882,5.941176,5.941176,5.941176,17.0,17.0
307509,456254,0,0,1,0,171000.0,370107.0,20205.0,319500.0,0.005313,...,-5.550000,0.000000,0.000000,10.350000,14.900000,10.100000,10.100000,10.100000,20.0,20.0


In [101]:
#Train.dtypes.value_counts()

In [102]:
#Train.select_dtypes('float64').apply(pd.Series.nunique, axis = 0)

In [103]:
#correlations = Train.corr()['TARGET'].sort_values()
#correlations

In [104]:
# Join to the training dataframe
#train = pd.read_csv('../input/application_train.csv')
Train = Train.merge(credit_by_client, on = 'SK_ID_CURR', how = 'left')
Train

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,client_credit_MONTHS_BALANCE_count_mean,client_credit_MONTHS_BALANCE_mean_count,client_credit_AMT_CREDIT_LIMIT_ACTUAL_mean_mean,client_credit_CNT_DRAWINGS_OTHER_CURRENT_mean_mean,client_credit_AMT_DRAWINGS_OTHER_CURRENT_mean_mean,client_credit_CNT_DRAWINGS_POS_CURRENT_mean_mean,client_credit_AMT_DRAWINGS_POS_CURRENT_mean_mean,client_credit_CNT_DRAWINGS_ATM_CURRENT_mean_mean,client_credit_AMT_DRAWINGS_ATM_CURRENT_mean_mean,client_credit_AMT_PAYMENT_CURRENT_mean_mean
0,100002,0,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,,,,,,,,,,
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,,,,,,,,,,
2,100004,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,,,,,,,,,,
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,6.0,6.0,270000.0,,,,,,,
4,100007,0,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,0,0,0,157500.0,254700.0,27558.0,225000.0,0.032561,...,,,,,,,,,,
307507,456252,0,0,1,0,72000.0,269550.0,12001.5,225000.0,0.025164,...,,,,,,,,,,
307508,456253,0,0,1,0,153000.0,677664.0,29979.0,585000.0,0.005002,...,,,,,,,,,,
307509,456254,0,0,1,0,171000.0,370107.0,20205.0,319500.0,0.005313,...,,,,,,,,,,


In [105]:
#Train.dtypes.value_counts()

In [106]:
#Train.select_dtypes('float64').apply(pd.Series.nunique, axis = 0)

In [107]:
#correlations = Train.corr()['TARGET'].sort_values()
#correlations

In [109]:
# Join to the training dataframe
#train = pd.read_csv('../input/application_train.csv')
Train = Train.merge(previous_application_agg, on = 'SK_ID_CURR', how = 'left')
Train

MemoryError: Unable to allocate 419. MiB for an array with shape (162, 338857) and data type float64

In [None]:
Train.dtypes.value_counts()

In [None]:
Train.select_dtypes('float64').apply(pd.Series.nunique, axis = 0)

In [None]:
correlations = Train.corr()['TARGET'].sort_values()
correlations

In [None]:
# Display correlations
print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))