In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

#Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import math

#Memory management 
import gc
import random
import time
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, MinMaxScaler    
# Shared scaler instances used by the modify_* helpers below (modify_renta uses `scaler`,
# modify_age / modify_antiguedad use `minmax`). Every fit_transform call refits them.
scaler = StandardScaler()
minmax = MinMaxScaler()


from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Input CSV locations.
train_file = "/kaggle/input/santander-pr/train.csv"
test_file = "/kaggle/input/santander-pr/test.csv"

# Product indicator columns. create_lags derives one "<name>_lag_<k>" column per entry.
targetcols = ["ind_ahor_fin_ult1","ind_aval_fin_ult1","ind_cco_fin_ult1","ind_cder_fin_ult1","ind_cno_fin_ult1","ind_ctju_fin_ult1","ind_ctma_fin_ult1",
              "ind_ctop_fin_ult1","ind_ctpp_fin_ult1","ind_deco_fin_ult1","ind_deme_fin_ult1","ind_dela_fin_ult1", "ind_ecue_fin_ult1","ind_fond_fin_ult1",
              "ind_hip_fin_ult1", "ind_plan_fin_ult1","ind_pres_fin_ult1","ind_reca_fin_ult1","ind_tjcr_fin_ult1","ind_valo_fin_ult1","ind_viv_fin_ult1",
              "ind_nomina_ult1","ind_nom_pens_ult1","ind_recibo_ult1"]

# Explicit dtypes handed to pd.read_csv when the label frame (y_train) is loaded.
# NOTE(review): the two float64 entries presumably contain missing values in the raw CSV
# (uint8 cannot hold NaN) — confirm against the data.
dtype_list = {'ind_cco_fin_ult1': 'uint8',
              'ind_deme_fin_ult1': 'uint8',
              'ind_aval_fin_ult1': 'uint8',
              'ind_valo_fin_ult1': 'uint8',
              'ind_reca_fin_ult1': 'uint8',
              'ind_ctju_fin_ult1': 'uint8',
              'ind_cder_fin_ult1': 'uint8', 
              'ind_plan_fin_ult1': 'uint8',
              'ind_fond_fin_ult1': 'uint8', 
              'ind_hip_fin_ult1': 'uint8',
              'ind_pres_fin_ult1': 'uint8', 
              'ind_nomina_ult1': 'float64', 
              'ind_cno_fin_ult1': 'uint8',
              'ind_ctpp_fin_ult1': 'uint8',
              'ind_ahor_fin_ult1': 'uint8',
              'ind_dela_fin_ult1': 'uint8',
              'ind_ecue_fin_ult1': 'uint8',
              'ind_nom_pens_ult1': 'float64',
              'ind_recibo_ult1': 'uint8',
              'ind_deco_fin_ult1': 'uint8',
              'ind_tjcr_fin_ult1': 'uint8', 
              'ind_ctop_fin_ult1': 'uint8',
              'ind_viv_fin_ult1': 'uint8',
              'ind_ctma_fin_ult1': 'uint8',
             'ncodpers' : 'uint32'}  

# ['fecha_alta','canal_entrada']

# Columns read from the train/test CSVs for the feature frames (passed as `usecols`).
# 'fecha_alta' is not in this list, so the fecha_alta handling further down is never reached.
feature_cols = ['ncodpers','fecha_dato','age','renta','nomprov', 'ind_nuevo', 
               'segmento', 'ind_actividad_cliente', 'pais_residencia', 'ind_empleado', 
                'sexo', 'tiprel_1mes', 'indrel_1mes', 'antiguedad',  'indrel', 'indext', 'indresi', 'indfall', 'canal_entrada']

# Integer codes per category for 'pais_residencia' and 'canal_entrada' (the key -99 is an int,
# every other key is a string). In the visible notebook this table is only referenced from the
# commented-out variants of modify_pais_residencia / modify_canal_entrada.
mapping_dict = {
'pais_residencia' : {'LV': 102, 'BE': 12, 'BG': 50, 'BA': 61, 'BM': 117, 'BO': 62, 'JP': 82, 'JM': 116, 'BR': 17, 'BY': 64, 'BZ': 113, 'RU': 43, 'RS': 89, 'RO': 41, 'GW': 99, 'GT': 44, 'GR': 39, 'GQ': 73, 'GE': 78, 'GB': 9, 'GA': 45, 'GN': 98, 'GM': 110, 'GI': 96, 'GH': 88, 'OM': 100, 'HR': 67, 'HU': 106, 'HK': 34, 'HN': 22, 'AD': 35, 'PR': 40, 'PT': 26, 'PY': 51, 'PA': 60, 'PE': 20, 'PK': 84, 'PH': 91, 'PL': 30, 'EE': 52, 'EG': 74, 'ZA': 75, 'EC': 19, 'AL': 25, 'VN': 90, 'ET': 54, 'ZW': 114, 'ES': 0, 'MD': 68, 'UY': 77, 'MM': 94, 'ML': 104, 'US': 15, 'MT': 118, 'MR': 48, 'UA': 49, 'MX': 16, 'IL': 42, 'FR': 8, 'MA': 38, 'FI': 23, 'NI': 33, 'NL': 7, 'NO': 46, 'NG': 83, 'NZ': 93, 'CI': 57, 'CH': 3, 'CO': 21, 'CN': 28, 'CM': 55, 'CL': 4, 'CA': 2, 'CG': 101, 'CF': 109, 'CD': 112, 'CZ': 36, 'CR': 32, 'CU': 72, 'KE': 65, 'KH': 95, 'SV': 53, 'SK': 69, 'KR': 87, 'KW': 92, 'SN': 47, 'SL': 97, 'KZ': 111, 'SA': 56, 'SG': 66, 'SE': 24, 'DO': 11, 'DJ': 115, 'DK': 76, 'DE': 10, 'DZ': 80, 'MK': 105, -99: 1, 'LB': 81, 'TW': 29, 'TR': 70, 'TN': 85, 'LT': 103, 'LU': 59, 'TH': 79, 'TG': 86, 'LY': 108, 'AE': 37, 'VE': 14, 'IS': 107, 'IT': 18, 'AO': 71, 'AR': 13, 'AU': 63, 'AT': 6, 'IN': 31, 'IE': 5, 'QA': 58, 'MZ': 27},
'canal_entrada' : {'013': 49, 'KHP': 160, 'KHQ': 157, 'KHR': 161, 'KHS': 162, 'KHK': 10, 'KHL': 0, 'KHM': 12, 'KHN': 21, 'KHO': 13, 'KHA': 22, 'KHC': 9, 'KHD': 2, 'KHE': 1, 'KHF': 19, '025': 159, 'KAC': 57, 'KAB': 28, 'KAA': 39, 'KAG': 26, 'KAF': 23, 'KAE': 30, 'KAD': 16, 'KAK': 51, 'KAJ': 41, 'KAI': 35, 'KAH': 31, 'KAO': 94, 'KAN': 110, 'KAM': 107, 'KAL': 74, 'KAS': 70, 'KAR': 32, 'KAQ': 37, 'KAP': 46, 'KAW': 76, 'KAV': 139, 'KAU': 142, 'KAT': 5, 'KAZ': 7, 'KAY': 54, 'KBJ': 133, 'KBH': 90, 'KBN': 122, 'KBO': 64, 'KBL': 88, 'KBM': 135, 'KBB': 131, 'KBF': 102, 'KBG': 17, 'KBD': 109, 'KBE': 119, 'KBZ': 67, 'KBX': 116, 'KBY': 111, 'KBR': 101, 'KBS': 118, 'KBP': 121, 'KBQ': 62, 'KBV': 100, 'KBW': 114, 'KBU': 55, 'KCE': 86, 'KCD': 85, 'KCG': 59, 'KCF': 105, 'KCA': 73, 'KCC': 29, 'KCB': 78, 'KCM': 82, 'KCL': 53, 'KCO': 104, 'KCN': 81, 'KCI': 65, 'KCH': 84, 'KCK': 52, 'KCJ': 156, 'KCU': 115, 'KCT': 112, 'KCV': 106, 'KCQ': 154, 'KCP': 129, 'KCS': 77, 'KCR': 153, 'KCX': 120, 'RED': 8, 'KDL': 158, 'KDM': 130, 'KDN': 151, 'KDO': 60, 'KDH': 14, 'KDI': 150, 'KDD': 113, 'KDE': 47, 'KDF': 127, 'KDG': 126, 'KDA': 63, 'KDB': 117, 'KDC': 75, 'KDX': 69, 'KDY': 61, 'KDZ': 99, 'KDT': 58, 'KDU': 79, 'KDV': 91, 'KDW': 132, 'KDP': 103, 'KDQ': 80, 'KDR': 56, 'KDS': 124, 'K00': 50, 'KEO': 96, 'KEN': 137, 'KEM': 155, 'KEL': 125, 'KEK': 145, 'KEJ': 95, 'KEI': 97, 'KEH': 15, 'KEG': 136, 'KEF': 128, 'KEE': 152, 'KED': 143, 'KEC': 66, 'KEB': 123, 'KEA': 89, 'KEZ': 108, 'KEY': 93, 'KEW': 98, 'KEV': 87, 'KEU': 72, 'KES': 68, 'KEQ': 138, -99: 6, 'KFV': 48, 'KFT': 92, 'KFU': 36, 'KFR': 144, 'KFS': 38, 'KFP': 40, 'KFF': 45, 'KFG': 27, 'KFD': 25, 'KFE': 148, 'KFB': 146, 'KFC': 4, 'KFA': 3, 'KFN': 42, 'KFL': 34, 'KFM': 141, 'KFJ': 33, 'KFK': 20, 'KFH': 140, 'KFI': 134, '007': 71, '004': 83, 'KGU': 149, 'KGW': 147, 'KGV': 43, 'KGY': 44, 'KGX': 24, 'KGC': 18, 'KGN': 11}
}

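## Feature Pyramid

Each line extends the previous feature set by one more column. Struck-through lines are the ones with a recorded score after the colon.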

- ~~Renta, Nomprov~~  : 0.02678
- ~~Renta, Nomprov, ind_nuevo~~  : 0.02671
- Renta, Nomprov, ind_nuevo, segmento
- renta, nomprov, ind_nuevo, segmento, ind_actividad_cliente
- renta, nomprov, ind_nuevo, segmento, ind_actividad_cliente, pais_residencia
- renta, nomprov, ind_nuevo, segmento, ind_actividad_cliente, pais_residencia, ind_empleado


- renta, nomprov, ind_nuevo, segmento, ind_actividad_cliente, pais_residencia, ind_empleado, sexo
- renta, nomprov, ind_nuevo, segmento, ind_actividad_cliente, pais_residencia, ind_empleado, sexo, tiprel_1mes

* renta, nomprov, ind_nuevo, segmento, ind_actividad_cliente, pais_residencia, ind_empleado, sexo, tiprel_1mes, indrel_1mes

- ~~renta, nomprov, ind_nuevo, segmento, ind_actividad_cliente, pais_residencia, ind_empleado, sexo, tiprel_1mes, indrel_1mes, antiguedad~~ : 0.02773
- renta, nomprov, ind_nuevo, segmento, ind_actividad_cliente, pais_residencia, ind_empleado, sexo, tiprel_1mes, indrel_1mes, antiguedad, indrel



* ~~renta, nomprov, ind_nuevo, segmento, ind_actividad_cliente, pais_residencia, ind_empleado, sexo, tiprel_1mes, indrel_1mes, antiguedad, indrel, indext~~ : 0.02779



- renta, nomprov, ind_nuevo, segmento, ind_actividad_cliente, pais_residencia, ind_empleado, sexo, tiprel_1mes, indrel_1mes, antiguedad, indrel, indext, indresi
- renta, nomprov, ind_nuevo, segmento, ind_actividad_cliente, pais_residencia, ind_empleado, sexo, tiprel_1mes, indrel_1mes, antiguedad, indrel, indext, indresi, indfall
- ~~renta, nomprov, ind_nuevo, segmento, ind_actividad_cliente, pais_residencia, ind_empleado, sexo, tiprel_1mes, indrel_1mes, antiguedad, indrel, indext, indresi, indfall, canal_entrada~~ : 0.02803 (normal logistic)
- renta, nomprov, ind_nuevo, segmento, ind_actividad_cliente, pais_residencia, ind_empleado, sexo, tiprel_1mes, indrel_1mes, antiguedad, indrel, indext, indresi, indfall, canal_entrada, fecha_alta 

## Modification Functions
### Helper Function

In [None]:
def string_num_age(x):
    """Convert a raw ``age`` cell to a number.

    Strings are stripped and parsed with ``int``. The ``NA`` placeholder becomes
    ``np.nan`` regardless of how much whitespace surrounds it (the original only
    recognised exactly ``' NA'`` and raised on any other padding).
    Non-string values are returned unchanged.

    Raises:
        ValueError: if a string is neither ``NA`` nor an integer literal.
    """
    if isinstance(x, str):
        text = x.strip()
        if text == 'NA':
            return np.nan
        return int(text)
    return x

def string_num_senior(x):
    """Convert a raw ``antiguedad`` (seniority) cell to a number.

    Strings are stripped and parsed with ``int``. The ``NA`` placeholder becomes
    ``np.nan`` whatever its padding (the original only recognised the variant
    with exactly five leading spaces). Non-string values are returned unchanged.

    Raises:
        ValueError: if a string is neither ``NA`` nor an integer literal.
    """
    if isinstance(x, str):
        text = x.strip()
        if text == 'NA':
            return np.nan
        return int(text)
    return x

def string_num_primary(x):
    """Normalise a raw ``indrel_1mes`` cell.

    * the string ``'P'`` (ignoring surrounding whitespace) maps to ``2.5``;
    * any other string is parsed with ``float``;
    * a non-NaN float (including NumPy float scalars) is truncated to ``int``;
    * everything else (NaN, ints, ...) is returned unchanged.

    The original guarded the string branch with ``x != np.nan``, which is always
    true because NaN never compares equal to anything; that no-op test is gone.
    """
    if isinstance(x, str):
        if x.strip() == 'P':
            return 2.5
        return float(x)
    if isinstance(x, float) and not math.isnan(x):
        return int(x)
    return x

def modify_age(train, test):
    """Clean, clip and min-max scale the ``age`` column.

    Train: ages are converted with ``string_num_age`` and rows whose age is
    missing are dropped. Both frames are then clipped to the range [14, 90].

    The shared ``minmax`` scaler is fitted on the train ages only and then
    applied to the test ages, so both frames share one scale. The original
    re-fitted the scaler on the test frame.

    Args:
        train, test: DataFrames with an ``age`` column; neither is modified.

    Returns:
        ``(train_copy, test_copy)`` with ``age`` scaled.
    """
    print("Modifying...age")
    temp_train = train.copy()
    temp_test = test.copy()

    temp_train.age = temp_train.age.apply(string_num_age)
    # Explicit copy so the column assignments below do not write into a slice.
    temp_train = temp_train.loc[temp_train.age.notnull()].copy()

    temp_train.age = temp_train.age.clip(lower=14, upper=90)
    temp_test.age = temp_test.age.clip(lower=14, upper=90)

    temp_train.age = minmax.fit_transform(np.array(temp_train.age).reshape(-1, 1)).ravel()
    temp_test.age = minmax.transform(np.array(temp_test.age).reshape(-1, 1)).ravel()

    return temp_train, temp_test

def _renta_prepare(frame):
    """Return a copy without missing-age rows, with collapsed ``nomprov`` and numeric ``renta``."""
    prepared = frame.copy()
    prepared.age = prepared.age.apply(string_num_age)
    prepared = prepared.loc[prepared.age.notnull()].copy()
    prepared.nomprov = prepared.nomprov.fillna(prepared.nomprov.mode()[0]).apply(nom_mod)
    # Tolerate income stored as padded strings / 'NA'; a no-op for numeric columns.
    prepared.renta = pd.to_numeric(prepared.renta, errors='coerce')
    return prepared


def modify_renta(train, test):
    """Impute and standardise ``renta`` (income).

    Steps, applied to copies of both frames:

    1. convert ``age`` with ``string_num_age`` and drop rows whose age is missing;
    2. fill missing ``nomprov`` with the frame's own mode and collapse it with
       ``nom_mod`` (the collapsed value is written back, as in the original);
    3. fill missing ``renta`` with the per-province median, falling back to the
       overall median when a province has no known income;
    4. standardise ``renta`` with the shared ``scaler``.

    Unlike the original, the medians and the scaler are learned on the train
    frame only and re-used for the test frame, so both are transformed the same way.

    NOTE(review): if ``nomprov`` has already been replaced by its short codes
    (see ``modify_nomprov``) before this runs, ``nom_mod`` maps every row to
    'OTHER' and the imputation degenerates to a single global median — confirm
    the column processing order.

    Returns:
        ``(train_copy, test_copy)``.
    """
    temp_train = _renta_prepare(train)
    temp_test = _renta_prepare(test)

    print('Modifying train...renta')
    province_median = temp_train.groupby('nomprov')['renta'].median()
    overall_median = temp_train.renta.median()
    print('Train Medians found ->')
    print(province_median)

    print('Modifying test...renta')
    for frame in (temp_train, temp_test):
        fallback = frame.nomprov.map(province_median).fillna(overall_median)
        frame.renta = frame.renta.fillna(fallback)

    temp_train.renta = scaler.fit_transform(np.array(temp_train.loc[:, 'renta']).reshape(-1, 1)).ravel()
    temp_test.renta = scaler.transform(np.array(temp_test.loc[:, 'renta']).reshape(-1, 1)).ravel()
    return temp_train, temp_test

def modify_segmento(train, test):
    """Fill missing ``segmento`` values with each frame's own most frequent value.

    Works on copies; returns ``(train_copy, test_copy)``.
    """
    print("Modifying....segmento")
    filled = []
    for source in (train, test):
        frame = source.copy()
        most_common = frame.segmento.mode()[0]
        frame.segmento = frame.segmento.fillna(most_common)
        filled.append(frame)
    return filled[0], filled[1]

def modify_sexo(train, test):
    """Replace missing ``sexo`` entries by the mode of the respective frame.

    The inputs are left untouched; the filled copies are returned as a pair.
    """
    print("Modifying....sexo")
    temp_train, temp_test = train.copy(), test.copy()

    train_mode = temp_train.sexo.mode()[0]
    temp_train.sexo = temp_train.sexo.fillna(value=train_mode)

    test_mode = temp_test.sexo.mode()[0]
    temp_test.sexo = temp_test.sexo.fillna(value=test_mode)

    return temp_train, temp_test

def modify_antiguedad(train, test):
    """Clean and min-max scale the ``antiguedad`` (seniority) column.

    Values are converted with ``string_num_senior``. Missing values and values
    equal to -999999 are both replaced by -1. The shared ``minmax`` scaler is
    fitted on the train frame and applied to the test frame (the original
    re-fitted it on test).

    The original also called ``select_dtypes(...).apply(pd.to_numeric, ...)``
    four times without using the result; those no-ops have been removed.

    Returns:
        ``(train_copy, test_copy)``; the inputs are not modified.
    """
    print("Modifying....antiguedad")
    temp_train = train.copy()
    temp_test = test.copy()

    for frame in (temp_train, temp_test):
        seniority = frame.antiguedad.apply(string_num_senior)
        frame.antiguedad = seniority.fillna(-1).replace(-999999, -1)

    temp_train.antiguedad = minmax.fit_transform(
        np.array(temp_train.loc[:, 'antiguedad']).reshape(-1, 1)).ravel()
    temp_test.antiguedad = minmax.transform(
        np.array(temp_test.loc[:, 'antiguedad']).reshape(-1, 1)).ravel()
    return temp_train, temp_test

def modify_fecha_dato(train, test):
    """Drop rows without a usable age and encode ``fecha_dato`` as the integer YYYYMM.

    Expects ``fecha_dato`` to hold objects exposing ``.year`` and ``.month``.
    Returns modified copies ``(train_copy, test_copy)``.
    """
    print("Modifying....fecha_dato")
    converted = []
    for source in (train, test):
        frame = source.copy()
        frame.age = frame.age.apply(string_num_age)
        frame = frame.loc[~frame.age.isnull()]
        frame.fecha_dato = frame['fecha_dato'].apply(lambda stamp: 100 * stamp.year + stamp.month)
        converted.append(frame)
    temp_train, temp_test = converted
    return temp_train, temp_test

def modify_fecha_alta(train, test):
    """Drop rows without a usable age and encode ``fecha_alta`` as the integer YYYYMM.

    Expects ``fecha_alta`` to hold objects exposing ``.year`` and ``.month``.
    Returns modified copies ``(train_copy, test_copy)``.
    """
    print("Modifying....fecha_alta")

    def to_year_month(source):
        frame = source.copy()
        frame.age = frame.age.apply(string_num_age)
        frame = frame.loc[~frame.age.isnull()]
        frame.fecha_alta = frame['fecha_alta'].apply(lambda stamp: 100 * stamp.year + stamp.month)
        return frame

    temp_train = to_year_month(train)
    temp_test = to_year_month(test)
    return temp_train, temp_test

def modify_indrel_1mes(train, test):
    """Convert ``indrel_1mes`` with ``string_num_primary`` and fill gaps with the median.

    Each frame is processed on a copy and uses its own median.
    Returns ``(train_copy, test_copy)``.
    """
    print("Modifying...indrel_1mes")
    results = []
    for source in (train, test):
        frame = source.copy()
        frame.indrel_1mes = frame.indrel_1mes.apply(string_num_primary)
        middle = frame.indrel_1mes.median()
        frame.indrel_1mes = frame.indrel_1mes.fillna(middle)
        results.append(frame)
    return results[0], results[1]

def pais_mod(x):
    """Keep the ten listed country codes; any other value becomes 'Outside'."""
    kept_codes = ('ES', 'FR', 'AR', 'DE', 'GB', 'US', 'CO', 'IT', 'RO', 'MX')
    return x if x in kept_codes else 'Outside'
    
def modify_pais_residencia(train, test):
    """Collapse ``pais_residencia`` through ``pais_mod`` on copies of both frames."""
    temp_train, temp_test = train.copy(), test.copy()
    temp_train.pais_residencia = temp_train.pais_residencia.apply(pais_mod)
    temp_test.pais_residencia = temp_test.pais_residencia.apply(pais_mod)
    return temp_train, temp_test

# def modify_pais_residencia(train, test):
#     temp_train = train.copy()
#     temp_test = test.copy()
#     temp_train.pais_residencia = temp_train.pais_residencia.apply(lambda x: mapping_dict['pais_residencia'][x])
#     temp_test.pais_residencia = temp_test.pais_residencia.apply(lambda x: mapping_dict['pais_residencia'][x])
#     return temp_train, temp_test

def canal_mod(x):
    """Keep the ten listed channel codes; any other value becomes 'UNK'."""
    kept_channels = ('KHE', 'KAT', 'KFC', 'KHQ', 'KFA', 'KHK', 'KHM', 'KHD', 'KHN', 'KAS')
    return x if x in kept_channels else 'UNK'
    
def modify_canal_entrada(train, test):
    """Fill missing ``canal_entrada`` with the frame's mode, then collapse via ``canal_mod``.

    Returns modified copies ``(train_copy, test_copy)``.
    """
    prepared = []
    for source in (train, test):
        frame = source.copy()
        top_channel = frame.canal_entrada.mode()[0]
        frame.canal_entrada = frame.canal_entrada.fillna(top_channel).apply(canal_mod)
        prepared.append(frame)
    return prepared[0], prepared[1]

# def modify_canal_entrada(train, test):
#     temp_train = train.copy()
#     temp_test = test.copy()
#     temp_train.canal_entrada = temp_train.canal_entrada.fillna(temp_train.canal_entrada.mode()[0])
#     temp_test.canal_entrada = temp_test.canal_entrada.fillna(temp_test.canal_entrada.mode()[0])
#     temp_train.canal_entrada = temp_train.canal_entrada.apply(lambda x: mapping_dict['canal_entrada'][x])
#     temp_test.canal_entrada = temp_test.canal_entrada.apply(lambda x: mapping_dict['canal_entrada'][x])
#     return temp_train, temp_test

def nom_mod(x):
    """Keep the ten listed province names; any other value becomes 'OTHER'."""
    kept_provinces = ('MADRID', 'BARCELONA', 'VALENCIA', 'SEVILLA', 'CORUÑA, A',
                      'MURCIA', 'MALAGA', 'ZARAGOZA', 'ALICANTE', 'CADIZ')
    return x if x in kept_provinces else 'OTHER'

# Short code for every value nom_mod can return; modify_nomprov applies it after nom_mod.
nom_dict = {
    'MADRID': 'M',
    'BARCELONA' : 'B',
    'VALENCIA' : 'V',
    'SEVILLA' : 'S',
    'CORUÑA, A' : 'C',
    'MURCIA' : 'M1',
    'MALAGA': 'M2',
    'ZARAGOZA' : 'Z',
    'ALICANTE' : 'A1',
    'CADIZ' : 'C1',
    'OTHER' : 'O'
}
def modify_nomprov(train, test):
    """Fill, collapse and abbreviate the ``nomprov`` column.

    Missing values take the frame's mode, names are collapsed by ``nom_mod`` and
    finally replaced by their ``nom_dict`` code. Returns modified copies.
    """
    def encode(source):
        frame = source.copy()
        filled = frame.nomprov.fillna(frame.nomprov.mode()[0])
        frame.nomprov = filled.apply(lambda name: nom_dict[nom_mod(name)])
        return frame

    temp_train = encode(train)
    temp_test = encode(test)
    return temp_train, temp_test

### Memory management Code

In [None]:
def _smallest_numeric_dtype(column):
    """Return the narrowest dtype that can hold every value of ``column`` unchanged."""
    values = column.to_numpy()
    kind = values.dtype.kind
    if kind == 'b':
        return np.uint8
    if kind not in 'iuf' or values.size == 0:
        return column.dtype
    if kind == 'f':
        # NaN/inf cannot be stored in an integer column, and fractional values must stay float.
        if not np.isfinite(values).all() or (values != np.trunc(values)).any():
            return np.float32
    lowest, highest = int(values.min()), int(values.max())
    if lowest >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        if limits.min <= lowest and highest <= limits.max:
            return candidate
    # Integer-valued but outside every integer range.
    return np.float32 if kind == 'f' else column.dtype


def reduce_mem_usage(props, columns_now):
    """Downcast the listed columns of ``props`` to the smallest suitable dtype.

    Object (string) and other non-numeric columns are skipped. Whole-number
    columns become the narrowest (un)signed integer type that fits; columns
    with fractional or missing values become ``float32``.

    Fixes relative to the previous version:

    * integrality is checked per element instead of via the *sum* of the
      fractional parts, which classified small fractional columns as integer
      (zeroing them) and crashed on columns containing NaN;
    * type bounds are inclusive (255 now fits ``uint8``);
    * the memory summary is printed once, after all columns are processed.

    Args:
        props: DataFrame to shrink; it is modified in place.
        columns_now: iterable of column names to process.

    Returns:
        The same ``props`` object.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    for col in columns_now:
        print(col)
        column = props[col]
        if column.dtype == object or not pd.api.types.is_numeric_dtype(column.dtype):
            continue

        print("******************************")
        print("dtype before: ",column.dtype)
        props[col] = column.astype(_smallest_numeric_dtype(column))

    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props

## Dataframe Creation

### Reading CSV

In [None]:
# Load the feature columns of both files; only fecha_dato is parsed as a date.
# (Earlier experiments also parsed fecha_alta and restricted x_train to selected
# months, e.g. 201501-201506 plus 201601-201604, or 201604 only.)
x_train = pd.read_csv(train_file, parse_dates=['fecha_dato'], usecols=feature_cols)
x_test = pd.read_csv(test_file, parse_dates=['fecha_dato'], usecols=feature_cols)

# Encode the snapshot date as the integer YYYYMM.
x_train['fecha_dato'] = x_train['fecha_dato'].map(lambda stamp: stamp.year * 100 + stamp.month)
x_test['fecha_dato'] = x_test['fecha_dato'].map(lambda stamp: stamp.year * 100 + stamp.month)

In [None]:
# Clean every feature column, then one-hot encode the string columns.
ids = []
x_train, x_test = modify_age(x_train, x_test)
col_to_drop = []

# Walk a snapshot of the raw columns; dummy columns appended inside the loop are not revisited.
for col in list(x_train.columns):

    print("Reading...." + str(col))

    if col == 'age' or col == 'fecha_dato':
        # age was handled by modify_age above; fecha_dato is left as loaded.
        continue

    elif col == "renta":
        x_train, x_test = modify_renta(x_train, x_test)
        print(col + "...Done!")

    elif col == "segmento":
        x_train, x_test = modify_segmento(x_train, x_test)
        print(col + "...Done!")

    elif col == 'sexo':
        x_train, x_test = modify_sexo(x_train, x_test)
        print(col + "...Done!")

    elif col == "ind_nuevo":
        print("Modifying...."+col)
        x_train.ind_nuevo = x_train.ind_nuevo.fillna(value=1)
        x_test.ind_nuevo = x_test.ind_nuevo.fillna(value=1)
        print(col + "...Done!")

    elif col == "antiguedad":
        x_train, x_test = modify_antiguedad(x_train, x_test)
        print(col + "...Done!")

    elif col == 'indrel':
        print("Modifying...."+col)
        x_train.indrel = x_train.indrel.fillna(value=1)
        x_test.indrel = x_test.indrel.fillna(value=1)
        print(col + "...Done!")

    elif col == 'tiprel_1mes':
        print("Modifying...."+col)
        # Fill with the mode, then merge the 'N' and 'R' categories into 'I'.
        x_train.tiprel_1mes = x_train.tiprel_1mes.fillna(x_train.tiprel_1mes.mode()[0])
        x_train.tiprel_1mes = np.where((x_train.tiprel_1mes=='N')|(x_train.tiprel_1mes=='R'), 'I',x_train.tiprel_1mes)
        x_test.tiprel_1mes = x_test.tiprel_1mes.fillna(x_test.tiprel_1mes.mode()[0])
        x_test.tiprel_1mes = np.where((x_test.tiprel_1mes=='N')|(x_test.tiprel_1mes=='R'), 'I',x_test.tiprel_1mes)
        print(col + "...Done!")

    elif col == 'indext':
        print("Modifying...."+col)
        x_train.indext = x_train.indext.fillna(value='U')
        x_test.indext = x_test.indext.fillna(value='U')
        print(col + "...Done!")

    elif col == "ind_actividad_cliente":
        print("modifying..."+col)
        print(col + "...Done!")
        # No null values

    elif col== 'ncodpers':
        print("Modifying...."+col)
        ids = x_test.ncodpers.unique()
        print(col + "...Done!")

    elif col == "nomprov":
        print("Modifying...."+ col)
        x_train, x_test = modify_nomprov(x_train, x_test)
        print(col + "...Done!")

    elif col == 'fecha_alta':
        x_train, x_test = modify_fecha_alta(x_train, x_test)
        print(col + "...Done!")

    elif col == 'pais_residencia':
        print("Modifying...."+ col)
        x_train, x_test = modify_pais_residencia(x_train, x_test)
        print(col + "...Done!")

    elif col == 'canal_entrada':
        print("Modifying...."+col)
        x_train, x_test = modify_canal_entrada(x_train, x_test)
        print(col + "...Done!")

    elif col == 'indrel_1mes':
        x_train, x_test = modify_indrel_1mes(x_train, x_test)
        print(col + "...Done!")

    else: #Handles indrel, 
        print("Modifying...."+ col)
        print(col + "...Done!")

    # Null values filled. Numeric columns need no encoding, so only object columns go further.
    if x_train[col].dtype != 'object':
        continue

    x_train[col] = x_train[col].fillna(x_train[col].mode()[0])
    cat_enc_train = pd.get_dummies(x_train[col], prefix=col)
    # Force the test dummies onto the train layout: a category missing from test becomes an
    # all-zero column and a category unseen in train is dropped, so both frames always end
    # up with the same feature columns in the same order.
    cat_enc_test = pd.get_dummies(x_test[col], prefix=col).reindex(
        columns=cat_enc_train.columns, fill_value=0)
    columns_now = cat_enc_train.columns.to_list()

    x_train = pd.concat([x_train, cat_enc_train], axis=1)
    x_test = pd.concat([x_test, cat_enc_test], axis=1)
    col_to_drop.append(col)
    del cat_enc_train, cat_enc_test

    # Shrink the freshly added dummy columns. The select_dtypes(...).apply(pd.to_numeric)
    # calls that used to sit here discarded their result and have been removed.
    x_train = reduce_mem_usage(x_train, columns_now)
    print("Train Mem reduction...Done!")
    x_test = reduce_mem_usage(x_test, columns_now)
    print("Test Mem reduction...Done!")

# The raw string columns are replaced by their dummy columns.
x_train = x_train.drop(columns=col_to_drop)
x_test = x_test.drop(columns=col_to_drop)

print(x_train.shape)
print(x_test.shape)

In [None]:
# Label frame: customer id, age, snapshot month and the product indicators listed in targetcols.
y_train = pd.read_csv(
    train_file,
    usecols=['ncodpers', 'age', 'fecha_dato'] + targetcols,
    dtype=dtype_list,
    parse_dates=['fecha_dato'],
)

# Selection of rows: drop rows whose age does not parse (modify_age drops the same rows from x_train).
y_train.age = y_train.age.apply(string_num_age)
y_train = y_train.loc[y_train.age.notnull()].copy()

# Encode the snapshot date as the integer YYYYMM and treat missing indicators as "not owned".
y_train.fecha_dato = y_train['fecha_dato'].apply(lambda x: 100*x.year + x.month)
y_train = y_train.fillna(0)
# The select_dtypes(...).apply(pd.to_numeric, downcast=...) calls that used to surround this
# block never assigned their result; the actual downcast is done by reduce_mem_usage.

In [None]:
# Downcast every numeric column of the label frame (reduce_mem_usage modifies the frame and returns it).
y_train = reduce_mem_usage(y_train,y_train.columns)

### Lags

In [None]:
#Change this function if you take 201501 into consideration
def create_lags(lag, date, x_train, df_name):
    """Attach lagged product indicators to the feature rows of the given months.

    For every lag ``i`` and every target month ``j`` (integers in YYYYMM form),
    the product columns of the module-level ``y_train`` at month ``j - i`` are
    merged onto the rows of ``x_train`` for month ``j`` (left join on
    ``ncodpers``) as ``<product>_lag_<i>``. Each lag is applied to the result of
    the previous one, so the output holds one set of lag columns per lag and
    only the rows of the months listed in ``date``.

    Month arithmetic is plain integer subtraction on YYYYMM with two corrections:

    * a result <= 201500 is treated as "no history": the lag columns are all 0;
    * for target months after 201600, a result outside 201601-201604 is wrapped
      into 2015 via ``201512 - (201600 - cur)`` (e.g. 201600 -> 201512).

    Args:
        lag: iterable of positive lags in months; iteration stops at the first 0.
        date: iterable of target months (YYYYMM integers).
        x_train: feature frame with ``ncodpers`` and ``fecha_dato`` columns.
        df_name: label used in the progress messages only.

    Returns:
        A new DataFrame with missing values replaced by 0. The caller's frame
        is no longer modified in place.
    """
    for i in lag:
        if i == 0:
            break
        rename_dict = {j: j + '_lag_' + str(i) for j in targetcols}
        col_names = list(rename_dict.values())

        # Collect the per-month pieces and concatenate once instead of growing a frame.
        frames = []
        for j in date:
            cur = j - i
            before_history = cur <= 201500
            if before_history:
                # Nothing to look back at: borrow the month's own rows for the keys, zero the lags.
                source_month = j
            else:
                if (j > 201600) and cur not in range(201501, 201512) and cur not in range(201601, 201605):
                    cur = 201512 - (201600 - cur)
                source_month = cur

            df_lag = y_train[y_train.fecha_dato == source_month].rename(columns=rename_dict)
            df_lag = df_lag.drop(columns=['fecha_dato', 'age'])
            if before_history:
                df_lag[col_names] = 0

            dum = x_train[x_train.fecha_dato == j].merge(df_lag, on=['ncodpers'], how='left')
            if not before_history:
                print("1_>"+str(j)+"->"+str(dum.shape))
            frames.append(dum)

        x_train = pd.concat(frames, axis=0) if frames else pd.DataFrame()
        print("1--->"+str(x_train.shape))
        print('Lag '+str(i)+' for ' + df_name +'...Done!!')
    # Customers without a row in the lagged month get 0 for every lag column.
    return x_train.fillna(0)

## Two way split timeframe

In [None]:
# Two training windows, each paired with a test frame built with the same lags:
#   split 1 - months 201503..201506 with lags 1-4
#   split 2 - month 201604 with lags 1-5
# (An earlier configuration - lags 1-5 on 201506 / 201604 and a lags 1-3 window over
# 201602-201604 - used to be kept in this cell inside a string literal.)
lags_1 = [1,2,3,4]
date_1 = [201503,201504,201505,201506]
lags_2 = [1,2,3,4,5]
date_2 = [201604]

x_train_1 = create_lags(lags_1, date_1, x_train, 'x_train_1')
x_train_1 = reduce_mem_usage(x_train_1, x_train_1.columns)
x_test_1 = create_lags(lags_1, [201605], x_test, 'x_test_1')
x_test_1 = reduce_mem_usage(x_test_1, x_test_1.columns)
# .copy(): reduce_mem_usage assigns columns in place, which must not happen on a slice of y_train.
y_train_1 = y_train[(y_train.fecha_dato>=201503) & (y_train.fecha_dato<=201506)].copy()
y_train_1 = reduce_mem_usage(y_train_1, y_train_1.columns)

x_train_2 = create_lags(lags_2, date_2, x_train, 'x_train_2')
x_train_2 = reduce_mem_usage(x_train_2, x_train_2.columns)
x_test_2 = create_lags(lags_2, [201605], x_test, 'x_test_2')
x_test_2 = reduce_mem_usage(x_test_2, x_test_2.columns)
y_train_2 = y_train[y_train.fecha_dato==201604].copy()
y_train_2 = reduce_mem_usage(y_train_2, y_train_2.columns)

In [None]:
# Product holdings of the latest training month (201604).
recent_prod = y_train[y_train.fecha_dato==201604]

# Free the large frames that are not needed from here on.
del y_train
del x_train, x_test

recent_prod = recent_prod.drop(columns=['fecha_dato'])
recent_prod = reduce_mem_usage(recent_prod, recent_prod.columns)

# Everything except the customer id and age is a product column.
product_col = [name for name in recent_prod.columns.tolist() if name not in ('ncodpers', 'age')]



## Model 
### Weighted Average LGBM 

In [None]:
# import lightgbm as lgb
# from collections import defaultdict
# import joblib

# params = {'boosting_type': 'gbdt',
#           'max_depth' : -1,
#           'objective': 'binary',
#           'num_leaves': 64,
#           'learning_rate': 0.1,
#           'num_iterations': 200,
#           'max_bin': 512,
#           'subsample_for_bin': 200,
#           'subsample': 1,
#           'subsample_freq': 1,
#           'colsample_bytree': 0.8,
#           'reg_alpha': 5,
#           'reg_lambda': 10,
#           'min_split_gain': 0.5,
#           'min_child_weight': 1,
#           'min_child_samples': 5,
#           'scale_pos_weight': 1,
#           'num_class' : 1,
#           'metric' : 'binary_error',
#          'verbosity' : 1}

# id_preds = defaultdict(list)
# ids = x_test_1['ncodpers'].values
# models = {}

# for c in product_col:
#     print(c)
#     print(c+"-first")
#     y_t_1 = y_train_1[c]
#     x_t_1 = x_train_1.drop(['fecha_dato','ncodpers'],1)
#     model_1 = lgb.LGBMClassifier(
#         boosting_type= 'gbdt',
#         objective = 'binary',
#         #n_jobs = 3, # Updated from 'nthread'
#         #silent = True,
#         max_depth = params['max_depth'],
#         max_bin = params['max_bin'],
#         #subsample_for_bin = params['subsample_for_bin'],
#         subsample = params['subsample'],
#         subsample_freq = params['subsample_freq'],
#         min_split_gain = params['min_split_gain'],
#         min_child_weight = params['min_child_weight'],
#         min_child_samples = params['min_child_samples'],
#         scale_pos_weight = params['scale_pos_weight'],
#         learning_rate=params['learning_rate'],
#         num_iterations=params['num_iterations'],
#         verbosity = params['verbosity']
#     )
    
#     model_1.fit(x_t_1,y_t_1)
#     x_t2_1 = x_test_1.drop(['fecha_dato','ncodpers'],1)
#     prediction_1 = model_1.predict_proba(x_t2_1)[:,1]
#     name = './'+c+'2015.pkl'
#     joblib.dump(model_1, name)
#     models[c] = model_1
#     del x_t_1, y_t_1, x_t2_1, model_1

#     for id, p in zip(ids, prediction_1):
#         id_preds[id].append(p)

# joblib.dump(models,'./combined2015.pkl')

In [None]:
from collections import defaultdict
import joblib

# Per customer id: one blended probability per product, in product_col order.
id_preds = defaultdict(list)
ids = x_test_1['ncodpers'].values

# NOTE(review): joblib.load unpickles arbitrary objects - only load model files you produced yourself.
# Each file is expected to hold a mapping from product column name to a fitted classifier.
model_15 = joblib.load('../input/combined2015/combined2015.pkl')
model_16 = joblib.load('../input/combined2016/combined.pkl')

# The feature matrices do not depend on the product, so build them once outside the loop.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which pandas 2 rejects.
x_t2_1 = x_test_1.drop(columns=['fecha_dato','ncodpers'])
x_t2_2 = x_test_2.drop(columns=['fecha_dato','ncodpers'])

for c in product_col:
    print(c)
    prediction_1 = model_15[c].predict_proba(x_t2_1)[:,1]
    prediction_2 = model_16[c].predict_proba(x_t2_2)[:,1]

    # Weighted average of the two models.
    # NOTE(review): assumes x_test_1 and x_test_2 list the customers in the same row order - confirm.
    prediction = prediction_1*0.2 + prediction_2*0.8
    for cust_id, p in zip(ids, prediction):
        id_preds[cust_id].append(p)

## Final Prediction Selection

In [None]:
from tqdm import tqdm

# Constant-time lookup of a customer's holdings in the latest training month.
# setdefault keeps the first row per customer, matching the former `.iloc[0]` on a filtered frame.
owned_flags = recent_prod[product_col].to_numpy()
first_row_of = {}
for position, customer in enumerate(recent_prod['ncodpers'].to_numpy()):
    first_row_of.setdefault(customer, position)

train_preds = {}

for cust_id, p in tqdm(id_preds.items(), desc='Loading....'):
    # Customers absent from recent_prod are new: their raw probabilities are used as-is.
    # This replaces a bare `except:` whose fallback read `preds` before it was assigned
    # when the very first customer was new.
    position = first_row_of.get(cust_id)
    preds = {}
    for k, (product, prob) in enumerate(zip(product_col, p)):
        if position is not None and owned_flags[position, k] == 1:
            # Product already held last month: rank it by the complement of its probability.
            preds[product] = 1 - prob
        else:
            preds[product] = prob

    # Five best-scoring products; ties keep product_col order because sorted() is stable.
    temp_fin = sorted(preds.items(), key = lambda x: x[1], reverse=True)[:5]
    train_preds[cust_id] = [product for product, _ in temp_fin]

# Every product name is prefixed by a space, exactly as the previous version wrote the file.
# NOTE(review): confirm the column header expected by the submission format.
df = {
    'ncodpers': list(train_preds.keys()),
    'changed' : ["".join(" " + product for product in products) for products in train_preds.values()]
}

final_df = pd.DataFrame(df, columns = ['ncodpers','changed'])
# Show a sample instead of dumping the whole dictionary to the cell output.
print(final_df.head())
final_df.to_csv('/kaggle/working/lgbm_sub1.csv', index=False)