## Import libraries

In [1]:
import numpy as np   # import numpy
import pandas as pd  # import pandas
import os
import gc   # for gabage collection
import seaborn as sns  # data visualization lib
import matplotlib.pyplot as plt
import glob 
from scipy import stats
%matplotlib inline

<hr  style= "
  display: block; 
  margin-top: 0.5em; 
  margin-bottom: 0.5em;
  margin-left: auto;
  margin-right: auto;
  border-style: inset;
  border-width: 15px "/>

## Create a few algos for viz so that they can be called anytime


In [2]:
# some customized function for plotting data
def plot_corr(df,cols):
    fig = plt.figure(figsize=(10,8))
    sns.heatmap(df[cols].corr(),cmap='RdBu_r', annot=True, center=0.0)
    plt.show()
    
def plot_count(df,col,fsize,rotation=None,fillna=True):
    fig = plt.figure(figsize=fsize)
    if fillna:
        value_count = df[col].fillna('unknown').value_counts()
    sns.barplot(value_count.index,value_count.values)
    plt.xticks(rotation=rotation)
    plt.title('value counts for {}'.format(col))
    plt.show()

<hr  style= "
  display: block; 
  margin-top: 0.5em; 
  margin-bottom: 0.5em;
  margin-left: auto;
  margin-right: auto;
  border-style: inset;
  border-width: 15px "/>

## Load All Data

In [None]:
def load_data(path): 
    return glob.glob(path+"*.csv")   

In [None]:
df_files = load_data('/Users/mega/concordia_courses/1260_ML/home_credit_risk/data/')
df_files

In [None]:
df_names = []
for i in range(len(df_files)):
    df_names.append('df' + str(i))
print(df_names)



In [None]:
all_df = {'df0': df_files[0], 'df1': df_files[1], 'df2': df_files[2], 'df3': df_files[3], 'df4': df_files[4], 'df5': df_files[5], 'df6': df_files[6]}
all_df_list = []
for key, value in all_df.items():
    key = pd.read_csv(value)
    all_df_list.append(key)
df_pos_cash, df_cc_bal, df_inst_pay, df_app_train, df_burr, df5_prev_app, df_burr_bal = all_df_list[0], all_df_list[1],  all_df_list[2],  all_df_list[3], all_df_list[4],  all_df_list[5],  all_df_list[6]



<hr  style= "
  display: block; 
  margin-top: 0.5em; 
  margin-bottom: 0.5em;
  margin-left: auto;
  margin-right: auto;
  border-style: inset;
  border-width: 15px "/>

## Load Single file (Use this to speed up testing and avoid confusion)

In [2]:
df_app_train = pd.read_csv('/Users/mega/concordia_courses/1260_ML/home_credit_risk/data/application_train.csv')

<hr  style= "
  display: block; 
  margin-top: 0.5em; 
  margin-bottom: 0.5em;
  margin-left: auto;
  margin-right: auto;
  border-style: inset;
  border-width: 15px "/>

## Memory Stats

### Calculate total memory before forcing data types for all dataframes


In [106]:
total_mem_use = []
for i in all_df_list:
    total_mem_use.append(i.memory_usage().sum())
print('Total memory usage of dataframes df0 through df6 are {:.2f} MB'.format(sum(total_mem_use)/1024**2))

### Calculate total memory before forcing data types for single dataframe

In [3]:
total_mem_use_before = df_app_train.memory_usage()
print('Total memory usage of df_app_train is {:.2f} MB'.format(sum(total_mem_use_before)/1024**2))

Total memory usage of df_app_train is 286.23 MB


<hr  style= "
  display: block; 
  margin-top: 0.5em; 
  margin-bottom: 0.5em;
  margin-left: auto;
  margin-right: auto;
  border-style: inset;
  border-width: 15px "/>

## Data Preprocessing

In [4]:
df_app_train.shape

(307511, 122)

### Create a dataframe for all missing values with column names and nan count.

In [5]:
nan_info = pd.DataFrame(df_app_train.isnull().sum()).reset_index()
nan_info.columns = ['col','nan_cnt']
nan_missing_value = nan_info[nan_info['nan_cnt'] !=0]
nan_missing_value

Unnamed: 0,col,nan_cnt
9,AMT_ANNUITY,12
10,AMT_GOODS_PRICE,278
11,NAME_TYPE_SUITE,1292
21,OWN_CAR_AGE,202929
28,OCCUPATION_TYPE,96391
...,...,...
117,AMT_REQ_CREDIT_BUREAU_DAY,41519
118,AMT_REQ_CREDIT_BUREAU_WEEK,41519
119,AMT_REQ_CREDIT_BUREAU_MON,41519
120,AMT_REQ_CREDIT_BUREAU_QRT,41519


### Extract all columns names with missing values. So we can itterate.

In [6]:
cols_with_missing = nan_missing_value['col'].values
cols_with_missing

array(['AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'OWN_CAR_AGE',
       'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG',
       'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG',
       'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG',
       'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG',
       'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG',
       'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE',
       'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
       'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE',
       'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE',
       'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE',
       'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE',
       'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI',
       'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI',
       'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI',
       'FLOOR

### Create a df with missing values

In [7]:
df_app_train_missing = pd.DataFrame(df_app_train[cols_with_missing])
df_app_train_missing.shape

(307511, 67)

### Remove columns with missing values from orignal df

In [8]:
df_app_train.drop(list(cols_with_missing), axis = 1, inplace=True )
df_app_train.shape

(307511, 55)

### Create a test df for testing

In [9]:
df_test = df_app_train_missing.copy()
df_test.shape

(307511, 67)

In [10]:
df_test_num = df_test.select_dtypes(include=['floating', 'integer'])
column_names = df_test_num.columns
unique_values = [df_test_num[each_col].nunique() for each_col in df_test_num.columns]
data_types = [df_test_num[each_col].dtype for each_col in df_test_num.columns]
mean =  [df_test_num[each_col].mean() for each_col in df_test_num.columns]
median = [df_test_num[each_col].median() for each_col in df_test_num.columns]
min_val = [df_test_num[each_col].min() for each_col in df_test_num.columns]
max_val = [df_test_num[each_col].max() for each_col in df_test_num.columns]  
std = [df_test_num[each_col].std() for each_col in df_test_num.columns]
columns = ['Column Name', 'Unique Values', 'Data Type', 'Mean', 'Median', 'Min', 'Max', 'Std']
data_num = {'Column Name': column_names, 'Unique Values':unique_values, 'Data Type':data_types, 'Mean':mean, 'Median':median, 'Min':min_val, 'Max':max_val, 'std':std}
df_test_num_info = pd.DataFrame(data=data_num)
df_test_num_info

Unnamed: 0,Column Name,Unique Values,Data Type,Mean,Median,Min,Max,std
0,AMT_ANNUITY,13672,float64,27108.573909,24903.000000,1615.500000,2.580255e+05,14493.737315
1,AMT_GOODS_PRICE,1002,float64,538396.207429,450000.000000,40500.000000,4.050000e+06,369446.460540
2,OWN_CAR_AGE,62,float64,12.061091,9.000000,0.000000,9.100000e+01,11.944812
3,CNT_FAM_MEMBERS,17,float64,2.152665,2.000000,1.000000,2.000000e+01,0.910682
4,EXT_SOURCE_1,114584,float64,0.502130,0.505998,0.014568,9.626928e-01,0.211062
...,...,...,...,...,...,...,...,...
56,AMT_REQ_CREDIT_BUREAU_DAY,9,float64,0.007000,0.000000,0.000000,9.000000e+00,0.110757
57,AMT_REQ_CREDIT_BUREAU_WEEK,9,float64,0.034362,0.000000,0.000000,8.000000e+00,0.204685
58,AMT_REQ_CREDIT_BUREAU_MON,24,float64,0.267395,0.000000,0.000000,2.700000e+01,0.916002
59,AMT_REQ_CREDIT_BUREAU_QRT,11,float64,0.265474,0.000000,0.000000,2.610000e+02,0.794056


In [11]:
df_test_obj = df_test.select_dtypes(include='object')
column_names = df_test_obj.columns
unique_values = [df_test_obj[each_col].nunique() for each_col in df_test_obj.columns]
columns = ['Column Name', 'Unique Values']
data_obj = {'Column Name': column_names, 'Unique Values':unique_values}
df_test_obj_info = pd.DataFrame(data=data_obj)
df_test_obj_info


Unnamed: 0,Column Name,Unique Values
0,NAME_TYPE_SUITE,7
1,OCCUPATION_TYPE,18
2,FONDKAPREMONT_MODE,4
3,HOUSETYPE_MODE,3
4,WALLSMATERIAL_MODE,7
5,EMERGENCYSTATE_MODE,2


In [12]:
df_test.describe()

Unnamed: 0,AMT_ANNUITY,AMT_GOODS_PRICE,OWN_CAR_AGE,CNT_FAM_MEMBERS,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,...,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
count,307499.0,307233.0,104582.0,307509.0,134133.0,306851.0,246546.0,151450.0,127568.0,157504.0,...,306490.0,306490.0,306490.0,307510.0,265992.0,265992.0,265992.0,265992.0,265992.0,265992.0
mean,27108.573909,538396.2,12.061091,2.152665,0.50213,0.5143927,0.510853,0.11744,0.088442,0.977735,...,0.143421,1.405292,0.100049,-962.858788,0.006402,0.007,0.034362,0.267395,0.265474,1.899974
std,14493.737315,369446.5,11.944812,0.910682,0.211062,0.1910602,0.194844,0.10824,0.082438,0.059223,...,0.446698,2.379803,0.362291,826.808487,0.083849,0.110757,0.204685,0.916002,0.794056,1.869295
min,1615.5,40500.0,0.0,1.0,0.014568,8.173617e-08,0.000527,0.0,0.0,0.0,...,0.0,0.0,0.0,-4292.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16524.0,238500.0,5.0,2.0,0.334007,0.3924574,0.37065,0.0577,0.0442,0.9767,...,0.0,0.0,0.0,-1570.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,24903.0,450000.0,9.0,2.0,0.505998,0.5659614,0.535276,0.0876,0.0763,0.9816,...,0.0,0.0,0.0,-757.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,34596.0,679500.0,15.0,3.0,0.675053,0.6636171,0.669057,0.1485,0.1122,0.9866,...,0.0,2.0,0.0,-274.0,0.0,0.0,0.0,0.0,0.0,3.0
max,258025.5,4050000.0,91.0,20.0,0.962693,0.8549997,0.89601,1.0,1.0,1.0,...,34.0,344.0,24.0,0.0,4.0,9.0,8.0,27.0,261.0,25.0


#### PLEASE NOTE We have now split df_app_train into 67 missing (df_app_train_missing) 55 non missing (df_app_train with dropped missing cols)

## Clean dataframe --> identify data type --> force data size --> fill missing values

#### WORKING CODE: 
1) all column dtypes split WORKING \ 
2) all data types sized forced  WORKING \
3) all missing values filled WORKING  \
4) all categorical columns encoded WORKING \
5) all columns accounted WORKING \

In [46]:
num_stats = {
'flt':{ 'flt64_min':np.finfo(np.float64).min, 'flt64_max':np.finfo(np.float64).max, 'flt32_min':np.finfo(np.float32).min,
'flt32_max':np.finfo(np.float32).max, 'flt16_min':np.finfo(np.float16).min, 'flt16_max':np.finfo(np.float16).max},
'int':
{ 'int64_min':np.iinfo(np.int64).min, 'int64_max':np.iinfo(np.int64).max, 'int32_min':np.iinfo(np.int32).min,
'int32_max':np.iinfo(np.int32).max, 'int16_min':np.iinfo(np.int16).min, 'int16_max':np.iinfo(np.int16).max, 
'int8_min':np.iinfo(np.int8).min, 'int8_max':np.iinfo(np.int8).max }
}

def force_dtype():
    if df_test[each_col].dtype == int:
    #int8 condition already forced in binary column test
    #if col.min() >=  num_stats['int']['int8_min'].min and col.max() <=  num_stats['int']['int8_max'].max:
    #    col.astype(np.int8)
        if df_test[each_col].min() >=  num_stats['int']['int16_min'] and df_test[each_col].max() <=  num_stats['int']['int16_max']:
            df_test[each_col]= df_test[each_col].astype(np.int16)
        elif df_test[each_col].min() >=  num_stats['int']['int32_min'] and df_test[each_col].max() <=  num_stats['int']['int32_max']:
            df_test[each_col]= df_test[each_col].astype(np.int32)
        
        else: 
            #col.min() >=  num_stats['int']['int64_min'].min and col.max() <=  num_stats['int']['int64_max'].max
            df_test[each_col]= df_test[each_col].astype(np.int64)
    elif df_test[each_col].dtype == float:
    
        if df_test[each_col].min() >=  num_stats['flt']['flt16_min'] and df_test[each_col].max() <=  num_stats['flt']['flt16_max']:
            df_test[each_col]= df_test[each_col].astype(np.float16)
        elif df_test[each_col].min() >=  num_stats['flt']['flt32_min'] and df_test[each_col].max() <=  num_stats['flt']['flt32_max']:
            df_test[each_col] =  df_test[each_col].astype(np.float32)
            
        else: 
            df_test[each_col]= df_test[each_col].astype(np.float64)
            

col_int_cont=set()
col_int_bin =set()
col_obj=set()
col_obj_cat=set()
col_flt=set()
col_obj_cat_enc=set()

for each_col in df_test.columns:
# if column is OBJECT TYPE:
    if df_test[each_col].dtype in ['object']:
# Fill missing values
        df_test[each_col].fillna('unknown', inplace=True)             
# identify if column is categorical based on number of unique values (5%)
        if df_test[each_col].nunique()/len(df_test[each_col]) <= 0.2:
            col_obj_cat.add(each_col)

        else:
        # Count object columns NONE AS ALL SHOULD BE CAT
            col_obj.add(each_col)          
    elif df_test[each_col].dtype == float:
        df_test[each_col].fillna(df_test[each_col].mean(), inplace=True)
        col_flt.add(each_col)
        force_dtype()
    elif df_test[each_col].dtype == int:
# check for binary classification for missing values to -1, and force int8
# if the missing values in a column are binary in nature (0 or 1), we do not want median, we want -1        
        if df_test[each_col].min() == 0 and df_test[each_col].max() == 1 and df_test[each_col].nunique == 2:
            df_test[each_col].fillna(-1, inplace=True)
             # Count columns: none as no integer columns in df_test
            col_int_bin.add(each_col) 
            force_dtype()
#Else if not binary@int8 then force int16, int32 or int64 and fill missing values with median
        else:
            df_test[each_col].fillna(df_test[each_col].median(), inplace=True)
             # Count columns: none as no integer columns in df_test
            col_int_cont.add(each_col)
            force_dtype()

            
# memory test check df_app_train_missing b4 encoding

# encode categorical column
df_test_enc = pd.get_dummies(df_test[col_obj_cat])

# count for encoded columns
col_obj_cat_enc = list(df_test_enc.columns)

#concatanate dummy variables df_test with df_test_temp 
df_test_new = pd.concat([df_test, df_test_enc], axis = 1)
        
# check missing values again and if all data type seperated missing vaule columns and total missing columns :
cols = [col_int_cont, col_int_bin, col_obj, col_obj_cat, col_flt, col_obj_cat_enc]
count = sum(map(len, cols))
if df_test_new.isnull().sum().sum() == 0 and  count == len(df_test_new.columns):
    print( 'All missing columns seperated by data types filled and accounted for!')
    print(f'object type : {len(col_obj)}, object type categorical:{len(col_obj_cat)}, object type categorical encoded:{len(col_obj_cat_enc)}, integer type continuous: {len(col_int_cont)}, integer type binary: {len(col_int_bin)}, float type: {len(col_flt)}, total columns: {count}')  
else:
    print("There are errors in processing")



All missing columns seperated by data types filled and accounted for!
object type : 0, object type categorical:6, object type categorical encoded:47, integer type continuous: 0, integer type binary: 0, float type: 61, total columns: 114


In [39]:
df_test_new.info()
#dtypes: float16(59), float32(2), object(6), uint8(47)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 114 entries, AMT_ANNUITY to HOUSETYPE_MODE_unknown
dtypes: float16(59), float32(2), object(6), uint8(47)
memory usage: 64.8+ MB


#### Run these lines to reset df if any errors in processing above

In [45]:
df_test = df_app_train_missing.copy()
df_test.shape


(307511, 67)

### Since Df_test_new is the processed df, after cleaning, after forcing dypes and encoding we can rename

### ERRORS  DTYPE 'O' LEAKING THROUGH ? Whats difference

In [47]:
# SEEMS LIKE NOT ALL OBJ HAVE BEEN ENCODED (NAME_TYPE_SUITE , OCCUPATION_TYPE), maybe havent reached threshold??

df_app_train_cleaned = df_test_new
df_app_train_cleaned.head()
#df_app_train_cleaned['OCCUPATION_TYPE'].dtype

Unnamed: 0,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,OWN_CAR_AGE,OCCUPATION_TYPE,CNT_FAM_MEMBERS,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,...,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_TYPE_SUITE_unknown,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,HOUSETYPE_MODE_unknown
0,24700.5,351000.0,Unaccompanied,12.0625,Laborers,1.0,0.083008,0.262939,0.139404,0.024704,...,0,0,0,0,1,0,1,0,0,0
1,35698.5,1129500.0,Family,12.0625,Core staff,2.0,0.311279,0.62207,0.510742,0.095886,...,0,0,0,0,0,0,1,0,0,0
2,6750.0,135000.0,Unaccompanied,26.0,Laborers,1.0,0.501953,0.556152,0.729492,0.117432,...,0,0,0,0,1,0,0,0,0,1
3,29686.5,297000.0,Unaccompanied,12.0625,Laborers,2.0,0.501953,0.650391,0.510742,0.117432,...,0,0,0,0,1,0,0,0,0,1
4,21865.5,513000.0,Unaccompanied,12.0625,Core staff,1.0,0.501953,0.322754,0.510742,0.117432,...,0,0,0,0,1,0,0,0,0,1



### Garbage collect all unused df:

In [23]:
#We will keep the dfs we use [df_app_train, df_app_train_cleaned] and dump all rest df_app_train_missing
all_df = %who_ls DataFrame 
print(all_df)

['df_app_train', 'df_app_train_cleaned', 'df_app_train_missing', 'df_test', 'df_test_enc', 'df_test_new', 'df_test_num', 'df_test_num_info', 'df_test_obj', 'df_test_obj_info', 'nan_info', 'nan_missing_value']


### We will keep only df_app_train and df_app_train_cleaned

In [24]:
del [[df_app_train_missing, df_test, df_test_enc, df_test_new, df_test_num, df_test_num_info, df_test_obj, df_test_obj_info, nan_info, nan_missing_value]]
gc.collect()


73

### df_app_train has not been encoded or  forced dypes, but it contains no missing values so repeat process

In [110]:
#Working copy of df_app_train
df_app_train_test = df_app_train.copy()
df_app_train_test.shape

(307511, 55)

In [111]:
#df_app_train

def force_dtype():
    if df_app_train_test[each_col].dtype == int:
    #int8 condition already forced in binary column test
    #if col.min() >=  num_stats['int']['int8_min'].min and col.max() <=  num_stats['int']['int8_max'].max:
    #    col.astype(np.int8)
        if df_app_train_test[each_col].min() >=  num_stats['int']['int16_min'] and df_app_train_test[each_col].max() <=  num_stats['int']['int16_max']:
            df_app_train_test[each_col]= df_app_train_test[each_col].astype(np.int16)
        elif df_app_train_test[each_col].min() >=  num_stats['int']['int32_min'] and df_app_train_test[each_col].max() <=  num_stats['int']['int32_max']:
            df_app_train_test[each_col]= df_app_train_test[each_col].astype(np.int32)
        
        else: 
            #col.min() >=  num_stats['int']['int64_min'].min and col.max() <=  num_stats['int']['int64_max'].max
            df_app_train_test[each_col]= df_app_train_test[each_col].astype(np.int64)
    elif df_app_train_test[each_col].dtype == float:
    
        if df_app_train_test[each_col].min() >=  num_stats['flt']['flt16_min'] and df_app_train_test[each_col].max() <=  num_stats['flt']['flt16_max']:
            df_app_train_test[each_col]= df_app_train_test[each_col].astype(np.float16)
        elif df_app_train_test[each_col].min() >=  num_stats['flt']['flt32_min'] and df_app_train_test[each_col].max() <=  num_stats['flt']['flt32_max']:
            df_app_train_test[each_col] =  df_app_train_test[each_col].astype(np.float32)
            
        else: 
            df_app_train_testn[each_col]= df_app_train_test[each_col].astype(np.float64)
            

col_int_cont_2=set()
col_int_bin_2 =set()
col_obj_2=set()
col_obj_cat_2=set()
col_flt_2=set()
col_obj_cat_enc_2=set()

for each_col in df_app_train_test:
# if column is OBJECT TYPE:
    if df_app_train_test[each_col].dtype == object or df_app_train_test[each_col].dtype in ['O']:            
# identify if column is categorical based on number of unique values (5%)
        if df_app_train_test[each_col].nunique()/len(df_app_train_test[each_col]) <= 0.10:
            col_obj_cat_2.add(each_col)
        else:
        # Count object columns NONE AS ALL SHOULD BE CAT
            col_obj_2.add(each_col)          
    elif df_app_train_test[each_col].dtype == float:
        df_app_train_test[each_col].fillna(df_app_train_test[each_col].mean(), inplace=True)
        col_flt_2.add(each_col)
        force_dtype()
    elif df_app_train_test[each_col].dtype == int:
# check for binary classification for missing values to -1, and force int8
# if the missing values in a column are binary in nature (0 or 1), we do not want median, we want -1        
        if df_app_train_test[each_col].min() == 0 and df_app_train_test[each_col].max() == 1 and df_app_train_test[each_col].nunique == 2:
            df_app_train_test[each_col].fillna(-1, inplace=True)
             # Count columns: none as no integer columns in df_test
            col_int_bin_2.add(each_col) 
            force_dtype()
#Else if not binary@int8 then force int16, int32 or int64 and fill missing values with median
        else:
            df_app_train_test[each_col].fillna(df_app_train_test[each_col].median(), inplace=True)
             # Count columns: none as no integer columns in df_test
            col_int_cont_2.add(each_col)
            force_dtype()

            
# memory test check df_app_train b4 encoding

# encode categorical column
df_app_train_test_enc = pd.get_dummies(df_app_train_test[col_obj_cat_2])

# count for encoded columns
col_obj_cat_enc_2 = list(df_app_train_test_enc.columns)

#concatanate dummy variables df_test with df_test_temp 
df_app_train_test_new = pd.concat([df_app_train_test, df_app_train_test_enc], axis = 1)
        
# check missing values again and if all data type seperated missing vaule columns and total missing columns :
cols = [col_int_cont_2, col_int_bin_2, col_obj_2, col_obj_cat_2, col_flt_2, col_obj_cat_enc_2]
count = sum(map(len, cols))
if  count == len(df_app_train_test_new.columns):
    print( 'All missing columns seperated by data types filled and accounted for!')
    print(f'object type : {len(col_obj)}, object type categorical:{len(col_obj_cat)}, object type categorical encoded:{len(col_obj_cat_enc)}, integer type continuous: {len(col_int_cont)}, integer type binary: {len(col_int_bin)}, float type: {len(col_flt)}, total columns: {count}')  
else:
    print("There are errors in processing")
    
    

All missing columns seperated by data types filled and accounted for!
object type : 0, object type categorical:6, object type categorical encoded:47, integer type continuous: 0, integer type binary: 0, float type: 61, total columns: 154


### ERRORS  DTYPE 'O' LEAKING THROUGH ? Whats difference

In [114]:
df_app_train_test_new.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,NAME_INCOME_TYPE,...,CODE_GENDER_M,CODE_GENDER_XNA,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Unknown,NAME_FAMILY_STATUS_Widow
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,Working,...,1,0,0,1,0,0,0,1,0,0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,State servant,...,0,0,1,0,0,1,0,0,0,0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,Working,...,1,0,0,1,0,0,0,1,0,0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,Working,...,0,0,0,1,1,0,0,0,0,0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,Working,...,1,0,0,1,0,0,0,1,0,0


In [115]:
df_app_train_test_new['NAME_CONTRACT_TYPE'].dtype

dtype('O')

In [48]:
df_app_train_test_num = df_app_train_test.select_dtypes(include=['floating', 'integer'])
column_names = df_app_train_test_num.columns
unique_values = [df_app_train_test_num[each_col].nunique() for each_col in df_app_train_test_num.columns]
data_types = [df_app_train_test_num[each_col].dtype for each_col in df_app_train_test_num.columns]
mean =  [df_app_train_test_num[each_col].mean() for each_col in df_app_train_test_num.columns]
median = [df_app_train_test_num[each_col].median() for each_col in df_app_train_test_num.columns]
min_val = [df_app_train_test_num[each_col].min() for each_col in df_app_train_test_num.columns]
max_val = [df_app_train_test_num[each_col].max() for each_col in df_app_train_test_num.columns]  
std = [df_app_train_test_num[each_col].std() for each_col in df_app_train_test_num.columns]
columns = ['Column Name', 'Unique Values', 'Data Type', 'Mean', 'Median', 'Min', 'Max', 'Std']
data_num = {'Column Name': column_names, 'Unique Values':unique_values, 'Data Type':data_types, 'Mean':mean, 'Median':median, 'Min':min_val, 'Max':max_val, 'std':std}
df_app_train_test_num_info = pd.DataFrame(data=data_num)
df_app_train_test_num_info

Unnamed: 0,Column Name,Unique Values,Data Type,Mean,Median,Min,Max,std
0,SK_ID_CURR,307511,int32,278180.518577,278202.0,100002.0,456255.0,102790.175348
1,TARGET,2,int16,0.080729,0.0,0.0,1.0,0.272419
2,CNT_CHILDREN,15,int16,0.417052,0.0,0.0,19.0,0.722121
3,AMT_INCOME_TOTAL,2548,float32,168797.921875,147150.0,25650.0,117000000.0,237123.140625
4,AMT_CREDIT,5603,float32,599025.9375,513531.0,45000.0,4050000.0,402490.78125
5,REGION_POPULATION_RELATIVE,81,float16,0.0,0.018845,0.00029,0.07250977,0.0
6,DAYS_BIRTH,17460,int16,-16036.995067,-15750.0,-25229.0,-7489.0,4363.988632
7,DAYS_EMPLOYED,12574,int32,63815.045904,-1213.0,-17912.0,365243.0,141275.766519
8,DAYS_REGISTRATION,5282,float16,,-4504.0,-24672.0,0.0,
9,DAYS_ID_PUBLISH,6168,int16,-2994.202373,-3254.0,-7197.0,0.0,1509.450419


### Reassemble the df_app_train with df_app_train_cleaned (PLEASE NOTE: df_app_train doesnt have missing values BUT Data type is not FORCED nor is an dummy encoding done, so it can be run thru the same preprocessing code which we can try to make as a lamda or reg function)

In [67]:
df_app_train_final = pd.concat([df_app_train_test_new, df_app_train_cleaned], axis = 1)

In [69]:
len(df_app_train_final.columns)

268

In [68]:
df_app_train_final.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,NAME_INCOME_TYPE,...,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_TYPE_SUITE_unknown,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,HOUSETYPE_MODE_unknown
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,Working,...,0,0,0,0,1,0,1,0,0,0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,State servant,...,0,0,0,0,0,0,1,0,0,0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,Working,...,0,0,0,0,1,0,0,0,0,1
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,Working,...,0,0,0,0,1,0,0,0,0,1
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,Working,...,0,0,0,0,1,0,0,0,0,1


### Debug dtype 'O' because obj not encoding

In [116]:
df_app_train_final.shape

(307511, 268)