## Projet 7 : Mettre en œuvre un outil de “scoring crédit” pour calculer la probabilité qu’un client rembourse son crédit, puis classifier la demande en crédit accordé ou refusé.

### Partie-2 : Pré-traitement des données

  - [Pré-traitement des jeux de données](#Pré-traitement-des-jeux-de-données)
      * [1- application_train et application_test](#1--application_train-et-application_test)
      * [2- bureau et bureau_balance](#2--bureau-et-bureau_balance)
      * [3- POS_CASH_balance](#3--POS_CASH_balance)
      * [4- credit_card_balance](#4--credit_card_balance)
      * [5- installments_payments](#5--installments_payments)
      * [6- previous_application](#6--previous_application)
  - [Formats des jeux de données avant et après pré-traitement](#Formats-des-jeux-de-données-avant-et-aprés-pré-traitement)
  - [Jointure des jeux de données](#Jointure-des-jeux-de-données)
  - [Format et structure du nouveau jeu de données](#Format-et-structure-du-nouveau-jeu-de-données)
  - [Jeux d'entrainement et de test](#Jeux-d'entrainement-et-de-test)
  - [Valeurs manquantes et leur traitement](#Valuers-manquantes-et-leur-traitement)
  - [Imputation des valeurs manquantes](#Imputation-des-valeurs-manquantes)
  - [Amélioration de l'usage de la mémoire par nos variables](#Améleoration-de-l'usage-de-la-mémoire-par-nos-variables)
  - [Enregistrement des nouveaux jeux de données](#Enregistrement-des-nouveaux-jeux-de-données)
  


### Importer les modules nécessaires

In [1]:
import numpy as np
import pandas as pd

In [2]:
import gc
import time
import re
from contextlib import contextmanager

In [3]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
from sklearn.impute import SimpleImputer

In [5]:
from sklearn.metrics import   make_scorer ,roc_curve,  classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score,f1_score, precision_score, recall_score ,average_precision_score


In [6]:
import fonctionsUtiles as fctUtiles

In [7]:
# Execution time
@contextmanager
def timer(title):
    """Print how long the wrapped ``with`` block took, labelled with *title*.

    The message is printed only when the block finishes without raising.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print("{} - done in {:.0f}s".format(title, elapsed))

### Pré-traitement des jeux de données

###### 1- application_train et application_test

In [8]:
def application_train_test(num_rows = None, nan_as_category = False):
    """Load, merge and feature-engineer the application train/test tables.

    Reads ``data/application_train.csv`` and ``data/application_test.csv``,
    stacks them, drops the rows whose ``CODE_GENDER`` is ``'XNA'``, encodes
    the categorical columns and adds a few ratio features.

    Args:
        num_rows: Forwarded to ``pd.read_csv(nrows=...)`` for both files;
            ``None`` reads every row.
        nan_as_category: Forwarded to ``fctUtiles.one_hot_encoder``.

    Returns:
        The combined DataFrame indexed by ``SK_ID_CURR``.
    """
    # Read data and merge
    train_df = pd.read_csv('data/application_train.csv', nrows=num_rows)
    test_df = pd.read_csv('data/application_test.csv', nrows=num_rows)
    print("Format avant pré-traitement")
    print("Train samples: {}, test samples: {}".format(train_df.shape, test_df.shape))
    # ignore_index gives every row a unique label. The previous bare
    # `df.reset_index()` discarded its result, leaving duplicate labels.
    df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
    print("Data concatened: {}".format(df.shape))

    # The source frames are no longer needed once stacked.
    del train_df, test_df
    gc.collect()

    # Optional: Remove 4 applications with XNA CODE_GENDER (train set).
    # .copy() makes the result an independent frame so the column
    # assignments below never target a view of the unfiltered data.
    df = df[df['CODE_GENDER'] != 'XNA'].copy()

    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], _ = pd.factorize(df[bin_feature])
    # Categorical features with One-Hot encode
    df, _ = fctUtiles.one_hot_encoder(df, nan_as_category)

    # DAYS_EMPLOYED uses 365243 as a placeholder: turn it into NaN.
    # Assign the result back rather than calling replace(inplace=True) on the
    # selected column, which does not update `df` under pandas Copy-on-Write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

    # Some simple new features (percentages)
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

    df = df.set_index('SK_ID_CURR')

    print("Format aprés pré-traitement")
    print("Train-test samples: {}".format(df.shape))
    print("-"*100)

    return df

###### 2- bureau et bureau_balance

In [9]:
# Preprocess bureau.csv and bureau_balance.csv
def bureau_and_balance(nan_as_category = True):
    """Aggregate bureau.csv and bureau_balance.csv to one row per SK_ID_CURR.

    bureau_balance is first aggregated per ``SK_ID_BUREAU`` and joined onto
    bureau; the result is then aggregated per ``SK_ID_CURR`` three times:
    over all credits (``BURO_`` prefix), over credits flagged
    ``CREDIT_ACTIVE_Active`` (``ACTIVE_``) and over credits flagged
    ``CREDIT_ACTIVE_Closed`` (``CLOSED_``).

    Args:
        nan_as_category: Forwarded to ``fctUtiles.one_hot_encoder``.

    Returns:
        The aggregated DataFrame indexed by ``SK_ID_CURR``.
    """
    def _flat_names(frame, prefix=''):
        # Collapse (column, statistic) pairs into "<prefix><column>_<STATISTIC>".
        return pd.Index([prefix + name + "_" + stat.upper()
                         for name, stat in frame.columns.tolist()])

    bureau = pd.read_csv('data/bureau.csv')
    bb = pd.read_csv('data/bureau_balance.csv')

    print("Format avant pré-traitement")
    print("bureau samples: {}, bureau_balance : {} ".format(bureau.shape, bb.shape))

    bb, bb_cat = fctUtiles.one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = fctUtiles.one_hot_encoder(bureau, nan_as_category)

    # Bureau balance: aggregate per bureau credit, then attach to bureau.csv
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    bb_aggregations.update({col: ['mean'] for col in bb_cat})
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = _flat_names(bb_agg)
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
    del bb, bb_agg
    gc.collect()

    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Bureau and bureau_balance categorical features
    cat_aggregations = {cat: ['mean'] for cat in bureau_cat}
    cat_aggregations.update({cat + "_MEAN": ['mean'] for cat in bb_cat})

    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = _flat_names(bureau_agg, 'BURO_')

    # Active, then closed credits - using only numerical aggregations
    for status_flag, prefix in (('CREDIT_ACTIVE_Active', 'ACTIVE_'),
                                ('CREDIT_ACTIVE_Closed', 'CLOSED_')):
        subset = bureau[bureau[status_flag] == 1]
        subset_agg = subset.groupby('SK_ID_CURR').agg(num_aggregations)
        subset_agg.columns = _flat_names(subset_agg, prefix)
        bureau_agg = bureau_agg.join(subset_agg, how='left', on='SK_ID_CURR')
        del subset, subset_agg
        gc.collect()

    del bureau
    gc.collect()

    print("Format aprés pré-traitement")
    print("bureau_agg samples: {}".format(bureau_agg.shape))
    print("-"*100)
    return bureau_agg

###### 3- POS_CASH_balance

In [10]:
# Preprocess POS_CASH_balance.csv
def pos_cash(nan_as_category = True):
    """Aggregate POS_CASH_balance.csv to one row per SK_ID_CURR.

    Args:
        nan_as_category: Forwarded to ``fctUtiles.one_hot_encoder``.

    Returns:
        A DataFrame indexed by ``SK_ID_CURR`` whose columns are named
        ``POS_<column>_<STATISTIC>``, plus ``POS_COUNT`` (rows per client).
    """
    pos = pd.read_csv('data/POS_CASH_balance.csv')
    print("Format avant pré-traitement")
    print("POS_CASH_balance: {}".format(pos.shape))
    # Pass the caller's flag through; it used to be hard-coded to True,
    # which silently ignored the function argument.
    pos, cat_cols = fctUtiles.one_hot_encoder(pos, nan_as_category=nan_as_category)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']

    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()

    del pos
    gc.collect()

    print("Format aprés pré-traitement")
    print("pos_agg samples: {}".format(pos_agg.shape))
    print("-"*100)
    return pos_agg

###### 4- credit_card_balance

In [11]:
# Preprocess credit_card_balance.csv
def credit_card_balance(nan_as_category = True):
    """Aggregate credit_card_balance.csv to one row per SK_ID_CURR.

    Every remaining column is summarised with min, max, mean, sum and var.

    Args:
        nan_as_category: Forwarded to ``fctUtiles.one_hot_encoder``.

    Returns:
        A DataFrame indexed by ``SK_ID_CURR`` whose columns are named
        ``CC_<column>_<STATISTIC>``, plus ``CC_COUNT`` (rows per client).
    """
    cc = pd.read_csv('data/credit_card_balance.csv')

    print("Format avant pré-traitement")
    print("credit_card_balance samples: {}".format(cc.shape))
    # Pass the caller's flag through; it used to be hard-coded to True,
    # which silently ignored the function argument.
    cc, cat_cols = fctUtiles.one_hot_encoder(cc, nan_as_category=nan_as_category)
    # General aggregations (SK_ID_PREV is an identifier, not a feature)
    cc.drop(['SK_ID_PREV'], axis= 1, inplace = True)
    cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()

    del cc
    gc.collect()

    print("Format aprés pré-traitement")
    print("cc_agg samples: {}".format(cc_agg.shape))
    print("-"*100)
    return cc_agg

###### 5- installments_payments

In [12]:
# Preprocess installments_payments.csv
def installments_payments(nan_as_category = True):
    """Aggregate installments_payments.csv to one row per SK_ID_CURR.

    Adds payment ratio/difference and days-past-due / days-before-due
    features before aggregating.

    Args:
        nan_as_category: Forwarded to ``fctUtiles.one_hot_encoder``.

    Returns:
        A DataFrame indexed by ``SK_ID_CURR`` whose columns are named
        ``INSTAL_<column>_<STATISTIC>``, plus ``INSTAL_COUNT`` (rows per client).
    """
    ins = pd.read_csv('data/installments_payments.csv')
    print("Format avant pré-traitement")
    print("installments_payments samples: {}".format(ins.shape))
    # Pass the caller's flag through; it used to be hard-coded to True,
    # which silently ignored the function argument.
    ins, cat_cols = fctUtiles.one_hot_encoder(ins, nan_as_category=nan_as_category)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    # Days past due and days before due (no negative values).
    # Vectorised equivalent of `x if x > 0 else 0`: any value that is not
    # strictly positive, NaN included, becomes 0.
    days_late = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    days_early = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = days_late.where(days_late > 0, 0)
    ins['DBD'] = days_early.where(days_early > 0, 0)
    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()

    del ins
    gc.collect()

    print("Format aprés pré-traitement")
    print("ins_agg samples: {}".format(ins_agg.shape))
    print("-"*100)
    return ins_agg

###### 6- previous_application

In [13]:
# Preprocess previous_applications.csv
def previous_applications(nan_as_category = True):
    """Aggregate previous_application.csv to one row per SK_ID_CURR.

    Aggregations are computed over all previous applications (``PREV_``
    prefix), then over those flagged ``NAME_CONTRACT_STATUS_Approved``
    (``APPROVED_``) and ``NAME_CONTRACT_STATUS_Refused`` (``REFUSED_``),
    the last two using numeric features only.

    Args:
        nan_as_category: Forwarded to ``fctUtiles.one_hot_encoder``.

    Returns:
        The aggregated DataFrame indexed by ``SK_ID_CURR``.
    """
    prev = pd.read_csv('data/previous_application.csv')

    print("Format avant pré-traitement")
    print("previous_application samples: {}".format(prev.shape))

    # Pass the caller's flag through; it used to be hard-coded to True,
    # which silently ignored the function argument.
    prev, cat_cols = fctUtiles.one_hot_encoder(prev, nan_as_category=nan_as_category)
    # Days 365243 placeholder values -> nan.
    # Assign the result back rather than calling replace(inplace=True) on each
    # selected column, which does not update `prev` under pandas Copy-on-Write.
    day_columns = [
        'DAYS_FIRST_DRAWING',
        'DAYS_FIRST_DUE',
        'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE',
        'DAYS_TERMINATION',
    ]
    prev[day_columns] = prev[day_columns].replace(365243, np.nan)
    # Add feature: value ask / value received percentage
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Previous applications categorical features
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']

    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])

    # Approved, then refused applications - only numerical features
    for status_flag, prefix in (('NAME_CONTRACT_STATUS_Approved', 'APPROVED_'),
                                ('NAME_CONTRACT_STATUS_Refused', 'REFUSED_')):
        subset = prev[prev[status_flag] == 1]
        subset_agg = subset.groupby('SK_ID_CURR').agg(num_aggregations)
        subset_agg.columns = pd.Index([prefix + e[0] + "_" + e[1].upper() for e in subset_agg.columns.tolist()])
        prev_agg = prev_agg.join(subset_agg, how='left', on='SK_ID_CURR')
        # Release the per-status intermediates before the next pass.
        del subset, subset_agg
        gc.collect()

    del prev
    gc.collect()

    print("Format aprés pré-traitement")
    print("prev_agg samples: {}".format(prev_agg.shape))
    print("-"*100)
    return prev_agg

- Lancement des pré-traitements

###### Formats des jeux de données avant et aprés pré-traitement

In [14]:
# Run each preprocessing step with its default arguments; every call prints
# the table shapes before and after processing.
df_train_test = application_train_test()
df_bureau_and_balance = bureau_and_balance()
df_pos_cash = pos_cash()
df_installments_payments = installments_payments()
df_credit_card_balance = credit_card_balance()
# NOTE(review): previous_applications() is left commented out, so no
# PREV_/APPROVED_/REFUSED_ features reach the joined dataset - confirm intended.
#df_previous_applications = previous_applications()

Format avant pré-traitement
Train samples: (307511, 122), test samples: (48744, 121)
Data concatened: (356255, 122)
Format aprés pré-traitement
Train-test samples: (356251, 246)
----------------------------------------------------------------------------------------------------
Format avant pré-traitement
bureau samples: (1716428, 17), bureau_balance : (27299925, 3) 
Format aprés pré-traitement
bureau_agg samples: (305811, 116)
----------------------------------------------------------------------------------------------------
Format avant pré-traitement
POS_CASH_balance: (10001358, 8)
Format aprés pré-traitement
pos_agg samples: (337252, 18)
----------------------------------------------------------------------------------------------------
Format avant pré-traitement
installments_payments samples: (13605401, 8)
Format aprés pré-traitement
ins_agg samples: (339587, 26)
----------------------------------------------------------------------------------------------------
Format avant pré

- On remarque que le nombre de variables a augmenté pour chaque jeu de données
- On remarque aussi que le nombre d'observations a significativement baissé pour certains jeux de données

In [15]:
gc.collect()

0

###### Jointure des jeux de données

In [16]:
# Left-join every aggregated table onto the application data through
# SK_ID_CURR, then turn SK_ID_CURR back into a regular column.
df = (
    df_train_test
    .join(df_bureau_and_balance, how='left', on='SK_ID_CURR')
    .join(df_pos_cash, how='left', on='SK_ID_CURR')
    .join(df_installments_payments, how='left', on='SK_ID_CURR')
    .join(df_credit_card_balance, how='left', on='SK_ID_CURR')
    .reset_index()
)

In [17]:
# The per-table frames are now merged into `df`; drop them to release memory.
del df_train_test, df_bureau_and_balance, df_pos_cash, df_installments_payments, df_credit_card_balance
gc.collect()

0

###### Format et structure du nouveau jeu de données

In [18]:
# Build three small summary tables for the merged dataset and display them
# together. Judging by the output shown below, they hold the row/variable
# counts, the number of columns per dtype, and the duplicated column/row counts.
df1 = fctUtiles.formats(df, 'data global')
df2 = fctUtiles.vars_types(df)
df3 = fctUtiles.colunmLigneDuplicated(df,'')
dfs = {'Nouveau dataset': df1, 'Types de variables': df2, 'Colonnes/Lignes dupliquées':df3}
fctUtiles.display_dfs(dfs, justify='flex-start')
print('-'*90)
# The summaries are only needed for display.
del df1, df2, df3, dfs

Unnamed: 0,Nbre de lignes,Nbre de variables
data global,356251,548

Unnamed: 0,Objet,Float,Int,Bool
,0,506,42,0

Unnamed: 0,Colonnes dupliquées,Lignes dupliquées
,0,0


------------------------------------------------------------------------------------------


- Le nouveau jeu de données comporte 356251 observations avec 548 variables dont 506 de type float et 42 de type int

In [19]:
gc.collect()

0

In [20]:
# Normalise the column names: spaces and dashes become underscores, "_/_"
# collapses to "_", names are upper-cased, and finally every character outside
# [A-Za-z0-9_] is removed.
df.columns = [
    str(col).replace(" ", "_").replace("-", "_").replace("_/_", "_").upper()
    for col in df.columns
]
df = df.rename(columns=lambda col: re.sub('[^A-Za-z0-9_]+', '', col))

In [21]:
# The ratio features built during preprocessing can be +/-inf when a
# denominator is 0; treat those as missing values so they are handled with
# the other NaN below.
df = df.replace([np.inf, -np.inf], np.nan).copy()

###### Jeux d'entrainement et de test

- Nous allons séparer ce nouveau jeu de données en 2 parties : Jeu d'entrainement et jeu de test
- Ceci en fonction de la valeur de la variable **TARGET** : renseignée ou pas    

In [22]:
# Rows with a TARGET value form the training set; rows without one form the test set.
train_df = df.loc[df['TARGET'].notna()]
test_df = df.loc[df['TARGET'].isna()]

- Formats:

In [23]:
fctUtiles.formats(train_df, 'data train')

Unnamed: 0,Nbre de lignes,Nbre de variables
data train,307507,548


In [24]:
fctUtiles.formats(test_df, 'data test')

Unnamed: 0,Nbre de lignes,Nbre de variables
data test,48744,548


- Nous entraînerons, évaluerons et testerons nos modèles avec train_df
- Le jeu de données test_df sera consacré au test de notre meilleur modèle à travers l'API Flask qui sera déployée sur Heroku, et ceci depuis notre dashboard

###### Valuers manquantes et leur traitement

- Valeurs manquantes en nombre et en pourcentage

In [25]:
# Per-column missing-value summary of the training set; the output shown
# below lists a `count` and a `percent` for each column.
df_nan = fctUtiles.data_count_percent(train_df)
fctUtiles.display_dfs({'Valeurs manquantes':df_nan}, justify='flex-start')

Unnamed: 0,count,percent
SK_ID_CURR,0,0.000000
OCCUPATION_TYPE_REALTY_AGENTS,0,0.000000
OCCUPATION_TYPE_SALES_STAFF,0,0.000000
OCCUPATION_TYPE_SECRETARIES,0,0.000000
OCCUPATION_TYPE_SECURITY_STAFF,0,0.000000
...,...,...
CC_CNT_DRAWINGS_ATM_CURRENT_VAR,246814,80.262888
CC_AMT_DRAWINGS_ATM_CURRENT_VAR,246814,80.262888
CC_CNT_DRAWINGS_OTHER_CURRENT_VAR,246814,80.262888
CC_AMT_DRAWINGS_POS_CURRENT_VAR,246814,80.262888


- Nous constatons que certaines variables contiennent plus de 80 % de NaN
- Nous avons fait le choix de ne garder que les variables dont le pourcentage de NaN ne dépasse pas 20 %

In [26]:
# NOTE(review): presumably keeps only the columns whose share of missing
# values is at most 0.2, as the notebook text states - confirm in fctUtiles.ratio.
df_final = fctUtiles.ratio(train_df, 0.2)

In [27]:
df_nan = fctUtiles.data_count_percent(df_final)
fctUtiles.display_dfs({'Valeurs manquantes < 20% ':df_nan}, justify='flex-start')

Unnamed: 0,count,percent
SK_ID_CURR,0,0.000000
ORGANIZATION_TYPE_BUSINESS_ENTITY_TYPE_1,0,0.000000
ORGANIZATION_TYPE_BUSINESS_ENTITY_TYPE_2,0,0.000000
ORGANIZATION_TYPE_BUSINESS_ENTITY_TYPE_3,0,0.000000
ORGANIZATION_TYPE_CLEANING,0,0.000000
...,...,...
BURO_AMT_CREDIT_SUM_DEBT_MEAN,51380,16.708563
BURO_AMT_CREDIT_SUM_DEBT_MAX,51380,16.708563
DAYS_EMPLOYED_PERC,55374,18.007395
DAYS_EMPLOYED,55374,18.007395


In [28]:
fctUtiles.formats(df_final, 'Jeu de données final')

Unnamed: 0,Nbre de lignes,Nbre de variables
Jeu de données final,307507,291


- Nous sommes passés de 548 à 291 variables

In [29]:
# Only df_final is used from here on; test_df is kept because it is exported
# to CSV at the end of the notebook.
del df, train_df, df_nan
gc.collect()

0

###### Imputation des valeurs manquantes

In [30]:
# Median imputation of missing values.
# The `verbose` argument is no longer passed: it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where supplying it raises a TypeError.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

In [31]:
# Fit the imputer on df_final, then fill the missing values of that same frame.
df_final_imputed = imputer.fit(df_final).transform(df_final)

In [32]:
# Wrap the imputed values in a DataFrame again, reusing df_final's row index
# and column names.
data_cleaned = pd.DataFrame(df_final_imputed, 
                      index=df_final.index, 
                      columns=df_final.columns)

In [33]:
df_nan = fctUtiles.data_count_percent(data_cleaned)
fctUtiles.display_dfs({'Valeurs manquantes ':df_nan}, justify='flex-start')

Unnamed: 0,count,percent
SK_ID_CURR,0,0.0
DAYS_EMPLOYED_PERC,0,0.0
EMERGENCYSTATE_MODE_YES,0,0.0
EMERGENCYSTATE_MODE_NO,0,0.0
WALLSMATERIAL_MODE_WOODEN,0,0.0
...,...,...
NAME_HOUSING_TYPE_MUNICIPAL_APARTMENT,0,0.0
NAME_HOUSING_TYPE_HOUSE_APARTMENT,0,0.0
NAME_HOUSING_TYPE_CO_OP_APARTMENT,0,0.0
NAME_TYPE_SUITE_UNACCOMPANIED,0,0.0


###### Améleoration de l'usage de la mémoire par nos variables

In [34]:
# Rebind df_final to the memory-optimised version of the imputed data; the
# helper prints the memory usage before and after (see the output below).
df_final = fctUtiles.reduce_mem_usage(data_cleaned, verbose=True)

----------------------------------------------------------------------
Memory usage du dataframe: 685.06 MB
Memory usage après optimization: 190.62 MB
Diminution de 72.2%
----------------------------------------------------------------------


###### Enregistrement des nouveaux jeux de données

In [35]:
df_final.to_csv('./data/preprocessed_data/df_final.csv', index=False)

In [36]:
# NOTE(review): test_df is written as produced by the TARGET split: it was not
# passed through fctUtiles.ratio or the median imputer, so it does not have
# df_final's column set and may still contain NaN - confirm the consumer
# handles this.
test_df.to_csv('./data/preprocessed_data/test_df.csv', index=False)

In [37]:
# Draw 10 random rows of the final dataset.
# NOTE(review): no random_state is given, so each run picks different rows.
df_api = df_final.sample(10)

In [38]:
df_api

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,INSTAL_AMT_INSTALMENT_MEAN,INSTAL_AMT_INSTALMENT_SUM,INSTAL_AMT_PAYMENT_MIN,INSTAL_AMT_PAYMENT_MAX,INSTAL_AMT_PAYMENT_MEAN,INSTAL_AMT_PAYMENT_SUM,INSTAL_DAYS_ENTRY_PAYMENT_MAX,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,INSTAL_DAYS_ENTRY_PAYMENT_SUM,INSTAL_COUNT
217321,351803.0,0.0,1.0,0.0,1.0,0.0,135000.0,72000.0,7753.5,72000.0,...,15771.503906,583545.6,10662.254883,48765.511719,15771.503906,583545.6,-23.0,-602.0,-22269.0,37.0
245278,383874.0,0.0,0.0,0.0,1.0,0.0,81000.0,450000.0,22018.5,450000.0,...,13190.174805,184662.5,2798.370117,56511.179688,12486.046875,174804.7,-513.0,-719.5,-10075.0,14.0
229630,365970.0,0.0,0.0,1.0,0.0,0.0,270000.0,945000.0,52767.0,945000.0,...,26931.332031,1265773.0,18.629999,175725.0,26883.460938,1263523.0,-5.0,-786.5,-36963.0,47.0
57338,166455.0,0.0,0.0,1.0,0.0,0.0,90000.0,188460.0,9994.5,135000.0,...,3160.68335,69535.03,1350.0,9063.360352,3036.30957,66798.81,-671.0,-1686.0,-37094.0,22.0
113491,231624.0,0.0,1.0,0.0,0.0,0.0,180000.0,824823.0,24246.0,688500.0,...,8560.25293,410892.1,4.86,35483.039062,7836.259766,376140.5,-26.0,-1578.0,-75746.0,48.0
216634,351019.0,0.0,0.0,1.0,0.0,0.0,180000.0,156384.0,16420.5,135000.0,...,9684.605469,87161.45,3730.22998,30819.150391,9684.605469,87161.45,-96.0,-332.0,-2987.0,9.0
183286,312445.0,0.0,1.0,0.0,1.0,1.0,135000.0,835380.0,40189.5,675000.0,...,20578.378906,658508.1,0.405,143295.4375,20434.591797,653906.9,-796.0,-1316.0,-42120.0,32.0
63502,173643.0,0.0,1.0,0.0,0.0,0.0,225000.0,790830.0,57676.5,675000.0,...,6327.435059,75929.22,919.440002,11602.980469,5840.115234,70081.38,-140.0,-266.75,-3200.0,12.0
31082,136077.0,0.0,0.0,1.0,1.0,1.0,180000.0,824823.0,24246.0,688500.0,...,6356.571289,139844.6,2699.72998,14076.450195,6356.571289,139844.6,-214.0,-1994.0,-43878.0,22.0
211326,344898.0,0.0,1.0,0.0,1.0,0.0,31500.0,247275.0,17586.0,225000.0,...,15998.535156,175983.9,15998.535156,15998.535156,15998.535156,175983.9,-20.0,-175.875,-1935.0,11.0


In [39]:
df_api.to_csv('./data/preprocessed_data/df_api.csv', index=False)

### Fin