# Prepare Data 

## Import needed libraries

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Set seed for reproducibility

We set a seed so that the `train_test_split` function always produces the same split.

In [5]:
# One fixed seed shared by every stochastic step in this notebook.
SEED = 27

# Seed NumPy's global (legacy) random generator with it.
np.random.seed(seed=SEED)

## Load data

We load the train and test datasets provided by Kaggle from the `datasets/` directory and store them in dataframes.

In [11]:
# Read the two Kaggle CSV files (training set first, then test set) into dataframes.
train_full, test = map(
    pd.read_csv,
    ('./datasets/cs-training.csv', './datasets/cs-test.csv'),
)

## Change features

After putting both dataframes in a list, we can modify them at the same time:
- Name the first unnamed column 'Id'
- Replace all NaN values with 0
- Create 'NumberOfTimes30DaysOrMoreLate' feature by adding 'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTime60-89DaysPastDueNotWorse' and 'NumberOfTimes90DaysLate'
- Create 'IncomePerPerson' feature which is equal to 'MonthlyIncome'/('NumberOfDependents'+1)
- Create 'MonthlyDebt' feature by multiplying 'DebtRatio' and 'MonthlyIncome'. We noticed some implausibly large 'DebtRatio' values, so we decided that if 'DebtRatio' >= 5 then 'MonthlyDebt' takes the value of 'DebtRatio' directly
- Create 'MonthlyBalance' feature equal to 'MonthlyIncome' - 'MonthlyDebt'

In [12]:
# Apply the same cleaning and feature engineering to the train and test frames.
# Every step below mutates the frame in place, so `train_full` and `test`
# themselves are updated (the loop variable is only an alias for each of them).
dataframes = [train_full, test]
for df in dataframes:
    # Name the first column 'Id' through the supported rename API. Writing into
    # `df.columns.values` bypasses pandas: Index objects are meant to be
    # immutable, so label lookups are not guaranteed to see such a write.
    df.rename(columns={df.columns[0]: "Id"}, inplace=True)

    # Missing values in every column become 0.
    df.fillna(0, inplace=True)

    # Total count of late payments across the three lateness buckets.
    df['NumberOfTimes30DaysOrMoreLate'] = (
        df['NumberOfTime30-59DaysPastDueNotWorse']
        + df['NumberOfTime60-89DaysPastDueNotWorse']
        + df['NumberOfTimes90DaysLate']
    )

    # Income shared between the borrower and their dependents (+1 for the borrower).
    df['IncomePerPerson'] = df['MonthlyIncome'] / (df['NumberOfDependents'] + 1)

    # DebtRatio * MonthlyIncome gives the monthly debt; when DebtRatio >= 5 the
    # ratio value itself is used as the debt amount instead.
    df['MonthlyDebt'] = np.where(
        df['DebtRatio'] >= 5,
        df['DebtRatio'],
        df['DebtRatio'] * df['MonthlyIncome'],
    )

    # What is left of the income once the monthly debt is paid.
    df['MonthlyBalance'] = df['MonthlyIncome'] - df['MonthlyDebt']

## Train / Val Split

In [20]:
# Fraction of the labelled data held out for validation.
VAL_SIZE = 0.3

# Pass the seed explicitly so the split is identical on every run of this cell,
# regardless of how much of the global NumPy random state was consumed by
# earlier cells or by re-running cells out of order.
train, val = train_test_split(train_full, test_size=VAL_SIZE, random_state=SEED)

We check the shapes of the datasets to confirm that the split went as expected.

In [21]:
# Report the (rows, columns) shape of the full training set and of both splits,
# one per line.
print(
    f"train_full shape: {train_full.shape}",
    f"train shape: {train.shape}",
    f"val shape: {val.shape}",
    sep="\n",
)

train_full shape: (150000, 16)
train shape: (105000, 16)
val shape: (45000, 16)


## Save Data

In [18]:
# Write each prepared frame to its CSV file, without the index column.
# The write order matches the listing order below.
frames_to_save = (
    (train_full, './datasets/trainfull.csv'),
    (test, './datasets/test.csv'),
    (train, './datasets/train.csv'),
    (val, './datasets/val.csv'),
)
for frame, out_path in frames_to_save:
    frame.to_csv(out_path, index=False)