In [37]:
# import libraries 
import pandas as pd 
import numpy as np

import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler


In [6]:
# Show every row when a Series/DataFrame is displayed (None removes the row cap)
pd.options.display.max_rows = None

In [4]:
# Location of the raw application table, relative to the notebook's directory
filepath = '../data/application_train.csv'
# Read the whole CSV into a DataFrame using pandas' default parsing
data = pd.read_csv(filepath_or_buffer=filepath)

# Basic EDA

In [7]:
def basic_eda(df):
    """Print a quick exploratory summary of a DataFrame to stdout.

    Five sections are written in order, each introduced by a title and a
    dashed rule and followed by blank lines: the ``info`` report, the shape,
    ``describe`` statistics, per-column null counts, and the number of fully
    duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    rule = '-------------------------------------------'

    print("INFORMATION")
    print(rule)
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() would add a stray "None" line after the report.
    df.info(verbose=True)
    print('\n')

    # The remaining sections are plain values, so they share one print loop.
    # Each value is computed lazily, right before it is printed.
    sections = (
        ("SHAPE", lambda: df.shape),
        ("DESCRIBE", lambda: df.describe()),
        ("NULL COUNT", lambda: df.isnull().sum()),
        ("DUPLICATES", lambda: df.duplicated().sum()),
    )
    for title, compute in sections:
        print(title)
        print(rule)
        print(compute())
        print('\n')

    

In [8]:
# Print the exploratory summary for the loaded application table
basic_eda(data)

INFORMATION
-------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 122 columns):
 #    Column                        Dtype  
---   ------                        -----  
 0    SK_ID_CURR                    int64  
 1    TARGET                        int64  
 2    NAME_CONTRACT_TYPE            object 
 3    CODE_GENDER                   object 
 4    FLAG_OWN_CAR                  object 
 5    FLAG_OWN_REALTY               object 
 6    CNT_CHILDREN                  int64  
 7    AMT_INCOME_TOTAL              float64
 8    AMT_CREDIT                    float64
 9    AMT_ANNUITY                   float64
 10   AMT_GOODS_PRICE               float64
 11   NAME_TYPE_SUITE               object 
 12   NAME_INCOME_TYPE              object 
 13   NAME_EDUCATION_TYPE           object 
 14   NAME_FAMILY_STATUS            object 
 15   NAME_HOUSING_TYPE             object 
 16   REGION_POPULATION_RELATIVE    floa

In [10]:
# Class balance of the label column
data['TARGET'].value_counts()

TARGET
0    282686
1     24825
Name: count, dtype: int64

In [11]:
# Replace every missing value with the sentinel -999. This applies to all
# columns, so object (categorical) columns receive the integer -999 as well.
# Re-binding `data` to the returned frame avoids `inplace=True`, so the
# change is an explicit assignment rather than a hidden mutation.
data = data.fillna(-999)

In [16]:
# Split the column names by dtype: object columns are treated as categorical,
# every other column as numerical
cat_cols = [name for name, dtype in data.dtypes.items() if dtype == 'object']
num_cols = [name for name in data.columns if name not in cat_cols]

In [18]:
# Report how many columns fell into each group
print(
    f"Number of categorical columns : {len(cat_cols)}",
    f"Number of Numerical columns : {len(num_cols)}",
    sep="\n",
)


Number of categorical columns : 16
Number of Numerical columns : 106


In [21]:
# Preview the first rows of the categorical columns only
data.loc[:, cat_cols].head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,WEEKDAY_APPR_PROCESS_START,ORGANIZATION_TYPE,FONDKAPREMONT_MODE,HOUSETYPE_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE
0,Cash loans,M,N,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Laborers,WEDNESDAY,Business Entity Type 3,reg oper account,block of flats,"Stone, brick",No
1,Cash loans,F,N,N,Family,State servant,Higher education,Married,House / apartment,Core staff,MONDAY,School,reg oper account,block of flats,Block,No
2,Revolving loans,M,Y,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Laborers,MONDAY,Government,-999,-999,-999,-999
3,Cash loans,F,N,Y,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,Laborers,WEDNESDAY,Business Entity Type 3,-999,-999,-999,-999
4,Cash loans,M,N,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Core staff,THURSDAY,Religion,-999,-999,-999,-999


In [23]:
# Features are every column except the label; the label is kept separately
X = data.drop(columns='TARGET')
y = data['TARGET']

In [34]:
# One-hot encode the categorical features

# Cast every categorical column to str so the encoder sees a single value type
X[cat_cols] = X[cat_cols].astype(str)

# Encoder settings: omit the first level of each feature, return a dense array
encoder = OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore',
)

# Learn the category levels and encode in a single pass
X_encoded_array = encoder.fit_transform(X[cat_cols])

# Wrap the encoded array in a DataFrame that shares X's row index
X_encoded_df = pd.DataFrame(
    data=X_encoded_array,
    index=X.index,
    columns=encoder.get_feature_names_out(cat_cols),
)

# Swap the raw categorical columns for their encoded counterparts
X_encoded = pd.concat(
    objs=[X.drop(columns=cat_cols), X_encoded_df],
    axis=1,
)

# Preview the first few rows
X_encoded.head()

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,100002,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,-3648.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,100003,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,-1186.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,100004,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,-4260.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039,-9833.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,-4311.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Drop the customer identifier so it is not carried along as a feature.
# Re-binding (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run without raising KeyError once the column is already gone.
X_encoded = X_encoded.drop(columns=['SK_ID_CURR'], errors='ignore')

In [38]:

# Rescale every column with a default-configured MinMaxScaler
scaler = MinMaxScaler()
X_scaled_array = scaler.fit_transform(X_encoded)

# Restore the column labels and row index on the scaled values
X_scaled = pd.DataFrame(
    data=X_scaled_array,
    index=X_encoded.index,
    columns=X_encoded.columns,
)

# Preview the first few rows
X_scaled.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,0.0,0.001512,0.090287,0.099216,0.086892,0.256321,0.888839,0.045086,0.85214,0.705433,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.002089,0.311736,0.141676,0.279067,0.045016,0.477114,0.043648,0.951929,0.959566,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.000358,0.022472,0.029916,0.033572,0.134897,0.348534,0.046161,0.827335,0.648326,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000935,0.066837,0.118466,0.073562,0.107023,0.350846,0.038817,0.601451,0.661387,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000819,0.116854,0.088272,0.126882,0.39288,0.298591,0.03882,0.825268,0.519522,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
save_path = "../data/appl_train_benchmark_001.csv"

# Re-attach the label column to the scaled feature table
X_scaled["TARGET"] = y

# Write the combined table to disk, leaving out the row index
X_scaled.to_csv(path_or_buf=save_path, index=False)

print("CSV file 'appl_train_benchmark_001.csv' has been created successfully!")

CSV file 'appl_train_benchmark_001.csv' has been created successfully!


In [41]:
# Final dimensions as (rows, columns); when cells are run top to bottom the
# column count includes the TARGET column added in the save cell
X_scaled.shape

(307511, 235)