In [1]:
# import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pandas_profiling import ProfileReport

In [2]:
# read the raw dataset from disk into a DataFrame
raw_csv_path = '../data/raw.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Drop the unnecessary index column (the first column of the loaded frame).
# `columns=` is used instead of a positional axis (`df.drop(label, 1)`):
# passing `axis` positionally was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.
# NOTE: this cell rebinds `df`, so it must be run exactly once after the load
# cell; running it a second time would drop another column and the column
# assignment at the bottom would then fail with a length mismatch.
df = df.drop(columns=df.columns[0])

# Add column names (one name per remaining column, in file order)
new_col_names = ['Sex',
                 'Age',
                 'Debt',
                 'Married',
                 'BankCustomer',
                 'EducationLevel',
                 'Ethnicity',
                 'YearsEmployed',
                 'PriorDefault',
                 'Employed',
                 'CreditScore',
                 'DriversLicense',
                 'Citizen',
                 'ZipCode',
                 'Income',
                 'Approved'
                ]

df.columns = new_col_names

In [4]:
# Split the data into features (X) and target (y), then hold out a test set.
# `columns=` replaces the positional axis argument (`df.drop([...], 1)`), which
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
X = df.drop(columns=['Approved'])
# Double brackets keep y as a one-column DataFrame rather than a Series.
y = df[['Approved']]
# 20% of the rows are held out for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Create the pandas profiling report on the training split only.
# Re-attach the target to the training features so the report covers both.
train_df = X_train.join(y_train)
# Profile `train_df`, not the full `df`: the original built `train_df` but never
# used it, so held-out test rows were included in the exploratory analysis.
# NOTE(review): any figures previously quoted from the full-data report should
# be re-checked after re-running this cell.
profile = ProfileReport(train_df, title='Pandas Profiling Report')

In [6]:
# display the profiling report inside the notebook (embedded as an iframe)
profile.to_notebook_iframe()

# Initial observations
- There are no missing values, but there are many features with zeroes. We should inspect these to ensure they are not errors.
- `Age` is currently a categorical feature, but should be numeric.
- `Married` is highly correlated with `BankCustomer`. We may need to remove one of these features (or combine them) to avoid issues of multicollinearity in our models.
- There are categorical features (e.g. `Citizen` and `BankCustomer`) that will need to be transformed through one-hot encoding (OHE) or another encoding method.
- The numeric features have different scales and will need to be standardized or normalized to work with some of the models we are considering.
- The dataset is reasonably balanced, with `Approved` having 55.5% of data in one class and 44.5% in the other class.