In [7]:
import pandas as pd

# Read the credit data set from the CSV file in the working directory, then
# preview the first five rows to check that the columns were parsed as expected.
credit_df = pd.read_csv("MyCreditData.csv")
credit_df.head(5)

Unnamed: 0,checking_account,duration,credit_history,purpose,amount,savings_account,employment_duration,installment_rate,other_debtors,present_residence,...,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,gender,profit
0,3,18,0,2,1049,4,2,2,2,3,...,21,1,0,0,1,0,0,0,female,242
1,3,9,0,5,2799,4,0,1,2,0,...,36,1,0,1,1,1,0,0,male,596
2,0,12,4,8,841,0,1,1,2,3,...,23,1,0,0,3,0,0,0,female,25
3,3,12,0,5,2122,4,0,0,2,0,...,39,1,0,1,3,1,0,1,male,568
4,3,12,0,5,2171,4,0,2,2,3,...,38,0,2,1,3,0,0,1,male,782


In [8]:
# Every column except the ones listed here holds category codes/labels rather
# than quantities, so it is converted to pandas' categorical dtype.
non_categorical_columns = ['duration', 'amount', 'age', 'profit']
categorical_columns = [
    name for name in credit_df.columns if name not in non_categorical_columns
]
for column in categorical_columns:
    credit_df[column] = pd.Categorical(credit_df[column])

We now create a binary dependent variable, *is_profitable*, indicating whether *profit* is positive:

In [11]:
import numpy as np

# Flag each row with 1 when its profit is strictly positive and 0 otherwise.
has_positive_profit = credit_df['profit'] > 0
credit_df["is_profitable"] = np.where(has_positive_profit, 1, 0)

In [12]:
# Preview the first five rows to confirm the new is_profitable column was added.
credit_df.head(5)

Unnamed: 0,checking_account,duration,credit_history,purpose,amount,savings_account,employment_duration,installment_rate,other_debtors,present_residence,...,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,gender,profit,is_profitable
0,3,18,0,2,1049,4,2,2,2,3,...,1,0,0,1,0,0,0,female,242,1
1,3,9,0,5,2799,4,0,1,2,0,...,1,0,1,1,1,0,0,male,596,1
2,0,12,4,8,841,0,1,1,2,3,...,1,0,0,3,0,0,0,female,25,1
3,3,12,0,5,2122,4,0,0,2,0,...,1,0,1,3,1,0,1,male,568,1
4,3,12,0,5,2171,4,0,2,2,3,...,0,2,1,3,0,0,1,male,782,1


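As before, we prepare the data for modeling: we create dummy variables for the categorical features, standardize the numeric features, and split the data into training and validation sets.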

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target and feature matrix. The two outcome columns are excluded by name
# rather than by position, so the selection does not silently pick the wrong
# columns if the column order of credit_df ever changes.
y = credit_df['is_profitable']
X = credit_df.drop(columns=['profit', 'is_profitable'])

# Use dummy variables for categorical variables
X = pd.get_dummies(X, drop_first=True)

# split into 70% training 30% validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=1)

# Standardize our non-dummy variables. The scaler is fitted on the training
# rows only: fitting it on all rows before splitting would let the validation
# rows influence the mean/std used to transform the training data (leakage).
numeric_columns = ['duration', 'amount', 'age']
scaler = StandardScaler()
scaler.fit(X_train[numeric_columns])

# Work on explicit copies so the column assignments below modify only the
# split frames (and avoid chained-assignment warnings on some pandas versions).
X_train = X_train.copy()
X_val = X_val.copy()
X_train[numeric_columns] = scaler.transform(X_train[numeric_columns])
X_val[numeric_columns] = scaler.transform(X_val[numeric_columns])

# Keep the full feature matrix standardized as well (with the training-set
# statistics) so any later cell that uses X sees the same scale as X_train/X_val.
X[numeric_columns] = scaler.transform(X[numeric_columns])