# Import Libaries

In [1]:
# dataset processing
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# other
import pickle

np.random.seed(1234)

# Import Dataset

In [2]:
training_source = 'https://raw.githubusercontent.com/TJMSambas/credit_score_classifier/master/Data/german_credit.csv'
df = pd.read_csv(training_source)

In [3]:
df.head(3)

Unnamed: 0,default,account_check_status,duration_in_month,credit_history,purpose,credit_amount,savings,present_emp_since,installment_as_income_perc,personal_status_sex,...,present_res_since,property,age,other_installment_plans,housing,credits_this_bank,job,people_under_maintenance,telephone,foreign_worker
0,0,< 0 DM,6,critical account/ other credits existing (not ...,domestic appliances,1169,unknown/ no savings account,.. >= 7 years,4,male : single,...,4,real estate,67,none,own,2,skilled employee / official,1,"yes, registered under the customers name",yes
1,1,0 <= ... < 200 DM,48,existing credits paid back duly till now,domestic appliances,5951,... < 100 DM,1 <= ... < 4 years,2,female : divorced/separated/married,...,2,real estate,22,none,own,1,skilled employee / official,1,none,yes
2,0,no checking account,12,critical account/ other credits existing (not ...,(vacation - does not exist?),2096,... < 100 DM,4 <= ... < 7 years,2,male : single,...,3,real estate,49,none,own,1,unskilled - resident,2,none,yes


In [4]:
categorical = list(df.dtypes[df.dtypes == 'object'].index)
numerical = list(df.dtypes[df.dtypes != 'object'].index)

# Data Preprocessing

In [5]:
target = ['default']
predictors = list(set(df.columns) - set(target))
num_predictors = list(set(predictors) - set(categorical))
cat_predictors = list(set(predictors) - set(numerical))

In [25]:
num_predictors

['credit_amount',
 'installment_as_income_perc',
 'credits_this_bank',
 'present_res_since',
 'age',
 'people_under_maintenance',
 'duration_in_month']

In [26]:
cat_predictors

['job',
 'present_emp_since',
 'other_installment_plans',
 'credit_history',
 'personal_status_sex',
 'foreign_worker',
 'other_debtors',
 'telephone',
 'savings',
 'property',
 'purpose',
 'account_check_status',
 'housing']

In [6]:
X = df[predictors]
y = df[target]

## Categorical Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

In [8]:
d = defaultdict(LabelEncoder)

### Fitting

In [9]:
fit = df[cat_predictors].apply(lambda x: d[x.name].fit_transform(x))

### Transforming

In [10]:
X_encoded = X[cat_predictors].apply(lambda x: d[x.name].transform(x))

## Numerical Encoding

In [11]:
from sklearn.preprocessing import MinMaxScaler

In [12]:
scaler = MinMaxScaler()

In [13]:
X_scaled = scaler.fit_transform(X[num_predictors])

  return self.partial_fit(X, y)


## Final Data

In [14]:
X_full = np.concatenate((X_scaled, X_encoded), axis = 1)

## Data Splitting

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_full, y, test_size=0.25, random_state=42)

# Modeling

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
LogReg = LogisticRegression()

## Fitting

In [19]:
LogReg.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

## Testing

In [20]:
from sklearn.metrics import roc_auc_score

In [21]:
print('LogReg AUC score (Train): %.4f' %roc_auc_score(y_train,LogReg.predict(X_train)))
print('LogReg AUC score (Test): %.4f' %roc_auc_score(y_test,LogReg.predict(X_test)))

LogReg AUC score (Train): 0.6341
LogReg AUC score (Test): 0.5897


## Saving

In [22]:
pickle.dump(LogReg, open('LogReg.pkl','wb'))

In [23]:
pickle.dump(d, open('label_dictionary.pkl','wb'))

In [24]:
pickle.dump(scaler, open('scaler.pkl','wb'))