In [21]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pickle

In [22]:
df = pd.read_csv('data/german_credit_card.csv') 

In [23]:
df.sample(5).T

Unnamed: 0,330,316,920,861,440
Creditability,1,1,0,0,1
Account Balance,4,4,1,1,4
Duration of Credit (month),24,24,24,48,21
Payment Status of Previous Credit,4,1,1,2,4
Purpose,6,9,2,9,2
Credit Amount,1927,1559,3552,4308,2288
Value Savings/Stocks,5,1,1,1,1
Length of current employment,3,4,4,2,2
Instalment per cent,3,4,3,3,4
Sex & Marital Status,2,3,3,2,2


In [24]:
# Only nummerical columns can have outliers
cols = ['Duration of Credit (month)','Credit Amount','Age (years)']
for i in df[cols]:
    if abs(df[i].skew()) > 0.015:
        print(i,df[i].skew())
    else:
        print('NORMAL DISTRIBUTION -->> {}'.format(i))

Duration of Credit (month) 1.0941841715555418
Credit Amount 1.9495942869127831
Age (years) 1.0247120249859745


In [25]:
# For normal Distribution Features
for i in df[cols]:
    q3 = np.percentile(df[i],75)
    q1 = np.percentile(df[i],25)
    iqr = q3 - q1
    upper_limit = q1 + 1.5*iqr
    lower_limit = q1 - 1.5*iqr
    l = []
    m = set()
    for val,idx in zip(df[i],df[i].index):
        if (val > upper_limit) or (val < lower_limit):
            l.append(idx)
            df[i].iloc[idx] = np.nan
    print(f'{i} : {len(l)} | {round(len(l)/len(df[i])*100,2)} %')
    

Duration of Credit (month) : 173 | 17.3 %
Credit Amount : 175 | 17.5 %
Age (years) : 123 | 12.3 %


In [26]:
df['Duration of Credit (month)'].isnull().sum()

173

In [27]:
# random imputation
for i in cols:
    df[i][df[i].isnull()] = df[i].dropna().sample(df[i].isnull().sum()).values

In [28]:
X = df.iloc[:,1:]
y = df.iloc[:,0]

In [29]:
X_train,X_test,y_train,y_test = train_test_split(X,y,
                                                 random_state=120,
                                                 test_size=0.25,
                                                 shuffle=True)

In [30]:
rf = RandomForestClassifier(n_estimators=120,
                             max_depth=12,
                             max_leaf_nodes=20)

In [31]:
rf.fit(X_train,y_train)

In [32]:
y_pred = rf.predict(X_test)

In [33]:
score = f1_score(y_test,y_pred)
print(f'{round(score*100,2)} %')

84.37 %


In [34]:
pickle.dump(lor,open('pickle_model.pkl','wb'))