In [2]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import f1_score

In [3]:
# Load dataset
credit_data = pd.read_csv('german_credit_data.csv')
credit_data

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...,...
995,995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,996,40,male,3,own,little,little,3857,30,car,good
997,997,38,male,2,own,little,,804,12,radio/TV,good
998,998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [4]:
credit_data.isna().sum().sort_values(ascending=False)

Checking account    394
Saving accounts     183
Unnamed: 0            0
Age                   0
Sex                   0
Job                   0
Housing               0
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

In [5]:
cat_features = credit_data.select_dtypes('object').columns.difference(['Risk'])
cat_features

Index(['Checking account', 'Housing', 'Purpose', 'Saving accounts', 'Sex'], dtype='object')

In [6]:
num_features = credit_data.select_dtypes(exclude='object').columns.difference(['Unnamed: 0'])

In [7]:
count = credit_data['Purpose'].value_counts()
threshold = 50
rep = count[count < threshold].index
credit_data['Purpose'] = credit_data['Purpose'].replace(rep, 'others')

In [8]:
credit_data['Purpose'].value_counts()

Purpose
car                    337
radio/TV               280
furniture/equipment    181
business                97
education               59
others                  46
Name: count, dtype: int64

In [9]:
credit_data['Checking account'].unique()

array(['little', 'moderate', nan, 'rich'], dtype=object)

In [10]:
# Random Imputation
credit_data['Checking account'] = credit_data['Checking account'].fillna(np.random.choice(pd.Series(['little','moderate','rich'])))

In [11]:
x = credit_data.iloc[:,:-1]
y = credit_data['Risk']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=23)

In [13]:
cat_pipeline = Pipeline([
    ('si', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder()),
    ('ss', StandardScaler(with_mean=False))
])

In [14]:
num_pipeline = Pipeline([
    ('si', SimpleImputer(strategy='median')),
    ('ss', StandardScaler(with_mean=False))
])

In [15]:
preprocessor = ColumnTransformer([
    ('cat_pipeline',cat_pipeline, cat_features),
    ('num_pipeline', num_pipeline, num_features)    
])

In [16]:
X_train_new = preprocessor.fit_transform(X_train)
X_test_new = preprocessor.transform(X_test)

In [17]:
le = LabelEncoder()
y_train_new  = le.fit_transform(y_train)
y_test_new = le.transform(y_test)

In [18]:
def evaluate_model(true, predicted):
    f1_sco = f1_score(true, predicted)
    return f1_sco

In [19]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "K-Nearest Classifier": KNeighborsClassifier(),
    "Support Vector Classifier": SVC()
}

model_list = []
f1_score_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train_new, y_train_new)
    
    y_train_pred = model.predict(X_train_new)
    y_test_pred = model.predict(X_test_new)
    
    model_train_f1_score = evaluate_model(y_train_new, y_train_pred)
    model_test_f1_score = evaluate_model(y_test_new, y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print("Model Performance on Train set:")
    print("F1_score : {:.2f}".format(model_train_f1_score))
    print("Model Performance on Test set:")
    print("F1_score : {:.2f}".format(model_test_f1_score))
    
    f1_score_list.append(model_test_f1_score)
    
    print("="*35)
    print("\n")
    

Logistic Regression
Model Performance on Train set:
F1_score : 0.83
Model Performance on Test set:
F1_score : 0.81


Decision Tree
Model Performance on Train set:
F1_score : 1.00
Model Performance on Test set:
F1_score : 0.75




Random Forest
Model Performance on Train set:
F1_score : 1.00
Model Performance on Test set:
F1_score : 0.80


AdaBoost
Model Performance on Train set:
F1_score : 0.85
Model Performance on Test set:
F1_score : 0.79


Gradient Boosting
Model Performance on Train set:
F1_score : 0.92
Model Performance on Test set:
F1_score : 0.80


K-Nearest Classifier
Model Performance on Train set:
F1_score : 0.86
Model Performance on Test set:
F1_score : 0.80


Support Vector Classifier
Model Performance on Train set:
F1_score : 0.87
Model Performance on Test set:
F1_score : 0.81




In [20]:
pd.DataFrame(list(zip(model_list, f1_score_list)),
             columns=["model", "F1_score"]).sort_values(by=["F1_score"], ascending=False)

Unnamed: 0,model,F1_score
0,Logistic Regression,0.813793
6,Support Vector Classifier,0.812903
2,Random Forest,0.804124
4,Gradient Boosting,0.804124
5,K-Nearest Classifier,0.8
3,AdaBoost,0.786207
1,Decision Tree,0.753731


## Random Forest

In [21]:
classify = RandomForestClassifier()
classify.fit(X_train_new, y_train_new)

y_test_pred_rf = classify.predict(X_test_new)
print(f"The f1_score of the model is : {f1_score(y_test_new, y_test_pred_rf)}")

The f1_score of the model is : 0.7986111111111113


## Logistic Regression

In [22]:
log_reg = LogisticRegression()
log_reg.fit(X_train_new, y_train_new)

y_test_pred_lr = log_reg.predict(X_test_new)
print(f"The f1_score of the model is : {f1_score(y_test_new, y_test_pred_lr)}")

The f1_score of the model is : 0.8137931034482758
