In [None]:
# IMPORTING NECESSARY LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import roc_curve, roc_auc_score

%matplotlib inline
sns.set_style("darkgrid")


# IMPORTING THE DATASET
# The first CSV column is used as the row index.
df_loan = pd.read_csv('german_credit_data.csv', index_col=0)
df_loan.head()



# Exploratory data analysis (EDA)
# Every axes-level plot opens its own figure so that charts do not draw over
# each other when the notebook cells are executed back-to-back as a script.
# x / y are always passed by keyword: recent seaborn releases accept only
# `data` positionally, so sns.countplot("Risk", data=...) collides with `data`.

# Age
sns.displot(df_loan["Age"], kde=True)
plt.xlabel('Age', fontsize=15)

plt.figure(figsize=(10, 4))
df_loan[df_loan['Risk'] == "bad"]['Age'].hist(alpha=0.9, color='red',
                                              bins=30, label='Bad Risk')
df_loan[df_loan['Risk'] == "good"]['Age'].hist(alpha=0.6, color='blue',
                                               bins=30, label='Good Risk')
plt.legend()
plt.xlabel('Age', fontsize=15)

plt.figure()
sns.boxplot(x="Risk", y="Age", data=df_loan)
# The x-axis carries the Risk groups; Age is on the y-axis.
plt.xlabel('Risk', fontsize=15)
plt.ylabel('Age', fontsize=15)

# Credit Amount
sns.displot(df_loan["Credit amount"])
plt.xlabel('Credit amount', fontsize=15)

plt.figure(figsize=(10, 4))
df_loan[df_loan['Risk'] == "bad"]['Credit amount'].hist(alpha=0.9, color='red',
                                                        bins=30, label='Bad Risk')
df_loan[df_loan['Risk'] == "good"]['Credit amount'].hist(alpha=0.6, color='green',
                                                         bins=30, label='Good Risk')
plt.legend()
plt.xlabel('Credit Amount', fontsize=15)

plt.figure()
sns.boxplot(x="Risk", y="Credit amount", data=df_loan)
plt.xlabel('Risk', fontsize=15)
plt.ylabel('Credit amount', fontsize=15)

## Duration
plt.figure()
sns.boxplot(x="Risk", y="Duration", data=df_loan)
plt.xlabel('Risk', fontsize=15)
plt.ylabel('Duration', fontsize=15)

plt.figure(figsize=(10, 4))
df_loan[df_loan['Risk'] == "bad"]['Duration'].hist(alpha=0.7, color='red',
                                                   bins=30, label='Bad Risk')
df_loan[df_loan['Risk'] == "good"]['Duration'].hist(alpha=0.6, color='blue',
                                                    bins=30, label='Good Risk')
plt.legend()
plt.xlabel('Duration', fontsize=15)

## Risk
plt.figure(figsize=(5, 5))
sns.countplot(x="Risk", data=df_loan)
plt.xlabel('Risk', fontsize=15)

## Sex
plt.figure(figsize=(5, 5))
sns.countplot(x="Sex", data=df_loan, hue="Risk")
plt.xlabel('Sex', fontsize=15)

## Checking accounts
plt.figure(figsize=(7, 5))
sns.boxplot(x="Checking account", y="Credit amount", data=df_loan)
plt.xlabel('Checking account', fontsize=15)

plt.figure(figsize=(7, 5))
sns.countplot(x="Checking account", data=df_loan, hue="Risk")
plt.xlabel('Checking account', fontsize=15)

## Saving accounts
plt.figure()
sns.countplot(x="Saving accounts", data=df_loan, hue="Risk")
plt.xlabel('Saving accounts', fontsize=15)

## Housing
plt.figure()
sns.countplot(x="Housing", data=df_loan, hue="Risk")
plt.xlabel('Housing', fontsize=15)



# PRE-PROCESSING
# Categorical columns are label-encoded (one integer code per category); this
# is NOT one-hot encoding. Every encoding is assigned back to the column:
# `df[col].replace(..., inplace=True)` / `df[col].fillna(..., inplace=True)`
# act on a temporary Series under pandas Copy-on-Write and would leave
# `df_loan` unchanged.


def _risk_share_by(column):
    """Return the percentage split of Risk within each category of `column`.

    The result has one row per category of `column` and one column per Risk
    value, rounded to two decimals.
    """
    shares = pd.crosstab(df_loan['Risk'], df_loan[column]).apply(
        lambda counts: counts / counts.sum() * 100)
    return shares.round(2).T


def _encode_categories(column, codes):
    """Replace the labels of `df_loan[column]` by the integers in `codes`.

    Raises
    ------
    ValueError
        If the column holds a label (or a missing value) that `codes` does
        not cover, instead of silently passing it on to the scaler.
    """
    encoded = df_loan[column].map(codes)
    unmapped = df_loan.loc[encoded.isna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected values in {column!r}: {list(unmapped)}")
    df_loan[column] = encoded.astype(int)


## Sex
df_loan["Sex"].value_counts()
# cross table for the 'Sex' feature
cross_sex_transposed = _risk_share_by("Sex")
cross_sex = cross_sex_transposed.T
cross_sex_transposed
# Label encoding: 1 - male, 0 - any other value
df_loan["Sex"] = (df_loan["Sex"] == "male").astype(int)
df_loan["Sex"].head()

## Job
# Where; 0 - unskilled and non-resident, 1 - unskilled and resident, 2 - skilled, 3 - highly skilled
df_loan["Job"].head()

## Housing
df_loan["Housing"].value_counts()
# cross table for the 'Housing' feature
cross_housing_transposed = _risk_share_by("Housing")
cross_housing = cross_housing_transposed.T
cross_housing_transposed
# Label encoding: 1 - own, 2 - rent, 0 - free
_encode_categories("Housing", {"own": 1, "rent": 2, "free": 0})
df_loan["Housing"].head()

## Purpose
df_loan["Purpose"].value_counts(normalize=True)
# Merge the small categories before encoding.
df_loan["Purpose"] = df_loan["Purpose"].replace(
    ["repairs", "radio/TV", "vacation/others"], "others")
df_loan["Purpose"] = df_loan["Purpose"].replace(
    ["furniture/equipment", "domestic appliances"], "domestic equipments")
df_loan["Purpose"].value_counts()
# cross table for the 'Purpose' feature
cross_purpose_transposed = _risk_share_by("Purpose")
cross_purpose_transposed
# Label encoding: 0 - others, 1 - business, 2 - car, 3 - domestic equipments, 4 - education
_encode_categories("Purpose", {"others": 0, "business": 1, "car": 2,
                               "domestic equipments": 3, "education": 4})
df_loan.head()

## Saving Accounts
df_loan["Saving accounts"].value_counts(normalize=True)
# Missing values become their own "None" category.
df_loan["Saving accounts"] = df_loan["Saving accounts"].fillna("None")
df_loan["Saving accounts"].value_counts(normalize=True)
# cross table for the 'Saving accounts' feature
cross_saving_transposed = _risk_share_by("Saving accounts")
cross_saving_transposed
# Label encoding: 0 - None, 1 - little, 2 - moderate, 3 - rich, 4 - quite rich
# NOTE(review): "quite rich" is coded above "rich"; if the codes are meant to
# be ordinal this order looks swapped - confirm against the data dictionary.
_encode_categories("Saving accounts", {"little": 1, "None": 0, "moderate": 2,
                                       "quite rich": 4, "rich": 3})
df_loan["Saving accounts"].head()

## Checking Account
df_loan["Checking account"].value_counts(normalize=True)
df_loan["Checking account"] = df_loan["Checking account"].fillna("None")
df_loan["Checking account"].value_counts(normalize=True)
# cross table for the 'Checking account' feature
cross_checking_transposed = _risk_share_by("Checking account")
cross_checking_transposed
# Label encoding: 0 - None, 1 - little, 2 - moderate, 3 - rich
_encode_categories("Checking account", {"little": 1, "None": 0, "moderate": 2, "rich": 3})
df_loan["Checking account"].head()
df_loan.head()

## Preprocessing the dependent variable - Risk
df_loan["Risk"].value_counts(normalize=True)
## Encoding the dependent variable: 1 - bad, 0 - any other value
df_loan["Risk_Status"] = (df_loan["Risk"] == "bad").astype(int)
df_loan["Risk_Status"].head()
df_loan.head()
df_loan.tail()
df_loan.drop("Risk", axis=1, inplace=True)
df_loan.head()


### Train test split
# The split is done BEFORE scaling so that the scaler statistics come from
# the training rows only; fitting the scaler on all rows leaks information
# about the test set into the features.
_features = df_loan.drop("Risk_Status", axis=1)
y = df_loan["Risk_Status"]
_X_train_raw, _X_test_raw, y_train, y_test = train_test_split(
    _features, y, test_size=0.3, random_state=101)


## Scaling the dataset
scaler = StandardScaler()
scaler.fit(_X_train_raw)


def _scaled_frame(frame):
    """Return `frame` standardised by `scaler`, keeping its columns and index."""
    return pd.DataFrame(scaler.transform(frame),
                        columns=frame.columns, index=frame.index)


X_train = _scaled_frame(_X_train_raw)
X_test = _scaled_frame(_X_test_raw)

# Whole data set scaled with the training statistics (kept for inspection).
scaled_features = scaler.transform(_features)
scaled_features
df_feat = pd.DataFrame(scaled_features, columns=_features.columns,
                       index=_features.index)
df_feat.head()
X = df_feat




# BUILDING THE MODELS


# KNN Model
# Baseline: 5 neighbours with the euclidean metric, fitted on the training
# split and evaluated on the held-out test split.

knn = KNeighborsClassifier(n_neighbors=5, metric = "euclidean")
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

### Evaluation of the KNN model
print(confusion_matrix(y_test, pred))
print("===========================================================")
print(classification_report(y_test, pred))
print(f"Train Accuracy : {knn.score(X_train, y_train):.3f}")
print(f"Test Accuracy : {knn.score(X_test, y_test):.3f}")

### Exploring the model using Grid search cv
# 10-fold cross-validated search over neighbours, metric, weights and p,
# scored by accuracy.
knn_gs = KNeighborsClassifier()
parameters = {
    "n_neighbors": [2, 3, 21, 22],
    "metric": ["minkowski", "euclidean"],
    "weights": ["uniform", "distance"],
    "p": [1,2,3,4,5]
}

grid_search = GridSearchCV(estimator = knn_gs, 
                           param_grid = parameters, 
                           scoring = "accuracy", 
                           cv = 10)

grid_search.fit(X_train, y_train)
grid_search.best_params_
accuracy = grid_search.best_score_
accuracy
# The "tuned" model is built from hard-coded values rather than from
# grid_search.best_params_.
# NOTE(review): presumably copied from an earlier search run - confirm they
# still match the search result.
knn_model = KNeighborsClassifier(n_neighbors = 21, p = 2, weights = "uniform")
knn_model.fit(X_train, y_train)
print(f"Train Accuracy : {knn_model.score(X_train, y_train):.3f}")
print(f"Test Accuracy : {knn_model.score(X_test, y_test):.3f}")
pred = knn_model.predict(X_test)
print(confusion_matrix(y_test, pred))
print("===========================================================")
print(classification_report(y_test, pred))

### Learning curve
# ShuffleSplit is already imported at the top of the file; this repeats it.
from sklearn.model_selection import ShuffleSplit
# This splitter is rebound to a new ShuffleSplit before the learning curve is
# drawn further down, so this particular instance is never used.
cv = ShuffleSplit(test_size =0.2, random_state = 101)
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean training and cross-validation scores against training size.

    Opens a new figure, runs sklearn's ``learning_curve`` and draws, for both
    the training and the cross-validation scores, the mean across folds as a
    line and a +/- one standard deviation band.

    Parameters
    ----------
    estimator : estimator forwarded unchanged to ``learning_curve``.
    title : figure title.
    X, y : features and target forwarded to ``learning_curve``.
    ylim : optional ``(ymin, ymax)`` pair applied to the y-axis.
    cv : cross-validation strategy forwarded to ``learning_curve``.
    n_jobs : number of parallel jobs forwarded to ``learning_curve``.
    train_sizes : training-set sizes forwarded to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so the caller can call ``.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Accuracy Scores")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One row per training size, one column per CV split.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Switch the grid on explicitly: a bare plt.grid() TOGGLES visibility, so
    # with a style that already shows a grid (seaborn "darkgrid") it would
    # turn the grid off.
    plt.grid(True)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1, color="g")

    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


# Learning curve of the tuned KNN model, computed on the training split only.
X, y = X_train, y_train

title = "Learning Curve (Tuned K-Nearest Neighbors)"
cv = ShuffleSplit(test_size=0.2, random_state=0)
plot_learning_curve(knn_model, title, X, y, ylim=(0.5, 1.02), cv=cv, n_jobs=4)
plt.show()





# Logistic Regression
# Baseline model with default settings.

logmode = LogisticRegression()
logmode.fit(X_train, y_train)
predictions = logmode.predict(X_test)
logmode.intercept_, logmode.coef_

# Model Evaluation
print(f"Train Accuracy : {logmode.score(X_train, y_train):.3f}")
print(f"Test Accuracy : {logmode.score(X_test, y_test):.3f}")
confusion_matrix(y_test, predictions)
print(classification_report(y_test, predictions))

## Parameter tuning
log_gs = LogisticRegression()

# Only solver/penalty pairs that can actually be fitted are searched:
#   - newton-cg, lbfgs and sag support the l2 penalty but not l1;
#   - liblinear and saga support both l1 (Lasso) and l2 (Ridge);
#   - elasticnet needs an l1_ratio, which this search never supplied.
# A single full cross-product grid made every unsupported pair fail to fit
# (or abort the search, depending on the scikit-learn version).
_shared_grid = {
    "C": np.logspace(-4, 4, 20),
    "max_iter": [100, 10],
}
parameter = [
    {**_shared_grid, "solver": ["newton-cg", "lbfgs", "sag"], "penalty": ["l2"]},
    {**_shared_grid, "solver": ["liblinear", "saga"], "penalty": ["l1", "l2"]},
]

grid_search = GridSearchCV(log_gs, parameter, scoring="accuracy",
                           cv=10, verbose=True, n_jobs=1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
accuracy = grid_search.best_score_; accuracy
grid_search.best_estimator_

# The tuned model uses hard-coded values rather than grid_search.best_params_.
# NOTE(review): presumably copied from an earlier search run - confirm.
log_model = LogisticRegression(C = 0.004832930238571752, solver='liblinear', max_iter = 100)
log_model.fit(X_train, y_train)
print(f"Train Accuracy : {log_model.score(X_train, y_train):.3f}")
print(f"Test Accuracy : {log_model.score(X_test, y_test):.3f}")
pred = log_model.predict(X_test)

print(confusion_matrix(y_test, pred))
print("===========================================================")
print(classification_report(y_test, pred))

### Learning Curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean training and cross-validation scores against training size.

    Opens a new figure, runs sklearn's ``learning_curve`` and draws, for both
    the training and the cross-validation scores, the mean across folds as a
    line and a +/- one standard deviation band.

    Parameters
    ----------
    estimator : estimator forwarded unchanged to ``learning_curve``.
    title : figure title.
    X, y : features and target forwarded to ``learning_curve``.
    ylim : optional ``(ymin, ymax)`` pair applied to the y-axis.
    cv : cross-validation strategy forwarded to ``learning_curve``.
    n_jobs : number of parallel jobs forwarded to ``learning_curve``.
    train_sizes : training-set sizes forwarded to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so the caller can call ``.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Accuracy Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One row per training size, one column per CV split.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Switch the grid on explicitly: a bare plt.grid() TOGGLES visibility, so
    # with a style that already shows a grid (seaborn "darkgrid") it would
    # turn the grid off.
    plt.grid(True)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1, color="g")

    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


# Learning curve on the training split only.
X, y = X_train, y_train
title = "Learning Curves (Logistic Regression))"
cv = ShuffleSplit(test_size=0.2, random_state=0)
# NOTE(review): a fresh default LogisticRegression is plotted here, not the
# tuned `log_model` - confirm that this is intended.
estimator = LogisticRegression()

plot_learning_curve(estimator, title, X, y, ylim=(0.5, 1.02), cv=cv, n_jobs=4)
plt.show()





# Naive Bayes (NB)
# Baseline Gaussian Naive Bayes fitted on the training split and scored on
# both splits.

naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
pred = naive_bayes.predict(X_test)
print(f"Train Accuracy : {naive_bayes.score(X_train, y_train):.3f}")
print(f"Test Accuracy : {naive_bayes.score(X_test, y_test):.3f}")

# The next two results are bare expressions: they are only displayed when
# they are the last statement of a notebook cell.
accuracy_score(y_test, pred)
confusion_matrix(y_test, pred)
print(classification_report(y_test, pred))

# Best Parameter
# 10-fold cross-validated search over 100 log-spaced var_smoothing values
# between 1 and 1e-9, scored by accuracy.
nb_gs = GaussianNB()
parameters = {
    "var_smoothing": np.logspace(0, -9, num = 100), 
}
grid_search = GridSearchCV(estimator = nb_gs, 
                           param_grid = parameters, 
                           scoring = "accuracy", 
                           cv = 10, 
                           n_jobs = -1)

grid_search.fit(X_train, y_train)
nb_gs.get_params().keys()
grid_search.best_params_
accuracy = grid_search.best_score_; accuracy
grid_search.best_estimator_

# The tuned model uses a hard-coded var_smoothing rather than
# grid_search.best_params_.
# NOTE(review): presumably copied from an earlier search run - confirm.
nb_model = GaussianNB(var_smoothing = 8.111308307896873e-06)
nb_model.fit(X_train, y_train)
print(f"Train Accuracy : {nb_model.score(X_train, y_train):.3f}")
print(f"Test Accuracy : {nb_model.score(X_test, y_test):.3f}")

pred = nb_model.predict(X_test)
print(confusion_matrix(y_test, pred))
print("===========================================================")
print(classification_report(y_test, pred))

### Learning Curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean training and cross-validation scores against training size.

    Opens a new figure, runs sklearn's ``learning_curve`` and draws, for both
    the training and the cross-validation scores, the mean across folds as a
    line and a +/- one standard deviation band.

    Parameters
    ----------
    estimator : estimator forwarded unchanged to ``learning_curve``.
    title : figure title.
    X, y : features and target forwarded to ``learning_curve``.
    ylim : optional ``(ymin, ymax)`` pair applied to the y-axis.
    cv : cross-validation strategy forwarded to ``learning_curve``.
    n_jobs : number of parallel jobs forwarded to ``learning_curve``.
    train_sizes : training-set sizes forwarded to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so the caller can call ``.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Accuracy Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One row per training size, one column per CV split.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Switch the grid on explicitly: a bare plt.grid() TOGGLES visibility, so
    # with a style that already shows a grid (seaborn "darkgrid") it would
    # turn the grid off.
    plt.grid(True)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1, color="g")

    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


# Learning curve on the training split only.
X, y = X_train, y_train

title = "Learning Curves (Naive Bayes))"
cv = ShuffleSplit(test_size=0.2, random_state=0)

# NOTE(review): a fresh default GaussianNB is plotted here, not the tuned
# `nb_model` - confirm that this is intended.
estimator = GaussianNB()
plot_learning_curve(estimator, title, X, y, ylim=(0.5, 1.02), cv=cv, n_jobs=4)
plt.show()





# Support vector machines
# Baseline SVC with default settings, fitted on the training split.

svc_classifier = SVC()
svc_classifier.fit(X_train, y_train)

y_pred = svc_classifier.predict(X_test)
# Bare expression: only displayed when it ends a notebook cell.
confusion_matrix(y_test, y_pred)
print(f"Train Accuracy : {svc_classifier.score(X_train, y_train):.3f}")
print(f"Test Accuracy : {svc_classifier.score(X_test, y_test):.3f}")
print(classification_report(y_test, y_pred))

# Best Parameter
# 5-fold cross-validated search over kernel and polynomial degree, scored by
# accuracy.
svm_gs = SVC()
parameters = {
    'kernel': ['rbf','poly'], 
    'degree': [1, 2, 3, 4, 5, 6, 7]
}
grid_search = GridSearchCV(estimator = svm_gs, 
                           param_grid = parameters, 
                           scoring = "accuracy", 
                           cv = 5, 
                           n_jobs = -1)

grid_search.fit(X_train, y_train)
grid_search.best_params_
accuracy = grid_search.best_score_; accuracy
grid_search.best_estimator_

# The tuned model uses hard-coded values rather than grid_search.best_params_.
# NOTE(review): presumably copied from an earlier search run - confirm.
svm_model = SVC(degree=6, kernel='poly')
svm_model.fit(X_train, y_train)

print(f"Train Accuracy : {svm_model.score(X_train, y_train):.3f}")
print(f"Test Accuracy : {svm_model.score(X_test, y_test):.3f}")

pred = svm_model.predict(X_test)

print(confusion_matrix(y_test, pred))
print("===========================================================")
print(classification_report(y_test, pred))

### Learning Curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean training and cross-validation scores against training size.

    Opens a new figure, runs sklearn's ``learning_curve`` and draws, for both
    the training and the cross-validation scores, the mean across folds as a
    line and a +/- one standard deviation band.

    Parameters
    ----------
    estimator : estimator forwarded unchanged to ``learning_curve``.
    title : figure title.
    X, y : features and target forwarded to ``learning_curve``.
    ylim : optional ``(ymin, ymax)`` pair applied to the y-axis.
    cv : cross-validation strategy forwarded to ``learning_curve``.
    n_jobs : number of parallel jobs forwarded to ``learning_curve``.
    train_sizes : training-set sizes forwarded to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so the caller can call ``.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Accuracy Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One row per training size, one column per CV split.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Switch the grid on explicitly: a bare plt.grid() TOGGLES visibility, so
    # with a style that already shows a grid (seaborn "darkgrid") it would
    # turn the grid off.
    plt.grid(True)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1, color="g")

    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


# Learning curve of the tuned SVM, computed on the training split only.
X, y = X_train, y_train

title = "Learning Curves (SVM)"
cv = ShuffleSplit(test_size=0.2, random_state=0)
estimator = svm_model
plot_learning_curve(estimator, title, X, y, ylim=(0.5, 1.02), cv=cv, n_jobs=4)

plt.show()





# Random Forest
# Baseline forest with default hyper-parameters and a fixed seed.

rfc_classifier = RandomForestClassifier(n_jobs= -1, oob_score= False, random_state= 0)
rfc_classifier.fit(X_train, y_train)

pred = rfc_classifier.predict(X_test)
# Bare expressions: only displayed when they end a notebook cell.
accuracy_score(y_test, pred)
confusion_matrix(y_test, pred)

print(f"Train Accuracy : {rfc_classifier.score(X_train, y_train):.3f}")
print(f"Test Accuracy : {rfc_classifier.score(X_test, y_test):.3f}")
print(classification_report(y_test, pred))

## Applying GridSearchCV
# The grid below has 6 * 7 * 4 * 4 * 6 * 2 = 8064 combinations, each
# evaluated with 4-fold cross-validation, so this cell is slow to run.
grid = {
    'n_estimators': [50, 60, 70, 75, 80, 90], 
    'max_depth': [5, 7, 8, 9, 10, 12, 15], 
    'max_features': [2, 3, 4, 5], 
    'min_samples_leaf' :[1, 2, 3, 4], 
    'min_samples_split': [2, 4, 5, 6, 7, 8], 
    'criterion': ["gini", "entropy"]
}

grid_search_1 = GridSearchCV(estimator = rfc_classifier, 
                           param_grid = grid, 
                           scoring = "accuracy", 
                           verbose = 2,
                           cv = 4)

grid_search_1 = grid_search_1.fit(X_train, y_train)
grid_search_1.best_estimator_
accuracy = grid_search_1.best_score_; accuracy
grid_search_1.best_params_

# The tuned forest uses hard-coded values rather than
# grid_search_1.best_params_, and rebinds `rfc_classifier`.
# NOTE(review): presumably copied from an earlier search run - confirm.
rfc_classifier = RandomForestClassifier(criterion='entropy', max_depth=12, max_features=5,
                       min_samples_leaf=2, min_samples_split=7, n_estimators=80,
                       n_jobs=-1, random_state=0)

rfc_classifier.fit(X_train, y_train)

# Model Evaluation
print(f"Train Accuracy : {rfc_classifier.score(X_train, y_train):.3f}")
print(f"Test Accuracy : {rfc_classifier.score(X_test, y_test):.3f}")

pred = rfc_classifier.predict(X_test)
print(confusion_matrix(y_test, pred))
print("===========================================================")
print(classification_report(y_test, pred))

### Learning Curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean training and cross-validation scores against training size.

    Opens a new figure, runs sklearn's ``learning_curve`` and draws, for both
    the training and the cross-validation scores, the mean across folds as a
    line and a +/- one standard deviation band.

    Parameters
    ----------
    estimator : estimator forwarded unchanged to ``learning_curve``.
    title : figure title.
    X, y : features and target forwarded to ``learning_curve``.
    ylim : optional ``(ymin, ymax)`` pair applied to the y-axis.
    cv : cross-validation strategy forwarded to ``learning_curve``.
    n_jobs : number of parallel jobs forwarded to ``learning_curve``.
    train_sizes : training-set sizes forwarded to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so the caller can call ``.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Accuracy Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One row per training size, one column per CV split.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Switch the grid on explicitly: a bare plt.grid() TOGGLES visibility, so
    # with a style that already shows a grid (seaborn "darkgrid") it would
    # turn the grid off.
    plt.grid(True)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1, color="g")

    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


# Learning curve of the tuned random forest, computed on the training split only.
X, y = X_train, y_train

title = "Learning Curves (Random Forest)"
cv = ShuffleSplit(test_size=0.2, random_state=0)
estimator = rfc_classifier
plot_learning_curve(estimator, title, X, y, ylim=(0.5, 1.02), cv=cv, n_jobs=4)

plt.show()





# Decision trees
# Baseline tree with default settings. No random_state is passed to any tree
# in this section.
# NOTE(review): results may therefore differ between runs - confirm whether a
# fixed seed is wanted.

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)

pred = dtree.predict(X_test)
print(f"Train Accuracy : {dtree.score(X_train, y_train):.3f}")
print(f"Test Accuracy : {dtree.score(X_test, y_test):.3f}")
print(confusion_matrix(y_test, pred))
print("===========================================================")
print(classification_report(y_test, pred))

# Best Parameter
# 5-fold cross-validated search over split criterion and min_samples_split,
# scored by accuracy.
dtree_gs = DecisionTreeClassifier()

parameters = {
    'criterion': ['gini','entropy'], 
    'min_samples_split': [2 ,4 ,6 ,8 ,10 ,15]
}

grid_search = GridSearchCV(estimator = dtree_gs, 
                           param_grid = parameters, 
                           scoring = "accuracy", 
                           cv = 5, 
                           n_jobs = -1)
grid_search.fit(X_train, y_train)

grid_search.best_params_
accuracy = grid_search.best_score_; accuracy
grid_search.best_estimator_

# The tuned tree uses a hard-coded value rather than grid_search.best_params_.
# NOTE(review): presumably copied from an earlier search run - confirm.
dtree_model = DecisionTreeClassifier(min_samples_split=15)
dtree_model.fit(X_train, y_train)

print(f"Train Accuracy : {dtree_model.score(X_train, y_train):.3f}")
print(f"Test Accuracy : {dtree_model.score(X_test, y_test):.3f}")

pred = dtree_model.predict(X_test)
print(confusion_matrix(y_test, pred))
print("===========================================================")
print(classification_report(y_test, pred))

### Learning Curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean training and cross-validation scores against training size.

    Opens a new figure, runs sklearn's ``learning_curve`` and draws, for both
    the training and the cross-validation scores, the mean across folds as a
    line and a +/- one standard deviation band.

    Parameters
    ----------
    estimator : estimator forwarded unchanged to ``learning_curve``.
    title : figure title.
    X, y : features and target forwarded to ``learning_curve``.
    ylim : optional ``(ymin, ymax)`` pair applied to the y-axis.
    cv : cross-validation strategy forwarded to ``learning_curve``.
    n_jobs : number of parallel jobs forwarded to ``learning_curve``.
    train_sizes : training-set sizes forwarded to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so the caller can call ``.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Accuracy Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One row per training size, one column per CV split.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Switch the grid on explicitly: a bare plt.grid() TOGGLES visibility, so
    # with a style that already shows a grid (seaborn "darkgrid") it would
    # turn the grid off.
    plt.grid(True)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1, color="g")

    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


# Learning curve of the tuned decision tree, computed on the training split only.
X, y = X_train, y_train

title = "Learning Curves (Decision Trees)"
cv = ShuffleSplit(test_size=0.2, random_state=0)
estimator = dtree_model
plot_learning_curve(estimator, title, X, y, ylim=(0.5, 1.02), cv=cv, n_jobs=4)

plt.show()





# BUILDING THE BEST MODEL
# Re-creates and re-fits the random forest with the same hard-coded
# hyper-parameters used for the tuned forest in the Random Forest section.

rfc_classifier = RandomForestClassifier(criterion='entropy', max_depth=12, max_features=5,
                       min_samples_leaf=2, min_samples_split=7, n_estimators=80,
                       n_jobs=-1, random_state=0)

rfc_classifier.fit(X_train, y_train)

# Model Evaluation
print(f"Train Accuracy : {rfc_classifier.score(X_train, y_train):.3f}")
print(f"Test Accuracy : {rfc_classifier.score(X_test, y_test):.3f}")

# Prediction of Test
# Keep only the second column of the predicted class probabilities.
# NOTE(review): with Risk_Status coded 1 = bad this is presumably the
# probability of a bad risk - confirm against rfc_classifier.classes_.
pred_proba = rfc_classifier.predict_proba(X_test)[:, 1]
pred_proba

# Feature Importance
# One row per feature, ordered from most to least important.
X_test.columns
feat_imp = pd.DataFrame({"Variable": X_test.columns, 
                         "Importance": rfc_classifier.feature_importances_}).sort_values(
    by = "Importance", ascending= False).reset_index(drop = True)
feat_imp
# Re-sorted ascending before the horizontal bar plot is drawn; positional
# arguments are x, y and kind.
feat_imp.sort_values("Importance").plot("Variable", "Importance", "barh", figsize = (10, 5))


# DECISION MAKING

### ROC curve
def plot_roc_curve(fpr, tpr):
    """Draw a ROC curve on the current axes and show the figure.

    Parameters
    ----------
    fpr : false positive rates, plotted on the x-axis.
    tpr : true positive rates, plotted on the y-axis.

    A dashed diagonal from (0, 0) to (1, 1) is drawn as a reference line.
    Returns nothing; the figure is displayed with ``plt.show()``.
    """
    axes = plt.gca()
    diagonal = [0, 1]

    axes.plot(fpr, tpr, color='blue', label='ROC')
    axes.plot(diagonal, diagonal, color='darkblue', linestyle='--')

    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('Receiver Operating Characteristic (ROC) Curve')
    axes.legend()

    plt.show()
# ROC points of the random forest's predicted probabilities on the test split.
fpr, tpr, thresholds = roc_curve(y_test, pred_proba)
print(tpr)
print(fpr)
print(thresholds)

print(f"ROC-AUC: {roc_auc_score(y_test, pred_proba):.3f}")

# Pick the threshold that maximises tpr - fpr (Youden's J statistic).
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(f"Threshold value is: {optimal_threshold:.3f}")

plot_roc_curve(fpr, tpr)

#### acceptance rate;
df_accept = pd.DataFrame({"Actual Test": y_test, "Probability of default": pred_proba})
df_accept

# 1 = predicted default when the probability is strictly above the threshold.
# NOTE(review): roc_curve counts a score equal to the threshold as positive,
# while `>` here treats it as negative; `n_approved` uses the matching `<=`,
# so change both together if the boundary should follow roc_curve.
df_accept['Prediction'] = np.where(df_accept['Probability of default'] > optimal_threshold, 1, 0)
df_accept

df_accept["accept or reject"] = np.where(df_accept["Prediction"] == 1, "reject", "accept")
df_accept = df_accept.reset_index(drop=True)
df_accept.head()
df_accept.tail()

# Confusion table as counts, then as shares of the test set.
pd.crosstab(df_accept['Actual Test'], df_accept['Prediction'], rownames=['Actual'], colnames=['Predicted'])
pd.crosstab(df_accept['Actual Test'], df_accept['Prediction'], rownames=['Actual'], colnames=['Predicted']) / df_accept.shape[0]

actual = df_accept["Actual Test"]
thresh_pred = df_accept["Prediction"]

print(classification_report(actual, thresh_pred))
print(confusion_matrix(actual, thresh_pred))

# Setting Cutoffs
fpr.shape

df_cutoffs = pd.DataFrame({'thresholds': thresholds, 'fpr': fpr, 'tpr': tpr})
df_cutoffs.head()

# Let the first threshold (row 0 of the thresholds column) be a number very
# close to, but smaller than, 1: 1 - 1 / 10 ^ 16.
# .loc writes into df_cutoffs itself; the chained form
# df_cutoffs['thresholds'][0] = ... assigns to a temporary Series under pandas
# Copy-on-Write and is lost. The float literal also avoids np.power(10, 16),
# which overflows where numpy's default integer is 32-bit.
df_cutoffs.loc[0, 'thresholds'] = 1 - 1e-16

df_cutoffs.head()
df_cutoffs.tail()
df_accept.head()

X_test.shape

def n_approved(p):
    """Count the rows of ``df_accept`` approved at cutoff ``p``.

    A row counts as approved when its 'Probability of default' is less than
    or equal to ``p``. Each such row is flagged with 1 (all others with 0)
    and the flags are summed.

    Parameters
    ----------
    p : probability threshold to compare against.

    Returns
    -------
    The number of rows whose estimated probability does not exceed ``p``.
    """
    within_cutoff = df_accept['Probability of default'] <= p
    approval_flags = np.where(within_cutoff, 1, 0)
    return approval_flags.sum()

# Approval / rejection counts and rates for every ROC threshold.
# 'N Approved' is n_approved() evaluated at each threshold.
df_cutoffs['N Approved'] = df_cutoffs['thresholds'].apply(n_approved)
# Rejected = number of test rows minus the approved ones.
df_cutoffs['N Rejected'] = X_test.shape[0] - df_cutoffs['N Approved']
# Approval rate = approved count divided by the number of rows in df_accept.
df_cutoffs['Approval Rate'] = df_cutoffs['N Approved'] / df_accept['Probability of default'].shape[0]
df_cutoffs['Rejection Rate'] = 1 - df_cutoffs['Approval Rate']

# Display the table from row position 40 onwards.
df_cutoffs.iloc[40: , ]


# Comparison of the different evaluation metrics for each algorithm

def bar_plot(ax, data, colors=None, total_width=0.8, single_width=1, legend=True):
    """Draw a grouped bar chart on ``ax``.

    Parameters
    ----------
    ax : axes object providing ``bar`` and ``legend``.
    data : mapping of series name -> sequence of values. Value ``k`` of every
        series is drawn in group ``k``, centred on x = ``k``.
    colors : optional sequence of colours, cycled over the series. Defaults to
        the colours of matplotlib's current property cycle.
    total_width : width occupied by one whole group of bars.
    single_width : fraction of its slot that each bar fills (1 = bars touch).
    legend : when true, add a legend with one entry per drawn series.

    Returns
    -------
    None. With empty ``data`` nothing is drawn (previously a
    ZeroDivisionError); a series without values is skipped and gets no legend
    entry (previously it reused the handle of the previous series, or failed
    if it was the first one).
    """
    # Number of bars per group
    n_bars = len(data)
    if n_bars == 0:
        return

    if colors is None:
        colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

    # The width of a single bar
    bar_width = total_width / n_bars

    # Handles and labels of the drawn series, used for the legend
    handles = []
    labels = []

    for i, (name, values) in enumerate(data.items()):
        values = list(values)
        if not values:
            continue

        # The offset in x direction of this series within each group
        x_offset = (i - n_bars / 2) * bar_width + bar_width / 2
        positions = [x + x_offset for x in range(len(values))]

        # Draw all bars of the series in a single call
        drawn = ax.bar(positions, values, width=bar_width * single_width,
                       color=colors[i % len(colors)])

        handles.append(drawn[0])
        labels.append(name)

    # Draw legend if we need
    if legend:
        ax.legend(handles, labels)


if __name__ == "__main__":
    # Usage example: grouped bar chart comparing the evaluation metrics of the
    # six models. Position k in every list belongs to model k of the tick
    # labels below (KNN, LR, NB, SVM, RF, CART). The figures are hard-coded.
    data = {
        "Accuracy": [0.72, 0.69, 0.71, .69, .74, .70], 
        "Error" : [0.28, 0.31, 0.29, .31, .26, .30], 
        # NOTE(review): the LR recall of 0.7 does not fit its precision (0.62)
        # and F1-score (.13); 0.07 would give F1 ~ 0.13 - confirm the value.
        "Recall": [0.23, 0.7, 0.25, .17, .42, .43], 
        "Specificity": [0.95, 0.98, 0.93, .94, .90, .83],
        "Precision": [0.88, 0.62, 0.69, .57, .32, .44],
        "F1-Score": [.34, .13, 0.36, .26, .51, .48]
}

    fig, ax = plt.subplots()
    bar_plot(ax, data, total_width=.8, single_width=.9)
    
    # One tick per model, placed at the group centres 0..5.
    # Note: this rebinds the module-level name X, which earlier sections use
    # for the feature matrix.
    X = ['KNN','LR','NB', "SVM", "RF", "CART"]
    X_axis = np.arange(len(X))
    plt.xticks(X_axis, X)
    
    plt.show()
