# Classification to categorize credit applications

In [None]:
#Import all the necessary libraries
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from scipy.stats import randint
from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import seaborn as sns
import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve,classification_report,accuracy_score
from matplotlib import pyplot
%matplotlib inline

In [None]:
# Load the application and credit-record CSVs from a local project folder.
# DATA_DIR is the single place to edit when the data lives somewhere else.
DATA_DIR = "C:/New folder/SecondSpring/Python/Project/credit-card-approval-prediction"
df_apprec = pd.read_csv(DATA_DIR + "/application_record.csv")
df_credrec = pd.read_csv(DATA_DIR + "/credit_record.csv")

In [None]:
# Shape of the dataframes
print('Size of the Application dataset is', df_apprec.shape)
print('Size of the credit record dataset is', df_credrec.shape)

In [None]:
#unique values
print(df_apprec.nunique())
print(df_apprec.head())

In [None]:
#Bar chart of how often each STATUS code occurs in the credit record
plt.figure(figsize=(15, 8))
sns.countplot(data=df_credrec, x='STATUS')

In [None]:
# Share of credit-record rows carrying each STATUS code, in percent.
# value_counts() is indexed by the status code, so iterate over (code, count)
# pairs; looking a count up by position printed its rank instead of the code
# and returned the wrong entry whenever two codes had the same count.
a = df_credrec['STATUS'].value_counts()
for status, count in a.items():
    print(status, ":", round(count / len(df_credrec) * 100, 2), "%")

In [None]:
#Percentage of missing values in each application column, largest first
# count() only counts non-missing cells, so 100 minus its share is the missing share
missing_values = 100 - (df_apprec.count() / len(df_apprec) * 100)
print(missing_values.round().sort_values(ascending=False))

In [None]:
#Outlier plot for days of birth
import seaborn as sns
sns.boxplot(x=df_apprec['DAYS_BIRTH'])

In [None]:
#Outlier plot for day employed 
sns.boxplot(x=df_apprec['DAYS_EMPLOYED'])

In [None]:
df_apprec.DAYS_EMPLOYED.max()

In [None]:
#Outlier plot for Income
import seaborn as sns
sns.boxplot(x=df_apprec['AMT_INCOME_TOTAL'])

In [None]:
df_apprec.AMT_INCOME_TOTAL.max()

In [None]:
#Outlier removal for income total
df_apprec = df_apprec[df_apprec['AMT_INCOME_TOTAL'] <= 1000000] 
#new_data.AMT_INCOME_TOTAL.max()

In [None]:
sns.set_style("whitegrid") 

In [None]:
#Checking column data types: list the float, integer and object columns.
# The lists are printed explicitly because only the last expression of a cell
# is echoed; as bare expressions the first three results were thrown away.
print(df_apprec.select_dtypes(include=[np.float64]).columns)
print(df_apprec.select_dtypes(include=[np.int64]).columns)
# np.object was removed from NumPy; the builtin object selects the same dtype
print(df_apprec.select_dtypes(include=[object]).columns)
# Count of missing values per column
df_apprec.isnull().sum()

In [None]:
#Correlation matrix for the numeric Application columns
# Restrict to numeric columns explicitly: corr() on a frame that still holds
# string columns raises in pandas 2.x.
correlations = df_apprec.select_dtypes(include=[np.number]).corr()
# Label the axes with the columns that are actually in the matrix; taking the
# names from the full frame shifts the labels whenever a non-numeric column
# sits between numeric ones.
names = list(correlations.columns)
# plot correlation matrix
fig = plt.figure(figsize=(15,8))
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
# one tick per correlated column instead of a hard-coded count
ticks = np.arange(len(names))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
plt.show()

In [None]:
#Correlation matrix for the numeric Credit record columns
# Restrict to numeric columns explicitly: corr() on a frame that still holds
# string columns raises in pandas 2.x.
correlations = df_credrec.select_dtypes(include=[np.number]).corr()
# Axis labels and tick count are derived from the matrix itself so they
# always line up with the cells being drawn.
names = list(correlations.columns)
# plot correlation matrix
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(len(names))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
plt.show()

In [None]:
# Scatter-plot matrix of the credit record dataframe
names = df_credrec.columns.tolist()
pd.plotting.scatter_matrix(df_credrec)
plt.show()

In [None]:
#Aggregating the credit record: status codes 2,3,4,5 are recoded to "1" and
#X, C to "0" (existing "0" and "1" values are left as they are).
# Assign the result back to the column: replace(..., inplace=True) on a
# selected column returns None and, with pandas copy-on-write, does not
# update df_credrec at all.
df_credrec["STATUS"] = df_credrec["STATUS"].replace(
    {"X": "0", "C": "0", "2": "1", "3": "1", "4": "1", "5": "1"}
)
# dfCr used to end up as None; it now refers to the recoded column
dfCr = df_credrec["STATUS"]

In [None]:
#Plot of good and bad applications in the given data
sns.countplot(x='STATUS', data=df_credrec)

In [None]:
#Aggrgating the user IDs based on the maximum value of the staus value
uniq_users=pd.DataFrame(df_credrec.groupby('ID').agg({'STATUS':'max'}))

In [None]:
#Merge the application data with the user credit status data
new_data=pd.merge(df_apprec,uniq_users,how="inner",on="ID") #merge to record data

In [None]:
#Give the application columns shorter names (old name -> new name), in place
new_data.rename(
    columns=dict(
        CODE_GENDER='Gender',
        FLAG_OWN_CAR='Car',
        FLAG_OWN_REALTY='Reality',
        CNT_CHILDREN='ChldNo',
        AMT_INCOME_TOTAL='inc',
        NAME_EDUCATION_TYPE='edutp',
        NAME_FAMILY_STATUS='famtp',
        NAME_HOUSING_TYPE='houtp',
        FLAG_EMAIL='email',
        NAME_INCOME_TYPE='inctp',
        FLAG_WORK_PHONE='wkphone',
        FLAG_PHONE='phone',
        CNT_FAM_MEMBERS='famsize',
        OCCUPATION_TYPE='occyp',
    ),
    inplace=True,
)

In [None]:
#Treat the literal string 'NULL' as missing, then drop every row that has a
#missing value in any column.
# The former stand-alone new_data.dropna() call discarded its result and had
# no effect, so it is removed; the dropna() below already covers real NaNs.
new_data = new_data.mask(new_data == 'NULL').dropna()

In [None]:
#One-hot encode the income, education, housing, family and occupation
#columns into five separate indicator dataframes
(one_hot_train_i,
 one_hot_train_e,
 one_hot_train_h,
 one_hot_train_f,
 one_hot_train_o) = (
    pd.get_dummies(new_data[category])
    for category in ('inctp', 'edutp', 'houtp', 'famtp', 'occyp')
)


In [None]:
#Recoding the data types
# Car / Reality: compare against 'Y' instead of astype('bool'). Casting a
# non-empty string to bool is always True, so 'Y' and 'N' both became True
# and the two features carried no information.
# NOTE(review): assumes the raw flags are the strings 'Y'/'N' - confirm
# against the CSV.
new_data['Car'] = new_data['Car'].eq('Y')
new_data['Reality'] = new_data['Reality'].eq('Y')
# phone / email flags are kept as strings
new_data['wkphone']=new_data['wkphone'].astype('str')
new_data['phone']=new_data['phone'].astype('str')
new_data['email']=new_data['email'].astype('str')
# STATUS becomes a small unsigned integer
new_data['STATUS'] = new_data['STATUS'].astype(np.uint8)
# Gender: 0 for 'M', 1 for every other value
new_data['Gender'] = np.where(new_data['Gender'] != 'M', 1, 0)

In [None]:
#Dropping categorical variables and updating with one hot encoded vectors
new_data.drop(columns=['inctp', 'edutp','occyp','houtp','famtp'], axis = 1, inplace = True)
new_data = pd.concat([one_hot_train_i, new_data], axis = 1)
new_data = pd.concat([one_hot_train_e, new_data], axis = 1)
new_data = pd.concat([one_hot_train_h, new_data], axis = 1)
new_data = pd.concat([one_hot_train_f, new_data], axis = 1)
new_data = pd.concat([one_hot_train_o, new_data], axis = 1)

In [None]:
df_final=new_data

In [None]:
# Droping the wkphone, phone and email columns
y=new_data['STATUS']
X = new_data[['Gender','Car','Reality','ChldNo', 'inc','DAYS_BIRTH','DAYS_EMPLOYED','FLAG_MOBIL','famsize']]

In [None]:
X.info()

In [None]:
#Split the data into train (70%) and test (30%) parts.
# stratify=y keeps the class ratio of STATUS the same in both parts, and a
# fixed random_state makes the split - and every score computed from it -
# reproducible between runs. The seed matches the later split in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=10086)

In [None]:
#Logistic regression (balanced class weights): 20-fold cross-validation on
#the training split
Log_classifier = LogisticRegression(class_weight='balanced')
scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(Log_classifier, X_train, y_train, cv=20, scoring=scoring)
# Average every timing / metric array over the folds
(LR_fit_time, LR_score_time, LR_accuracy, LR_precision,
 LR_recall, LR_f1, LR_roc) = (
    scores[key].mean()
    for key in ('fit_time', 'score_time', 'test_accuracy',
                'test_precision_macro', 'test_recall_macro',
                'test_f1_weighted', 'test_roc_auc')
)
# NOTE(review): LR_cm is computed from the same ROC AUC array as LR_roc
LR_cm = scores['test_roc_auc'].mean()

In [None]:
#Confusion matrix for logistic regression
Log_classifier.fit(X_train, y_train)
y_pred=Log_classifier.predict(X_test)
confusion_matrix(y_test, y_pred)


In [None]:
#Precision recall curve for Logistic regression
ns_probs = [0 for _ in range(len(y_test))]
Log_classifier.fit(X_train, y_train)
lr_probs = Log_classifier.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
# plot the roc curve for the model
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()

In [None]:
#Decision tree: 20-fold cross-validation on the training split
decision_tree = DecisionTreeClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(decision_tree, X_train, y_train, cv=20, scoring=scoring)

# Average every timing / metric array over the folds
(dtree_fit_time, dtree_score_time, dtree_accuracy, dtree_precision,
 dtree_recall, dtree_f1, dtree_roc) = (
    scores[key].mean()
    for key in ('fit_time', 'score_time', 'test_accuracy',
                'test_precision_macro', 'test_recall_macro',
                'test_f1_weighted', 'test_roc_auc')
)


In [None]:
#Confusion Matrix for Decision tree model 
decision_tree.fit(X_train, y_train)
y_pred=decision_tree.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
#Precision recall curve for Decision tree
ns_probs = [0 for _ in range(len(y_test))]
decision_tree.fit(X_train, y_train)
lr_probs = decision_tree.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Decision Tree: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
# plot the roc curve for the model
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Decision tree')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()

In [None]:
#Support vector classifier: 20-fold cross-validation on the training split
from sklearn.svm import SVC
# probability=True is required for the predict_proba call used for the ROC curve
lin_clf = SVC(probability=True, random_state=42)
scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(lin_clf, X_train, y_train, cv=20, scoring=scoring)
# Average every timing / metric array over the folds
(sv_fit_time, sv_score_time, sv_accuracy, sv_precision,
 sv_recall, sv_f1, sv_roc) = (
    scores[key].mean()
    for key in ('fit_time', 'score_time', 'test_accuracy',
                'test_precision_macro', 'test_recall_macro',
                'test_f1_weighted', 'test_roc_auc')
)

In [None]:
#Confusion Matrix for SVM 
lin_clf.fit(X_train, y_train)
y_pred=lin_clf.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
#Precision recall curve for SVM
ns_probs = [0 for _ in range(len(y_test))]
lin_clf.fit(X_train, y_train)
lr_probs = lin_clf.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('SVM: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
# plot the roc curve for the model
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='SVM')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()

In [None]:
#Gradient boosting: 20-fold cross-validation on the training split
gdclf = GradientBoostingClassifier()
scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(gdclf, X_train, y_train, cv=20, scoring=scoring)
# Average every timing / metric array over the folds
(gb_fit_time, gb_score_time, gb_accuracy, gb_precision,
 gb_recall, gb_f1, gb_roc) = (
    scores[key].mean()
    for key in ('fit_time', 'score_time', 'test_accuracy',
                'test_precision_macro', 'test_recall_macro',
                'test_f1_weighted', 'test_roc_auc')
)

In [None]:
#Confusion Matrix for GB 
gdclf.fit(X_train, y_train)
y_pred=gdclf.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
#Precision recall curve for GB
ns_probs = [0 for _ in range(len(y_test))]
gdclf.fit(X_train, y_train)
lr_probs = gdclf.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Gradient Boost: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
# plot the roc curve for the model
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Gradient boost')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()

In [None]:
#Random forest: 20-fold cross-validation on the training split
random_forest = RandomForestClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(random_forest, X_train, y_train, cv=20, scoring=scoring)

# Average every timing / metric array over the folds
(forest_fit_time, forest_score_time, forest_accuracy, forest_precision,
 forest_recall, forest_f1, forest_roc) = (
    scores[key].mean()
    for key in ('fit_time', 'score_time', 'test_accuracy',
                'test_precision_macro', 'test_recall_macro',
                'test_f1_weighted', 'test_roc_auc')
)

In [None]:
#Confusion Matrix for random forest
random_forest.fit(X_train, y_train)
y_pred=random_forest.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
#Precision recall curve for Random forest
ns_probs = [0 for _ in range(len(y_test))]
random_forest.fit(X_train, y_train)
lr_probs = random_forest.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Random forest: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
# plot the roc curve for the model
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Random forest')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()

In [None]:
# Raw feature importances of the fitted random forest
print(random_forest.feature_importances_)
# Horizontal bar chart of the (at most) ten largest importances, labelled
# with the feature names taken from X
feat_importances = pd.Series(data=random_forest.feature_importances_, index=X.columns)
feat_importances.nlargest(n=10).plot.barh()
plt.show()

In [None]:
  #Model comparison: one row per model holding its cross-validated means
  models_rfecv = pd.DataFrame(
      [
          ('Logistic Regression', LR_fit_time, LR_score_time, LR_accuracy,
           LR_precision, LR_recall, LR_f1, LR_roc),
          ('Decision Tree', dtree_fit_time, dtree_score_time, dtree_accuracy,
           dtree_precision, dtree_recall, dtree_f1, dtree_roc),
          ('Random Forest', forest_fit_time, forest_score_time, forest_accuracy,
           forest_precision, forest_recall, forest_f1, forest_roc),
          ('Gradient Boost', gb_fit_time, gb_score_time, gb_accuracy,
           gb_precision, gb_recall, gb_f1, gb_roc),
          ('Support Vector', sv_fit_time, sv_score_time, sv_accuracy,
           sv_precision, sv_recall, sv_f1, sv_roc),
      ],
      columns=['Model', 'Fitting time', 'Scoring time', 'Accuracy',
               'Precision', 'Recall', 'F1_score', 'AUC_ROC'],
  )

  # Show the table with the most accurate model first (models_rfecv itself is not reordered)
  models_rfecv.sort_values(by='Accuracy', ascending=False)


In [None]:
# Number of rows per STATUS class in the final dataframe
df_final['STATUS'].value_counts()

Random forest was selected as the best model after the comparison. Because the data is imbalanced, the classes are rebalanced with SMOTE and the random forest is retrained on the balanced data and used for prediction.

In [None]:
def evalPerformance(test_algo_obtained_labels, test_labels):
    """Print and return accuracy, precision, recall and F-measure for binary labels.

    Label 1 is treated as the positive class and 0 as the negative class.
    Positions where either value is neither 0 nor 1 are not counted.

    Args:
        test_algo_obtained_labels: predicted labels (array-like); converted to float.
        test_labels: true labels (array-like) of the same length.

    Returns:
        Tuple ``(accuracy, precision, recall, f_measure)`` of floats. A metric
        whose denominator is zero is reported as 0.0.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    test_labels = np.asarray(test_labels)
    # np.float was removed from NumPy; the builtin float is the same dtype
    test_algo_obtained_labels = np.asarray(test_algo_obtained_labels).astype(float)
    if test_labels.shape != test_algo_obtained_labels.shape:
        raise ValueError("predicted and true labels must have the same shape")

    actual_pos = test_labels == 1.0
    actual_neg = test_labels == 0.0
    predicted_pos = test_algo_obtained_labels == 1.0
    predicted_neg = test_algo_obtained_labels == 0.0

    # a = true positives, b = false negatives, c = false positives, d = true negatives
    a = int(np.count_nonzero(actual_pos & predicted_pos))
    b = int(np.count_nonzero(actual_pos & predicted_neg))
    c = int(np.count_nonzero(actual_neg & predicted_pos))
    d = int(np.count_nonzero(actual_neg & predicted_neg))

    # every metric falls back to 0.0 when its denominator would be zero
    accuracy = float(a + d) / (a + b + c + d) if a + b + c + d else 0.0
    precision = a / float(a + c) if a + c else 0.0
    recall = a / float(a + b) if a + b else 0.0
    f_measure = 2 * a / float(2 * a + b + c) if 2 * a + b + c else 0.0

    print(" Accuracy is : " + str(accuracy))
    print(" Precision is : " + str(precision))
    print(" Recall is : " + str(recall))
    print(" F Measure is : " + str(f_measure))
    print()
    return accuracy, precision, recall, f_measure

In [None]:
#Inputs for SMOTE: the target and the same nine feature columns, taken from df_final
y1 = df_final.loc[:, 'STATUS']
X1 = df_final.loc[:, ['Gender', 'Car', 'Reality', 'ChldNo', 'inc',
                      'DAYS_BIRTH', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'famsize']]

In [None]:
from imblearn.over_sampling import SMOTE
X_balance,Y_balance = SMOTE().fit_sample(X1,y1)
X_balance = pd.DataFrame(X_balance,columns=X1.columns)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_balance,Y_balance, 
                                                    stratify=Y_balance, test_size=0.3,
                                                    random_state = 10086)

In [None]:
rf= RandomForestClassifier()
rf.fit(X_train,y_train)

In [None]:
y_pred=rf.predict(X_test)
evalPerformance(y_pred,y_test)
confusion_matrix(y_test, y_pred)

In [None]:
#ROC curve for the retrained random forest on the test split
# "No skill" baseline: the same score (0) for every test sample
ns_probs = [0] * len(y_test)
rf.fit(X_train, y_train)
# second column of predict_proba = probability of the positive outcome
lr_probs = rf.predict_proba(X_test)[:, 1]
# area under the ROC curve for baseline and model
ns_auc, lr_auc = roc_auc_score(y_test, ns_probs), roc_auc_score(y_test, lr_probs)
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Random forest: ROC AUC=%.3f' % (lr_auc))
# points of both ROC curves (the returned thresholds are not used)
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
# draw both curves on one labelled plot
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Random forest')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()