In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")
%matplotlib inline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
#print(os.listdir("../input"))
#Any results you write to the current directory are saved as output.

In [5]:
# Read the competition's training and test CSV files from the input directory.
train_data = pd.read_csv("../input/cs-training.csv")
test_data = pd.read_csv("../input/cs-test.csv")

In [6]:
# Show 10 randomly chosen rows of the test set.
test_data.sample(10)


Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
20965,20966,,0.021788,80,0,715.0,,15,0,1,0,0.0
9162,9163,,0.067046,36,0,3175.0,,6,0,2,0,
55594,55595,,0.058907,65,0,0.1404,7150.0,8,0,2,0,0.0
62780,62781,,0.51058,46,0,0.304205,11866.0,16,0,1,0,0.0
63743,63744,,0.893858,44,2,0.962679,3000.0,8,0,1,0,2.0
98692,98693,,0.039673,50,0,0.293658,7300.0,10,0,1,0,3.0
39572,39573,,0.001425,68,0,1619.0,,7,0,2,0,0.0
58329,58330,,0.348217,36,0,0.221395,4701.0,6,0,0,0,0.0
82553,82554,,0.043242,62,0,0.269097,5157.0,10,0,1,0,1.0
77963,77964,,0.060626,56,0,0.000484,4128.0,2,0,0,0,0.0


In [None]:
# Show 10 randomly chosen rows of the training set.
train_data.sample(10)

In [None]:
# Column dtypes and non-null counts of the training set.
train_data.info()

MonthlyIncome and NumberOfDependents have a large number of null values; we will handle those in a bit. Also, these features have inconsistent data types, so we will change them to int64. Let's look at the summary statistics of the features.

In [None]:
# Summary statistics of the training columns.
train_data.describe()

The age feature seems to have an outlier value of 0. I assume that it was not recorded, and we will impute it with the age median. Meanwhile, the features NumberOfTimes90DaysLate, NumberOfTime60-89DaysPastDueNotWorse and NumberOfTime30-59DaysPastDueNotWorse look like they give the same information, as do NumberOfOpenCreditLinesAndLoans and NumberRealEstateLoansOrLines. We will check their correlations with each other using the correlation matrix and find a way to make use of these features. Let's now have a sneak peek at our test data.

In [None]:
# Show 10 randomly chosen rows of the test set.
test_data.sample(10)

In [None]:
# Column dtypes and non-null counts of the test set.
test_data.info()

In [None]:
# Summary statistics of the test columns.
test_data.describe()

Unlike our training dataset, the minimum value of the age feature in our test set is 21. The MonthlyIncome and NumberOfDependents features also have null values, and we will handle those accordingly. Of course, SeriousDlqin2yrs has no values in the test set, since it is the target class we have to predict.

**Let's get our hands dirty!**

First, we will look at the distribution of our target class. SeriousDlqin2yrs to have some perspective about the problem.

In [None]:
# Bar chart of how many training rows fall into each SeriousDlqin2yrs class.
plt.figure(figsize=(10,8))
# The column must be passed as `x=`: since seaborn 0.12 the only positional
# argument is `data`, so the positional form raises a TypeError.
sns.countplot(x="SeriousDlqin2yrs", data=train_data)

There is a clear problem here: we have an **unbalanced target class!!** We will check the event rate of financial distress (SeriousDlqin2yrs) in our dataset.

In [None]:
# Count each target class once, then report the share of class 1 as a percentage.
target_counts = train_data["SeriousDlqin2yrs"].value_counts()
class_0 = target_counts[0]
class_1 = target_counts[1]
print("Total number of class_0: {}".format(class_0))
print("Total number of class_1: {}".format(class_1))
print("Event rate: {} %".format(class_1/(class_0+class_1) *100))

We have an event rate of **6.68%**. The most likely consequence of such an imbalanced target class is that the algorithm ignores the minority class and predicts class_0 for new instances, because that is the safest way to achieve high accuracy.
There are guides on how to handle this problem and what I found most useful was the article of Jason Brownlee [here.](https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/) This competition uses an evaluation metric AUC so we will work inline with this evaluation metric (i.e using ROC Curve to compare models).
After reading the article I have concluded ways on how to tackle the problem.
1.  Since we have a lot of data, over 100,000 training data set, we will consider using Resampling (Under-sampling to be exact) this strategy will randomly delete some of the instances of the majority class (class_0) to make it balanced. using the [imbalanced-learn module](https://github.com/scikit-learn-contrib/imbalanced-learn)
2. Using penalized models (penalized RF, Logit)
3. Considering ensemble models.


Back to the training set.
age feature has a 0 value in it, so we will locate the entry and impute it with the age median.

In [None]:
# List the training rows whose recorded age is below 18.
train_data.loc[train_data["age"] < 18] #less than legal age

only one instance, let's impute it right away.

In [None]:
# Overwrite the age == 0 entries with the median age of the training set.
train_data.loc[train_data["age"] == 0, "age"] = train_data.age.median()

We're done with the age feature; now we will impute the missing values of the MonthlyIncome feature. We will tackle this by splitting borrowers into two age brackets: working (18 to 59) and senior (60 and above). First, let's create temporary dataframes for them, compute each bracket's mean MonthlyIncome, and then impute the missing values accordingly.

In [None]:
# Temporary frames for the two age brackets: 18-59 ("working") and 60+ ("senior").
age_working = train_data[train_data["age"].ge(18) & train_data["age"].lt(60)]
age_senior = train_data[train_data["age"].ge(60)]

# Mean MonthlyIncome per bracket (nulls are skipped by .mean()); used below as fill values.
age_working_impute = age_working["MonthlyIncome"].mean()
age_senior_impute = age_senior["MonthlyIncome"].mean()

We will take the absolute value of MonthlyIncome, fill its null values with the placeholder 99999, change the data type to int64, and then replace each placeholder with the mean MonthlyIncome of the corresponding age bracket.

In [None]:
# Replace MonthlyIncome by its absolute value (nulls stay null).
train_data["MonthlyIncome"] = np.absolute(train_data["MonthlyIncome"])

In [None]:
# Mark missing incomes with the placeholder value 99999.
# NOTE(review): a genuine income of exactly 99999 becomes indistinguishable
# from the placeholder and would be overwritten by the imputation step.
train_data["MonthlyIncome"] = train_data["MonthlyIncome"].fillna(99999)

In [None]:
# Store MonthlyIncome as int64.
train_data["MonthlyIncome"] = train_data["MonthlyIncome"].astype('int64')

In [None]:
# Swap the 99999 placeholder for the mean income of the borrower's age bracket:
# first the 18-59 bracket, then the 60+ bracket.
train_data.loc[
    train_data["age"].ge(18) & train_data["age"].lt(60) & train_data["MonthlyIncome"].eq(99999),
    "MonthlyIncome"
] = age_working_impute
train_data.loc[
    train_data["age"].ge(60) & train_data["MonthlyIncome"].eq(99999),
    "MonthlyIncome"
] = age_senior_impute

In [None]:
# Check: re-inspect dtypes and non-null counts after the MonthlyIncome imputation.
train_data.info()

In [None]:
# Rows whose MonthlyIncome still equals the 99999 placeholder after imputation.
train_data.loc[train_data["MonthlyIncome"] == 99999]

We're done with the Monthly Income, now we will move to the NumberOfDependents feature.

In [None]:
# NumberOfDependents: absolute value, nulls replaced by 0, stored as int64.
train_data["NumberOfDependents"] = (
    train_data["NumberOfDependents"]
    .abs()
    .fillna(0)
    .astype('int64')
)

In [None]:
# Frequency of each NumberOfDependents value after cleaning.
train_data.NumberOfDependents.value_counts()

I decided not to go through each value of the NumberOfDependents feature; the missing entries are simply filled with 0, which I take to be the mode. We will now take a look at the correlation of the features to the target variable.

In [None]:
# Pairwise correlation matrix of the training columns, drawn as an annotated heatmap.
corr = train_data.corr()
plt.figure(figsize=(14,12))
sns.heatmap(corr, annot=True, fmt=".2g")

**Findings**: As expected, NumberOfTimes90DaysLate, NumberOfTime60-89DaysPastDueNotWorse and NumberOfTime30-59DaysPastDueNotWorse are highly correlated with each other, and keeping all of them won't help the predictive power of the algorithms (we want to avoid multicollinearity). I came up with two ways to handle this: keep one feature and drop the other two, or combine the three features into a binary feature that flags whether a borrower defaulted on any loan/credit payment. Also, the NumberOfOpenCreditLinesAndLoans and NumberRealEstateLoansOrLines features are somewhat correlated with each other but have different degrees of correlation with our target class; we can handle these features the same way as the pastdue/late features.

We will go with feature engineering the pastdue/late features (because a default is a default!) and credit/loans features but providing a buffer since debts are everywhere!

In [None]:
# Total number of late-payment events across the 90+, 60-89 and 30-59 day buckets.
train_data["CombinedDefaulted"] = (
    train_data["NumberOfTimes90DaysLate"]
    + train_data["NumberOfTime60-89DaysPastDueNotWorse"]
    + train_data["NumberOfTime30-59DaysPastDueNotWorse"]
)

In [None]:
# Cap the summed count at 1 so the column becomes a "has any late payment" flag.
train_data.loc[(train_data["CombinedDefaulted"] >= 1), "CombinedDefaulted"] = 1

In [None]:
# Open credit lines/loans plus real-estate loans/lines, per borrower.
train_data["CombinedCreditLoans"] = train_data[
    "NumberOfOpenCreditLinesAndLoans"
].add(train_data["NumberRealEstateLoansOrLines"])

In [None]:
# Binarise the total with a buffer of 5: totals of 5 or fewer -> 0, more than 5 -> 1.
# NOTE: this cell is not idempotent. Once the column holds only 0/1, running it
# again maps every value to 0; rebuild CombinedCreditLoans first if you re-run it.
train_data.loc[(train_data["CombinedCreditLoans"] <= 5), "CombinedCreditLoans"] = 0
train_data.loc[(train_data["CombinedCreditLoans"] > 5), "CombinedCreditLoans"] = 1

In [None]:
# How many borrowers fall on each side of the CombinedCreditLoans threshold.
train_data.CombinedCreditLoans.value_counts()

Next, we will create a binary feature WithDependents which is derived from the NumberOfDependents feature. Also, from the description of the data DebtRatio = Monthly debt payments / monthly gross income. we will extract MonthlyDebtPayments from this formula to get a new feature.

In [None]:
# Start from the dependents count, then cap it at 1 to get a "has dependents" flag.
train_data["WithDependents"] = train_data["NumberOfDependents"]
train_data.loc[(train_data["WithDependents"] >= 1), "WithDependents"] = 1

In [None]:
# Frequency of each WithDependents value.
train_data.WithDependents.value_counts()

In [None]:
# Recover monthly debt payments from DebtRatio * MonthlyIncome, as a non-negative int64.
train_data["MonthlyDebtPayments"] = (
    (train_data["DebtRatio"] * train_data["MonthlyIncome"])
    .abs()
    .astype('int64')
)

In [None]:
# Store age and MonthlyIncome as int64; any fractional part left by the imputed
# values is dropped by the cast.
train_data["age"] = train_data["age"].astype('int64')
train_data["MonthlyIncome"] = train_data["MonthlyIncome"].astype('int64')

Also, let's see if we can get a good predictor out of age feature. using senior and working temporary dataframes earlier.

In [None]:
# Encode the age bracket as a numeric code: 1 for ages 18-59, 0 for ages 60+.
# Both masks test the untouched "age" column, so the codes do not interfere
# with each other. Ages below 18 would keep their raw age value in age_map.
train_data["age_map"] = train_data["age"]
train_data.loc[(train_data["age"] >= 18) & (train_data["age"] < 60), "age_map"] = 1
train_data.loc[(train_data["age"] >= 60), "age_map"] = 0 

In [None]:
# Replace the numeric bracket codes with labels so they can be one-hot encoded.
# The codes were assigned as 1 = ages 18-59 and 0 = ages 60+, so 1 is "working"
# and 0 is "senior" (the labels used to be swapped). A single dict-based replace
# maps both codes in one pass.
train_data["age_map"] = train_data["age_map"].replace({1: "working", 0: "senior"})

In [None]:
# One-hot encode age_map into "is_<label>" columns and append them to train_data.
train_data = pd.concat([train_data, pd.get_dummies(train_data.age_map,prefix='is')], axis=1)

Now let's look at the correlation matrix to decide to retain or drop the engineered features (avoiding multicollinearity).

In [None]:
# Correlation heatmap including the engineered features.
# train_data still contains the string column age_map here, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, so only numeric and boolean
# columns are correlated (older pandas silently skipped the string column).
corr = train_data.select_dtypes(include=["number", "bool"]).corr()
plt.figure(figsize=(14,12))
sns.heatmap(corr, annot=True, fmt=".2g")

Findings: 
* we will retain CombinedDefaulted feature as it clearly a good predictor of our target class than the three features it was derived from.
* we will retain NumberOfTime30-59DaysPastDueNotWorse and drop the other two features that CombinedDefaulted was derived from, as it gives more meaningful information about our target variable (also, it looks like this is the medium range of time by which a borrower is late on a payment)
* we will drop the engineered is_working and is_senior feature since age feature outperforms them.
* we will drop also the WithDependents
* we will retain CombinedCreditLoans also since it outperforms the two features it came from.
* we will drop MonthlyDebtPayments


In [None]:
# Column names before dropping the rejected features.
train_data.columns

In [None]:
# Remove the id column, the raw columns replaced by engineered features, and the
# engineered features that were rejected. The drop is in place, so re-running
# this cell raises KeyError because the columns are already gone.
train_data.drop(["Unnamed: 0","NumberOfOpenCreditLinesAndLoans",\
                 "NumberOfTimes90DaysLate","NumberRealEstateLoansOrLines","NumberOfTime60-89DaysPastDueNotWorse",\
                 "WithDependents","age_map","is_senior","is_working", "MonthlyDebtPayments"], axis=1, inplace=True)

In [None]:
# Column names that remain after the drop.
train_data.columns

In [None]:
# Correlation heatmap of the final feature set that will be used to predict
# financial distress within the next two years.
corr = train_data.corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr, annot=True, fmt=".2g")

Ta-da! We now have a clean training dataset that is ready for modelling. Before that, though, remember that the dataset is unbalanced, so a model trained on it as-is will not generalize well to the test set. We therefore need to apply undersampling or penalized models, the strategies we shortlisted earlier.

Now let's also clean the test set! Since we have concluded what features to retain and drop. we will skip some of the process.

In [None]:
def cleaned_dataset(dataset):
    """Clean `dataset` in place with the same steps applied to the training set.

    Steps, in order:
      1. Ages below 18 are replaced with the median age of `dataset`.
      2. Missing MonthlyIncome is filled with the mean observed income of the
         row's own age bracket (18-59 "working", 60+ "senior").
      3. NumberOfDependents: absolute value, missing -> 0, cast to int64.
      4. CombinedDefaulted: 1 where the 30-59, 60-89 and 90+ day late-payment
         counts sum to at least 1, otherwise 0.
      5. CombinedCreditLoans: 1 where open credit lines plus real-estate loans
         exceed 5, otherwise 0.
      6. "Unnamed: 0" and the raw columns replaced by the engineered features
         are dropped.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the raw competition columns. It is modified in place, so
        calling this twice on the same frame raises KeyError (the source
        columns are gone after the first call).

    Returns
    -------
    None
    """
    # Ages under 18 are treated as recording errors. `< 18` (not `<= 18`) keeps
    # 18-year-olds, who belong to the working bracket defined just below.
    dataset.loc[dataset["age"] < 18, "age"] = dataset["age"].median()

    # Both brackets are derived from *this* frame. The senior mask used to be
    # built from the global `train_data`, which matched rows to the ages of
    # unrelated borrowers and left many senior incomes un-imputed.
    is_working = (dataset["age"] >= 18) & (dataset["age"] < 60)
    is_senior = dataset["age"] >= 60

    # Bracket means of the observed incomes (.mean() skips nulls).
    working_income_mean = dataset.loc[is_working, "MonthlyIncome"].mean()
    senior_income_mean = dataset.loc[is_senior, "MonthlyIncome"].mean()

    # Track missing incomes with a mask instead of a 99999 placeholder, so a
    # genuine income of exactly 99999 is never overwritten.
    income_missing = dataset["MonthlyIncome"].isna()
    # abs + trunc keeps observed incomes as non-negative whole numbers, which
    # is what the earlier abs-then-int64 cast produced for them.
    dataset["MonthlyIncome"] = np.trunc(dataset["MonthlyIncome"].abs())
    dataset.loc[income_missing & is_working, "MonthlyIncome"] = working_income_mean
    dataset.loc[income_missing & is_senior, "MonthlyIncome"] = senior_income_mean

    dataset["NumberOfDependents"] = (
        dataset["NumberOfDependents"].abs().fillna(0).astype('int64')
    )

    # Any late payment in any bucket counts as a default flag.
    dataset["CombinedDefaulted"] = (dataset["NumberOfTimes90DaysLate"] + dataset["NumberOfTime60-89DaysPastDueNotWorse"])\
                                            + dataset["NumberOfTime30-59DaysPastDueNotWorse"]
    dataset.loc[(dataset["CombinedDefaulted"] >= 1), "CombinedDefaulted"] = 1

    # More than 5 credit lines/loans in total -> 1, otherwise 0.
    dataset["CombinedCreditLoans"] = dataset["NumberOfOpenCreditLinesAndLoans"] + \
                                            dataset["NumberRealEstateLoansOrLines"]
    dataset.loc[(dataset["CombinedCreditLoans"] <= 5), "CombinedCreditLoans"] = 0
    dataset.loc[(dataset["CombinedCreditLoans"] > 5), "CombinedCreditLoans"] = 1

    dataset.drop(["Unnamed: 0","NumberOfOpenCreditLinesAndLoans",\
                 "NumberOfTimes90DaysLate","NumberRealEstateLoansOrLines","NumberOfTime60-89DaysPastDueNotWorse"], axis=1, inplace=True)

# Clean the test set; cleaned_dataset modifies test_data in place.
cleaned_dataset(test_data)

In [None]:
# Column names of the cleaned test set.
test_data.columns

In [None]:
# Column names of the cleaned training set, for comparison with the test set.
train_data.columns

In [None]:
# (rows, columns) of the cleaned training and test sets.
train_data.shape, test_data.shape

In [None]:
# Column dtypes and non-null counts of the cleaned test set.
test_data.info()

In [None]:
# Separate the target column from the predictor columns of the training set.
y = train_data["SeriousDlqin2yrs"]
X = train_data.drop(columns=["SeriousDlqin2yrs"]).copy()
X.shape, y.shape

In [None]:
# Same split for the test set; its SeriousDlqin2yrs column is carried along as y_test.
y_test = test_data["SeriousDlqin2yrs"]
X_test = test_data.drop(columns=["SeriousDlqin2yrs"]).copy()
X_test.shape, y_test.shape

In [None]:
# First attempt: a penalized logistic regression, using class_weight="balanced"
# to counter the class imbalance.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict  # NOTE(review): cross_val_predict is not used in the cells shown
from sklearn.metrics import roc_curve, roc_auc_score  # ROC curve points and AUC score
from sklearn.preprocessing import StandardScaler

# Hold out a validation split. No `stratify` argument is passed, so the split is
# purely random with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=42)
# L1-penalized logistic regression fitted with the saga solver.
logit = LogisticRegression(random_state=42, solver="saga", penalty="l1", class_weight="balanced", C=1.0, max_iter=500)
# The scaler is fitted on the training split only.
scaler = StandardScaler().fit(X_train)

Since we have a vast amount of data, we will use solver="saga" from logit and apply preprocessing of the input data using StandardScaler. class_weight="balanced" and a regularization param C to the default value of 1

In [None]:
# Apply the scaler (fitted on X_train) to both the training and validation splits.
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
# Fit on the scaled training split and score that same split.
logit.fit(X_train_scaled, y_train)
logit_scores_proba = logit.predict_proba(X_train_scaled)
# Column 1 holds the predicted probability of the positive class.
logit_scores = logit_scores_proba[:,1]

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on a new 12x10 figure.

    Parameters
    ----------
    fpr, tpr : array-like
        False-positive and true-positive rates, e.g. as returned by `roc_curve`.
    label : str, optional
        Name of the curve. When given, a legend is drawn so the label is
        actually visible; previously the label was attached to the line but
        never displayed.
    """
    plt.figure(figsize=(12,10))
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0,1],[0,1], "k--")
    plt.axis([0,1,0,1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive rate")
    if label is not None:
        plt.legend(loc="best")

In [None]:
# ROC curve and AUC of the logistic regression on the training split.
fpr_logit, tpr_logit, thresh_logit = roc_curve(y_train, logit_scores)
plot_roc_curve(fpr_logit,tpr_logit)
print("AUC Score {}".format(roc_auc_score(y_train,logit_scores)))

In [None]:
# Score the held-out validation split with the model fitted on the training split.
logit_scores_proba_val = logit.predict_proba(X_val_scaled)
logit_scores_val = logit_scores_proba_val[:,1]
fpr_logit_val, tpr_logit_val, thresh_logit_val = roc_curve(y_val, logit_scores_val)
plot_roc_curve(fpr_logit_val,tpr_logit_val)
print("AUC Score {}".format(roc_auc_score(y_val,logit_scores_val)))

With our first try at logistic regression we got an AUC score of .80, not bad! Let's try tuning the parameters to see if we can improve our score. We will try two different regularization settings, C=0.001 (much stronger regularization) and C=1000 (much weaker regularization), and raise max_iter to 1000. Our validation set score is not that far away from our training score, and that's a good thing!

In [None]:
# Same model with C=0.001 (in scikit-learn a smaller C means stronger
# regularization) and a higher iteration cap; scored on the training split.
logit_C_low = LogisticRegression(random_state=42, solver="saga", penalty="l1", class_weight="balanced", C=0.001, max_iter=1000)
logit_C_low.fit(X_train_scaled, y_train)
logit_C_low_scores_proba = logit_C_low.predict_proba(X_train_scaled)
logit_C_low_scores = logit_C_low_scores_proba[:,1]
fpr_logit_C_low, tpr_logit_C_low, thresh_logit_C_low = roc_curve(y_train, logit_C_low_scores)
#plot_roc_curve(fpr_logit_C_low,tpr_logit_C_low)
print("AUC Score {}".format(roc_auc_score(y_train,logit_C_low_scores)))

In [None]:
# Same model with C=1000 (a larger C means weaker regularization); scored on
# the training split.
logit_C_high = LogisticRegression(random_state=42, solver="saga", penalty="l1", class_weight="balanced", C=1000, max_iter=1000)
logit_C_high.fit(X_train_scaled, y_train)
logit_C_high_scores_proba = logit_C_high.predict_proba(X_train_scaled)
logit_C_high_scores = logit_C_high_scores_proba[:,1]
fpr_logit_C_high, tpr_logit_C_high, thresh_logit_C_high = roc_curve(y_train, logit_C_high_scores)
print("AUC Score {}".format(roc_auc_score(y_train,logit_C_high_scores)))

Lets visualize all of them at once!

In [None]:
# Overlay the training-split ROC curves of the three logistic regressions
# (C=1, C=1000, C=0.001) against the diagonal reference line.
plt.figure(figsize=(12,10))
plt.plot(fpr_logit, tpr_logit, label="Logit C=1")
plt.plot(fpr_logit_C_high, tpr_logit_C_high , label="Logit C=1000")
plt.plot(fpr_logit_C_low, tpr_logit_C_low , label="Logit C=0.001")
plt.plot([0,1],[0,1], "k--", label="naive prediction")
plt.axis([0,1,0,1])
plt.legend(loc="best")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive rate")

Adjusting the C parameter doesn't do much to improve our classifier's score. Let's try our second option, which is to undersample our dataset to make the target variable balanced.

In [None]:
# Random undersampling: report the class counts of the target before resampling.
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
print("Original dataset shape {}".format(Counter(y)))

In [None]:
# Randomly drop majority-class rows (seeded for reproducibility) and report the
# class counts afterwards.
rus = RandomUnderSampler(random_state=42)
# `fit_sample` was deprecated in imbalanced-learn 0.4 and later removed;
# `fit_resample` is its drop-in replacement.
X_resampled, y_resampled = rus.fit_resample(X,y)
print("Resampled dataset shape {}".format(Counter(y_resampled)))

Here we dropped most of the majority class and ended up with a 50/50 ratio. The disadvantage of this strategy is that we have lost most of the information from the majority class. The advantages are that training will be faster and that the unbalanced dataset problem is solved. Let's give it a try!

In [None]:
# Shapes of the undersampled predictors and target.
X_resampled.shape, y_resampled.shape

In [None]:
# Hold out a validation split of the undersampled data (same seed as before).
# NOTE(review): train_test_split was already imported in an earlier cell.
from sklearn.model_selection import train_test_split
X_train_rus, X_val_rus, y_train_rus, y_val_rus = train_test_split(X_resampled, y_resampled, random_state=42)
X_train_rus.shape, y_train_rus.shape

In [None]:
# Fit a dedicated scaler on the undersampled training split. It gets its own
# name so the global `scaler` fitted on X_train is not overwritten; rebinding
# it here would silently change the results of the earlier scaling cell if
# that cell were re-run.
scaler_rus = StandardScaler().fit(X_train_rus)
X_train_rus_scaled = scaler_rus.transform(X_train_rus)
X_val_rus_scaled = scaler_rus.transform(X_val_rus)

In [None]:
# Logistic regression on the undersampled data. No class_weight is passed here,
# since the classes were balanced by the resampling step. Scored on the
# (resampled) training split.
logit_resampled = LogisticRegression(random_state=42, solver="saga", penalty="l1", C=1.0, max_iter=500)
logit_resampled.fit(X_train_rus_scaled, y_train_rus)
logit_resampled_proba_res = logit_resampled.predict_proba(X_train_rus_scaled)
logit_resampled_scores = logit_resampled_proba_res[:,1]
fpr_logit_resampled, tpr_logit_resampled, thresh_logit_resampled = roc_curve(y_train_rus, logit_resampled_scores)
plot_roc_curve(fpr_logit_resampled,tpr_logit_resampled)
print("AUC Score {}".format(roc_auc_score(y_train_rus, logit_resampled_scores)))

Our score doesn't improve much with the undersampling method. One reason for this could be that the logistic regression model can't handle this vast amount of data, or that we have reached the limit of its predictive power on this type of dataset. Let's try more complex models!

One way to improve our score  is to use ensembling models.  First, we will use RandomForests and will try GradientBoostingClassifier and compare their scores.

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Class-weighted random forest with shallow trees, trained on the unscaled
# training split and scored on that same split.
forest = RandomForestClassifier(random_state=42,n_estimators=300, max_depth=5, class_weight="balanced")
forest.fit(X_train,y_train) #Using the original dataset, not the resampled
y_scores_proba = forest.predict_proba(X_train)
y_scores = y_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_train, y_scores)
plot_roc_curve(fpr,tpr)
# The trailing comment logs the scores seen for different max_depth values.
print("AUC Score {}".format(roc_auc_score(y_train,y_scores))) #max_depth=5 .8525 #7 .864 cross .85 #10 .89 cross .85 #9 .88 cross .853 #12 .92 cross .84 Overfit!

In [None]:
# Validate on the held-out split. This is a single hold-out evaluation, not
# k-fold cross-validation.
y_val_proba = forest.predict_proba(X_val)
y_scores_val = y_val_proba[:,1]
fpr_val, tpr_val, thresh_val = roc_curve(y_val, y_scores_val)
plot_roc_curve(fpr_val,tpr_val)
print("AUC Score {}".format(roc_auc_score(y_val,y_scores_val)))

Let's see how the random forest classifier treats each of the features. Here, the random forest gives huge importance to the CombinedDefaulted and RevolvingUtilizationOfUnsecuredLines features and almost disregards the other features in its predictions.

In [None]:
def plot_feature_importances(model):
    """Draw a horizontal bar chart of `model.feature_importances_`.

    The bars are labelled with the column names of the module-level frame `X`,
    so `model` must have been fitted on data with those columns in that order.
    """
    feature_names = X.columns
    bar_positions = np.arange(len(feature_names))

    plt.figure(figsize=(10,8))
    plt.barh(bar_positions, model.feature_importances_, align='center')
    plt.yticks(bar_positions, feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, len(feature_names))

# Feature importances of the fitted random forest.
plot_feature_importances(forest)

Tuning the max_depth param of randomforest with class_weight gave us a good score of .87 in the training set and .85 in the validation set. 
Here is the summary of the max_depth parameter tuning:
#7 .864 cross .85 #10 .89 cross .85 #9 .88 cross .853 #12 .92 cross .84 Overfit!

Now Let's try GradientBoostingClassifier!

In [None]:
# Gradient boosting with fairly deep trees (max_depth=8), trained on the
# unscaled training split and scored on that same split.
gbc_clf = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, max_depth=8, random_state=42)
gbc_clf.fit(X_train,y_train)
gbc_clf_proba = gbc_clf.predict_proba(X_train)
gbc_clf_scores = gbc_clf_proba[:,1]
fpr_gbc, tpr_gbc, thresh_gbc = roc_curve(y_train, gbc_clf_scores)
plot_roc_curve(fpr_gbc, tpr_gbc)
print("AUC Score {}".format(roc_auc_score(y_train, gbc_clf_scores)))

In [None]:
# AUC of the same model on the held-out validation split.
gbc_val_proba = gbc_clf.predict_proba(X_val)
gbc_val_scores = gbc_val_proba[:,1]
print("AUC Score {}".format(roc_auc_score(y_val, gbc_val_scores)))

We are overfitting! Let's try tuning the hyperparameters of our gradient boosting classifier to improve generalization.

In [None]:
# Shallower trees (max_depth=4) and fewer estimators (200) than the previous
# gradient boosting attempt, to reduce overfitting.
gbc_clf_submission = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05 ,max_depth=4,  random_state=42)
gbc_clf_submission.fit(X_train,y_train)
gbc_clf_proba = gbc_clf_submission.predict_proba(X_train)
gbc_clf_scores = gbc_clf_proba[:,1]
gbc_val_proba = gbc_clf_submission.predict_proba(X_val)
gbc_val_scores = gbc_val_proba[:,1]
fpr_gbc, tpr_gbc, thresh_gbc = roc_curve(y_train, gbc_clf_scores)
# One print per statement: the former `print(...), print(...)` built a tuple,
# so the cell also echoed `(None, None)` as its output.
print("AUC Score {}".format(roc_auc_score(y_train, gbc_clf_scores)))  # training split
print("AUC Score {}".format(roc_auc_score(y_val, gbc_val_scores)))  # validation split

In [None]:
# Plot the importances of the tuned model that is actually used for the
# submission, rather than those of the earlier, overfitting `gbc_clf`.
plot_feature_importances(gbc_clf_submission)

Here, the GradientBoostingClassifier gives more emphasis to the RevolvingUtilizationOfUnsecuredLines and DebtRatio features, and spreads importance much more evenly than the RandomForestClassifier. We're taking the GradientBoostingClassifier as our model to submit to the Kaggle competition.

In [None]:
# (rows, columns) of the test predictors.
X_test.shape

In [None]:
# Predict class probabilities for the test set with the tuned gradient boosting model.
submission_proba = gbc_clf_submission.predict_proba(X_test)
submission_scores = submission_proba[:,1] #Positive Class

In [None]:
# 1-based row ids, one per prediction. The range is derived from the number of
# predictions instead of the hard-coded 101504, so the two columns always have
# matching lengths even if the test set changes.
ids = np.arange(1, len(submission_scores) + 1)
submission = pd.DataFrame( {'Id': ids, 'Probability': submission_scores})
submission.to_csv('submission_credit.csv', index=False)