In [None]:
from sklearn.model_selection import train_test_split
import pandas
import numpy
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from pprint import pprint

**Reading Data**

In [None]:
# Load the training set; the path is resolved relative to the notebook's working directory.
data = pandas.read_csv('cs-training.csv')

In [None]:
# data.head()

**Removing NaNs and cleaning data**

In [None]:
# Drop rows with a missing value in either column, then renumber the rows.
# The original `data.reset_index()` discarded its result and so did nothing;
# `drop=True` avoids adding an extra 'index' column, which would shift the positional
# column slices used here and when the test set is loaded.
data = (
    data
    .dropna(axis=0, how='any', subset=['NumberOfDependents', 'MonthlyIncome'])
    .reset_index(drop=True)
)

# Skip the first CSV column by position (presumably a row-id column - confirm against the file).
# `.copy()` makes train_frame an independent frame, so the column assignments in later
# cells do not write into a slice of `data` (SettingWithCopyWarning).
train_frame = data[data.columns[1:]].copy()

# Show every column except the first one of train_frame.
train_frame.columns[1:]

In [None]:
# Preview the first rows of the cleaned training frame.
train_frame.head()

**Creating new features and cleaning more**

In [None]:
# Derived features, added in this order so the resulting column layout is unchanged.
train_frame['MonthlyDebt'] = train_frame['DebtRatio'] * train_frame['MonthlyIncome']
train_frame['NumOfPastDue'] = (
    train_frame['NumberOfTimes90DaysLate']
    + train_frame['NumberOfTime60-89DaysPastDueNotWorse']
    + train_frame['NumberOfTime30-59DaysPastDueNotWorse']
)
train_frame['MonthlyBalance'] = train_frame['MonthlyIncome'] - train_frame['MonthlyDebt']
train_frame['NumOfOpenCreditLines'] = (
    train_frame['NumberOfOpenCreditLinesAndLoans'] - train_frame['NumberRealEstateLoansOrLines']
)
# The +1 keeps the divisor non-zero when NumberOfDependents is 0
# (presumably it also counts the borrower - confirm).
train_frame['IncomePerPerson'] = train_frame['MonthlyIncome'] / (train_frame['NumberOfDependents'] + 1)

# Clamp non-positive balances to 1 (rows are kept, not dropped) so the log10(1 + x)
# transform applied to MonthlyBalance in a later cell stays defined.
# A single .loc assignment replaces the chained `frame[col][mask] = 1`, which writes to a
# temporary object and has no effect under pandas Copy-on-Write.
train_frame.loc[train_frame['MonthlyBalance'] <= 0, 'MonthlyBalance'] = 1

In [None]:
# train_frame.columns

**Plot the distribution of each variable to see how skewed they are**

In [None]:
# Uncomment below lines if needed

# for each in train_frame.columns[1:]:
#     sns.distplot(train_frame[each])
#     plt.show()

**Taking a log transform to reduce skewness in some variables**

In [None]:
# Columns that get the log10(1 + x) transform, in the order they are processed.
skewed_columns = (
    'RevolvingUtilizationOfUnsecuredLines',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumOfPastDue',
    'MonthlyDebt',
    'MonthlyIncome',
    'DebtRatio',
    'NumberRealEstateLoansOrLines',
    'IncomePerPerson',
    'MonthlyBalance',
)
for name in skewed_columns:
    # Work on the raw array and write the result back over the same column.
    raw_values = train_frame[name].values
    train_frame[name] = numpy.log10(raw_values + 1)

**Plot distribution again**

In [None]:
# Uncomment below lines if needed

# for each in train_frame.columns[1:]:
#     sns.distplot(train_frame[each])
#     plt.show()

**Removing outliers**

In [None]:
# Drop the records whose age is one of the values treated as outliers here.
outlier_ages = [0, 99, 101]
train_frame = train_frame[~train_frame['age'].isin(outlier_ages)]

**Choosing features and splitting dataset**

In [None]:
# Model inputs. The order matters: it fixes the column order of the feature matrix and
# the labels of the feature-importance plot.
features = [
    'RevolvingUtilizationOfUnsecuredLines',
    'MonthlyDebt',
    'MonthlyIncome',
    'DebtRatio',
    'age',
    'IncomePerPerson',
    'MonthlyBalance',
    'NumOfOpenCreditLines',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
    'NumOfPastDue',
]
# Alternative kept from the original: use every column after the first.
# features = train_frame.columns[1:]

x, y = train_frame[features], train_frame['SeriousDlqin2yrs']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

**Standardizing the features with a scaler**

In [None]:
# Learn the scaling statistics from the training split only, then apply the same
# transform to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

**Initialize Model**

In [None]:
# Baseline forest; this is the estimator the (currently disabled) grid search would tune.
# Line continuations are unnecessary inside the parentheses and have been removed.
rf = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,  # needs bootstrap=True, set below
    min_samples_split=2,
    min_samples_leaf=50,
    n_jobs=-1,  # use all available cores
    # class_weight="balanced",
    class_weight="balanced_subsample",
    bootstrap=True,
    random_state=0,  # seed the forest so reruns give the same model (matches the split's seed)
)

**Tuning Hyperparameters**

In [None]:
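# Optional hyperparameter search (left disabled). The `iid=False` argument was dropped from
# the call below: GridSearchCV deprecated `iid` in scikit-learn 0.22 and removed it in 0.24.
# param_grid = {"max_features": [2, 3, 4], "min_samples_leaf":[50]}
# grid_search = GridSearchCV(rf, cv=10, scoring='roc_auc', param_grid=param_grid)
# grid_search.fit(x_train, y_train)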

In [None]:
# pprint(grid_search.best_params_)
# pprint(grid_search.best_score_)

In [None]:
# Final model: same settings as the baseline forest, plus a fixed max_features and the
# entropy split criterion. This is the estimator that is fitted and evaluated below.
rf = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,  # needs bootstrap=True, set below
    min_samples_split=2,
    min_samples_leaf=50,
    n_jobs=-1,  # use all available cores
    # class_weight="balanced",
    class_weight="balanced_subsample",
    bootstrap=True,
    max_features=2,
    criterion='entropy',
    random_state=0,  # seed the forest so metrics and predictions are reproducible across reruns
)

In [None]:
# Train the forest on the standardized training split.
rf.fit(x_train, y_train)

In [None]:
# Hard class predictions for the held-out split; used for the confusion matrix.
y_pred = rf.predict(x_test)

In [None]:
# Confusion matrix of held-out labels vs. predictions, drawn as an annotated heatmap.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt="d", cmap=plt.cm.Greens)
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_title('Confusion matrix')
plt.tight_layout()

In [None]:
# Horizontal bar chart of the forest's feature importances, sorted ascending so the
# most important feature ends up at the top.
importances = rf.feature_importances_
indices = numpy.argsort(importances)
bar_positions = range(len(indices))
bar_labels = [features[i] for i in indices]

plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, bar_labels)
plt.title('Feature Importances')
plt.xlabel('Relative Importance')
plt.show()

In [None]:
# Load the set to be scored; the path is resolved relative to the notebook's working directory.
test_data = pandas.read_csv('cs-test.csv')

In [None]:
# Keep the training frame's columns from the third one onward, i.e. skip its first two
# (presumably the row id and the target - confirm against the CSV).
# `.copy()` makes test_data an independent frame, so the column assignments in the
# following cells do not write into a slice (SettingWithCopyWarning).
test_data = test_data[data.columns[2:]].copy()

In [None]:
# Same derived features as for the training frame, added in this cell's original order.
test_data['MonthlyDebt'] = test_data['DebtRatio'] * test_data['MonthlyIncome']
test_data['NumOfPastDue'] = (
    test_data['NumberOfTimes90DaysLate']
    + test_data['NumberOfTime60-89DaysPastDueNotWorse']
    + test_data['NumberOfTime30-59DaysPastDueNotWorse']
)
test_data['NumOfOpenCreditLines'] = (
    test_data['NumberOfOpenCreditLinesAndLoans'] - test_data['NumberRealEstateLoansOrLines']
)
# The +1 keeps the divisor non-zero when NumberOfDependents is 0.
test_data['IncomePerPerson'] = test_data['MonthlyIncome'] / (test_data['NumberOfDependents'] + 1)
test_data['MonthlyBalance'] = test_data['MonthlyIncome'] - test_data['MonthlyDebt']

# Clamp non-positive balances to 1 so the log10(1 + x) transform in the next cell stays
# defined. NaN balances do not match the mask and are left as NaN.
# A single .loc assignment replaces the chained `frame[col][mask] = 1`, which writes to a
# temporary object and has no effect under pandas Copy-on-Write.
test_data.loc[test_data['MonthlyBalance'] <= 0, 'MonthlyBalance'] = 1

In [None]:
# Columns that get the log10(1 + x) transform, in the order they are processed.
skewed_test_columns = (
    'RevolvingUtilizationOfUnsecuredLines',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumOfPastDue',
    'MonthlyDebt',
    'MonthlyIncome',
    'DebtRatio',
    'NumberRealEstateLoansOrLines',
    'IncomePerPerson',
    'MonthlyBalance',
)
for name in skewed_test_columns:
    # Work on the raw array and write the result back over the same column.
    raw_values = test_data[name].values
    test_data[name] = numpy.log10(raw_values + 1)

In [None]:
# Select the model's input columns, in the same order used for training.
x = test_data[features]

In [None]:
# Reuse the scaler that was fitted on the training split. The forest was trained on
# features standardized with those statistics; fitting a fresh StandardScaler on the
# test set (as before) would hand it differently scaled inputs and would also overwrite
# the training `scaler`.
# Transforming straight from test_data[features] keeps this cell safe to re-run.
# Missing values pass through StandardScaler unchanged and are imputed in the next cell.
x = scaler.transform(test_data[features])

In [None]:
# Fill the remaining missing values using SimpleImputer's default strategy; the imputer
# is fitted on the same matrix it transforms.
imp = SimpleImputer()
x = imp.fit_transform(x)

In [None]:
# Per-class probabilities for every row: one column per entry of rf.classes_.
probs = rf.predict_proba(x)

In [None]:
# Keep only the second column, the probability of rf.classes_[1]
# (presumably the positive class, SeriousDlqin2yrs == 1 - confirm).
probs = [class_probs[1] for class_probs in probs]

In [None]:
# One-column result frame whose index is shifted to start at 1 instead of 0.
out = pandas.DataFrame({'Probability': probs})
out.index += 1

In [None]:
# Write the probabilities to out.csv; the 1-based index is written as the first, unnamed column.
out.to_csv('out.csv')