## ENSEMBLE MODEL

In [1]:
# import libraries

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

### Additional Libraries

In [217]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline

#Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#Classification Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

#Ensemble Algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

#Classification Metrics
from sklearn.metrics import accuracy_score



In [218]:
from sklearn.exceptions import DataConversionWarning

import warnings
warnings.filterwarnings("ignore", category=DataConversionWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 

## Load and Explore the Dataset

Credit Scoring

In [2]:
# Load Dataset
df = pd.read_csv("crx_proc.csv")

In [220]:
# View Dataset
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,Target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [221]:
# Check the number of observations and features
df.shape

(690, 16)

Check Dtypes and Null Values


In [222]:
df.info() #missing values in features: A1, A2, A4, A5, A6, A7, A14

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
A1        678 non-null object
A2        678 non-null float64
A3        690 non-null float64
A4        684 non-null object
A5        684 non-null object
A6        681 non-null object
A7        681 non-null object
A8        690 non-null float64
A9        690 non-null object
A10       690 non-null object
A11       690 non-null int64
A12       690 non-null object
A13       690 non-null object
A14       677 non-null float64
A15       690 non-null int64
Target    690 non-null object
dtypes: float64(4), int64(2), object(10)
memory usage: 86.3+ KB


In [223]:
df.describe(include='all')

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,Target
count,678,678.0,690.0,684,684,681,681,690.0,690,690,690.0,690,690,677.0,690.0,690
unique,2,,,3,3,14,9,,2,2,,2,3,,,2
top,b,,,u,g,c,v,,t,f,,f,g,,,-
freq,468,,,519,519,137,399,,361,395,,374,625,,,383
mean,,31.568171,4.758725,,,,,2.223406,,,2.4,,,184.014771,1017.385507,
std,,11.957862,4.978163,,,,,3.346513,,,4.86294,,,173.806768,5210.102598,
min,,13.75,0.0,,,,,0.0,,,0.0,,,0.0,0.0,
25%,,22.6025,1.0,,,,,0.165,,,0.0,,,75.0,0.0,
50%,,28.46,2.75,,,,,1.0,,,0.0,,,160.0,5.0,
75%,,38.23,7.2075,,,,,2.625,,,3.0,,,276.0,395.5,


## Preprocessing

### Remove Null Values

In [224]:
clean_df = df.dropna(axis=0, how='any').copy()
clean_df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,Target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [225]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 653 entries, 0 to 689
Data columns (total 16 columns):
A1        653 non-null object
A2        653 non-null float64
A3        653 non-null float64
A4        653 non-null object
A5        653 non-null object
A6        653 non-null object
A7        653 non-null object
A8        653 non-null float64
A9        653 non-null object
A10       653 non-null object
A11       653 non-null int64
A12       653 non-null object
A13       653 non-null object
A14       653 non-null float64
A15       653 non-null int64
Target    653 non-null object
dtypes: float64(4), int64(2), object(10)
memory usage: 86.7+ KB


Check Number of Remaining Samples

In [226]:
clean_df.shape

(653, 16)

### Convert Categorical Data to Numerical

In [227]:
#categoricaltype: 'Target','A1','A4','A5','A6','A7','A9','A10','A12','A13'
convert_df = pd.get_dummies(clean_df, drop_first=True)

In [228]:
convert_df.columns

Index([u'A2', u'A3', u'A8', u'A11', u'A14', u'A15', u'A1_b', u'A4_u', u'A4_y',
       u'A5_gg', u'A5_p', u'A6_c', u'A6_cc', u'A6_d', u'A6_e', u'A6_ff',
       u'A6_i', u'A6_j', u'A6_k', u'A6_m', u'A6_q', u'A6_r', u'A6_w', u'A6_x',
       u'A7_dd', u'A7_ff', u'A7_h', u'A7_j', u'A7_n', u'A7_o', u'A7_v',
       u'A7_z', u'A9_t', u'A10_t', u'A12_t', u'A13_p', u'A13_s', u'Target_-'],
      dtype='object')

### Split Features and Target

In [234]:
features = ['A2', 'A3','A8','A11','A14','A15','A1_b','A4_u','A4_y',
            'A5_gg','A5_p','A6_c','A6_cc','A6_d','A6_e','A6_ff', 
            'A6_i','A6_j','A6_k','A6_m','A6_q','A6_r','A6_w', 'A6_x',
            'A7_dd','A7_ff','A7_h','A7_j','A7_n','A7_o', 'A7_v',
            'A7_z','A9_t','A10_t','A12_t','A13_p','A13_s']

# Separating out the features
X = convert_df.loc[:, features].values
# Separating out the target
y = convert_df.loc[:,['Target_-']].values

### Generate the training and validation set with the following conditions
* Use the "train_test_split" function with these parameters:
    * test_size = 0.30
    * random_state = 123

* Use these variable names:
    * X_train
    * y_train
    * X_val
    * y_val

In [241]:
df_train, df_val = train_test_split(convert_df, test_size=0.3, random_state=123)

In [242]:
#separate the class column of train and test dataset
y_train = df_train["Target_-"]
y_val = df_val["Target_-"]

del df_val["Target_-"]
del df_train["Target_-"]
X_train = df_train
X_val = df_val
#X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.3, random_state = 123)

Verify the shapes of each variable to make sure it was properly assigned

In [243]:
print "X_train", X_train.shape
print "X_val", X_val.shape
print "y_train", y_train.shape
print "y_val", y_val.shape

X_train (457, 37)
X_val (196, 37)
y_train (457L,)
y_val (196L,)


Check value counts of target variable for both training and validation set

In [255]:
y_train.value_counts()

1    250
0    207
Name: Target_-, dtype: int64

In [256]:
y_val.value_counts()

1    107
0     89
Name: Target_-, dtype: int64

## Classification Algorithms

Build multiple classifiers

* Use the make_pipeline function
* Every algorithm should go through the pipeline (MinMaxScaler, Classifier) 
* For each classifier, get the accuracy score on the validation test set

In [257]:
model_LogReg = make_pipeline(MinMaxScaler(), LogisticRegression(random_state=123))
model_DecTre = make_pipeline(MinMaxScaler(), DecisionTreeClassifier(random_state=123))

### Logistic Regression

#### Build a Logistic Regression Classifier
* Use default parameters with random_State=123
* Use a pipeline (MinMaxScaler, LogisticRegression)
* Get accuracy score for the validation set

In [258]:
model_LogReg.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=123, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [259]:
predicted = model_LogReg.predict(X_val)

In [260]:
print ("Accuracy score:",accuracy_score(y_val, predicted))

('Accuracy score:', 0.8724489795918368)


### Decision Tree

#### Build a Decision Tree Classifier
* Use default parameters with random_State=123
* Use a pipeline (MinMaxScaler, DecisionTree)
* Get accuracy score for the validation set

In [261]:
model_DecTre.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best'))])

In [263]:
predicted_decTre = model_DecTre.predict(X_val)

In [264]:
print ("Accuracy score:",accuracy_score(y_val, predicted_decTre))

('Accuracy score:', 0.8367346938775511)


## Ensemble Models

### Random Forest

#### Build a Random Forest Classifier
* Use default parameters with random_State=123
* Use a pipeline (MinMaxScaler, RandomForestClassifier)
* Get accuracy score for the validation set

In [265]:
model_Random = make_pipeline(MinMaxScaler(), RandomForestClassifier(random_state=123))

In [266]:
model_Random.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_spl...timators=10, n_jobs=1,
            oob_score=False, random_state=123, verbose=0, warm_start=False))])

In [267]:
predicted_rand = model_Random.predict(X_val)
print ("Accuracy score:",accuracy_score(y_val, predicted_rand))

('Accuracy score:', 0.8826530612244898)


#### Get Feature Importance from the RandomForest Classifier
Display the Top 5 Features

In [268]:
clf = RandomForestClassifier()
clf = clf.fit(X_train, y_train)

In [269]:
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
importances

array([5.77859364e-02, 8.12249083e-02, 9.44046467e-02, 1.13911217e-01,
       6.90635511e-02, 9.70659353e-02, 9.27840936e-03, 6.46216536e-03,
       6.41387395e-03, 1.61212444e-03, 1.41808443e-02, 1.30790856e-02,
       4.36806578e-03, 1.66487838e-03, 1.61153914e-03, 9.70903142e-03,
       8.97842870e-03, 7.26545997e-04, 4.42209542e-03, 2.61324733e-03,
       1.23898273e-02, 1.32197855e-06, 1.07995183e-02, 1.39501872e-02,
       2.36473900e-04, 1.85380983e-03, 8.49287585e-03, 1.17056593e-03,
       3.61357638e-03, 3.81187384e-03, 7.81245757e-03, 1.36270691e-03,
       2.74447396e-01, 3.40311427e-02, 1.86702390e-02, 0.00000000e+00,
       8.77949715e-03])

In [270]:
print("Feature ranking:")

for f in range(5):
    print("%d. feature %d - %s (%f)"   % (f + 1, indices[f], convert_df[:-2].columns[f],importances[indices[f]]))
# Plot the feature importances of the forest

Feature ranking:
1. feature 32 - A2 (0.274447)
2. feature 3 - A3 (0.113911)
3. feature 5 - A8 (0.097066)
4. feature 2 - A11 (0.094405)
5. feature 1 - A14 (0.081225)


#### Perform Hyper Parameter Optimization on the Random Forest Classifier
* Choose either Grid Search or Random Search with the following parameters:
    * cv=5
    * refit=True
* Optimize the following parameters:
    * max_depth
    * max_features
    * min_samples_split
    * min_samples_leaf
    * bootstrap
    * criterion
* Get Accuracy Score for validation set

Build the Model

In [271]:
param_grid = {"max_depth": [3,10, 20],
              "max_features": [1, 10, 12],
              "min_samples_split": [2, 5, 10],
              "min_samples_leaf": [1, 2],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [272]:
scaler = MinMaxScaler()
clf = RandomForestClassifier(random_state=123)
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5, refit=True)
model_Grid = make_pipeline(scaler, grid_search)

In [273]:
model_Grid.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('gridsearchcv', GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
        ...   pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0))])

Print the Best Score

In [274]:
rs = model_Grid.steps[1][1]
print "Best score:"
print rs.best_score_

Best score:
0.8862144420131292


Print the Best Parameters

In [276]:
print "Best parameters:"
print rs.best_params_

Best parameters:
{'bootstrap': True, 'min_samples_leaf': 2, 'min_samples_split': 5, 'criterion': 'entropy', 'max_features': 12, 'max_depth': 20}


### GBM

#### Build a GBM Classifier
* Use default parameters with random_State=123
* Use a pipeline (MinMaxScaler, GBM)
* Get accuracy score for the validation set

In [277]:
model_GBM = make_pipeline(MinMaxScaler(),GradientBoostingClassifier(random_state=123))

In [278]:
model_GBM.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('gradientboostingclassifier', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              mi...       presort='auto', random_state=123, subsample=1.0, verbose=0,
              warm_start=False))])

In [279]:
predicted_GBM = model_GBM.predict(X_test)
print ("Accuracy score:",accuracy_score(y_test, predicted_GBM))

('Accuracy score:', 0.8979591836734694)


### VOTING

#### Build a Voting Classifier
* Use the following estimators:
    * Logistic Regression
    * Random Forest
    * GBM
* Choose either 'hard' or 'soft' voting
* Use a pipeline (MinMaxScaler, Voting)
* Get accuracy score for the validation set

In [295]:
est1 = LogisticRegression()
est2 = RandomForestClassifier()
est3 = GradientBoostingClassifier()

model_vc = VotingClassifier(estimators=[('lr', est1), ('rf', est2), ('gbc', est3)], voting='hard')

In [291]:
model_Voting = make_pipeline(MinMaxScaler(), model_vc)

In [292]:
from sklearn.model_selection import cross_val_score

In [296]:
for clf, label in zip([est1, est2, est3, model_Voting], ['Logistic Regression', 'Random Forest', 'Gradient Boosting', 'Ensemble']):
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.85 (+/- 0.02) [Logistic Regression]
Accuracy: 0.85 (+/- 0.02) [Random Forest]
Accuracy: 0.87 (+/- 0.02) [Gradient Boosting]


  if diff:
  if diff:
  if diff:
  if diff:


Accuracy: 0.87 (+/- 0.02) [Ensemble]


  if diff:
