## Random Forest
### Classification

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
glass=pd.read_csv("glassClass.csv")

In [5]:
glass.head(7)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1
5,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.0,0.26,1
6,1.51743,13.3,3.6,1.14,73.09,0.58,8.17,0.0,0.0,1


In [6]:
X= glass.drop("Type", axis=1) #predictors
Y = glass["Type"] #predictor

### training and testing data

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = .2, random_state=25) #20% hold out for testing

In [8]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
random_forest.score(X_train, y_train) #100% accuracy when we implement the RF model on the same data set

1.0

In [10]:
Y_pred = random_forest.predict(X_test) #predict the classification of glass based on test predictors

In [11]:
Y_pred #predicted values of classifications

array([2, 3, 2, 1, 5, 5, 2, 1, 2, 7, 2, 6, 5, 1, 2, 6, 2, 1, 1, 2, 2, 2, 2,
       2, 2, 2, 5, 2, 7, 2, 1, 3, 7, 1, 7, 1, 7, 2, 1, 1, 2, 2, 7], dtype=int64)

In [12]:
random_forest.predict_proba(X_test)[0:10] #How confident is the classifier about each glass type? 

array([[ 0.34,  0.4 ,  0.21,  0.01,  0.03,  0.01],
       [ 0.4 ,  0.15,  0.44,  0.  ,  0.  ,  0.01],
       [ 0.27,  0.38,  0.33,  0.01,  0.01,  0.  ],
       [ 0.93,  0.03,  0.04,  0.  ,  0.  ,  0.  ],
       [ 0.04,  0.11,  0.  ,  0.78,  0.05,  0.02],
       [ 0.06,  0.23,  0.02,  0.55,  0.11,  0.03],
       [ 0.13,  0.83,  0.04,  0.  ,  0.  ,  0.  ],
       [ 0.4 ,  0.39,  0.2 ,  0.01,  0.  ,  0.  ],
       [ 0.27,  0.35,  0.08,  0.03,  0.24,  0.03],
       [ 0.02,  0.01,  0.01,  0.01,  0.07,  0.88]])

In [13]:
##performance on test set

In [14]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, Y_pred)
confusion_matrix

array([[ 9,  3,  0,  0,  0,  0],
       [ 1, 15,  1,  2,  0,  0],
       [ 0,  0,  1,  0,  0,  0],
       [ 0,  0,  0,  2,  0,  0],
       [ 0,  1,  0,  0,  2,  0],
       [ 0,  0,  0,  0,  0,  6]], dtype=int64)

In [15]:
from sklearn.metrics import accuracy_score

In [16]:
accuracy_score(y_test, Y_pred) #compare with the actual y values, y_test (hold outs) with predicted y

0.81395348837209303

In [17]:
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(y_test, Y_pred) #Scores above .8 are generally considered good agreement; 

0.7386018237082066

In [18]:
from sklearn.metrics import classification_report

In [19]:
report = classification_report(y_test, Y_pred)
print(report)

             precision    recall  f1-score   support

          1       0.90      0.75      0.82        12
          2       0.79      0.79      0.79        19
          3       0.50      1.00      0.67         1
          5       0.50      1.00      0.67         2
          6       1.00      0.67      0.80         3
          7       1.00      1.00      1.00         6

avg / total       0.84      0.81      0.82        43



In [20]:
#out of all the examples the classifier labeled , what fraction were correct? 

### feature selection

In [21]:
from sklearn.feature_selection import RFE  ## Recursive Feature Elimination (RFE) method is a feature selection approach.
#It works by recursively removing attributes and building a model on those attributes that remain. 
#It uses the model accuracy to identify which attributes (and combination of attributes) contribute the most to predicting 
#the target attribute

In [22]:
# create the RFE model and select 5 attributes
rfe = RFE(random_forest, 5)
fit = rfe.fit(X, Y)
# summarize the selection of the attributes
print("Num Features: %d") % fit.n_features_
print("Selected Features: %s") % fit.support_
print("Feature Ranking: %s") % fit.ranking_

Num Features: 5
Selected Features: [ True  True  True  True False False  True False False]
Feature Ranking: [1 1 1 1 2 3 1 4 5]


In [23]:
from sklearn.ensemble import ExtraTreesClassifier

In [24]:
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(X, Y)
importances = forest.feature_importances_

In [25]:
indices = np.argsort(importances)[::-1]

In [26]:
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

Feature ranking:
1. feature 2 (0.176743)
2. feature 0 (0.138102)
3. feature 3 (0.135153)
4. feature 6 (0.132065)
5. feature 5 (0.100274)
6. feature 1 (0.094001)
7. feature 4 (0.088294)
8. feature 7 (0.081736)
9. feature 8 (0.053633)


In [27]:
X.head(5)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0


In [28]:
from sklearn.model_selection import KFold
from sklearn import model_selection
seed = 10
kfold = model_selection.KFold(n_splits=10, random_state=seed)

In [29]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(random_forest, X, Y, cv=5)

In [30]:
print(scores)

[ 0.66666667  0.77272727  0.6744186   0.61904762  0.875     ]
