# Random Forest Classification

### Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing Dataset

In [2]:
dataset = pd.read_csv('Social_Network_Ads.csv')

In [3]:
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values

In [4]:
dataset.head()

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


In [5]:
dataset.shape

(400, 3)

In [6]:
dataset.info

<bound method DataFrame.info of      Age  EstimatedSalary  Purchased
0     19            19000          0
1     35            20000          0
2     26            43000          0
3     27            57000          0
4     19            76000          0
..   ...              ...        ...
395   46            41000          1
396   51            23000          1
397   50            20000          1
398   36            33000          0
399   49            36000          1

[400 rows x 3 columns]>

### Splitting the dataset into Training Set And Testing Set

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)

In [9]:
X_train.shape

(300, 2)

In [10]:
X_test[:3]

array([[   30, 87000],
       [   38, 50000],
       [   35, 75000]], dtype=int64)

In [11]:
print(Y_train)

[0 1 0 1 1 1 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 1
 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 0 0 1 1 0 0 1 0 1
 1 1 0 0 1 1 0 0 1 1 0 1 0 0 1 1 0 1 1 1 0 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 0
 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0
 0 0 1 0 1 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 0 0 1 0 0
 0 0 0 0 1 1 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0
 0 1 1 0 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0
 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1
 0 0 0 0]


In [12]:
print(Y_test)

[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 1 0 1 1 0 0 0 1 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 1 0 0 1
 0 0 0 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 0 0 0 1 0 1 1 1]


### Feature Scaling

In [13]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [14]:
print(X_train[0:10])

[[ 0.58164944 -0.88670699]
 [-0.60673761  1.46173768]
 [-0.01254409 -0.5677824 ]
 [-0.60673761  1.89663484]
 [ 1.37390747 -1.40858358]
 [ 1.47293972  0.99784738]
 [ 0.08648817 -0.79972756]
 [-0.01254409 -0.24885782]
 [-0.21060859 -0.5677824 ]
 [-0.21060859 -0.19087153]]


In [15]:
print(X_test[0:5])

[[-0.80480212  0.50496393]
 [-0.01254409 -0.5677824 ]
 [-0.30964085  0.1570462 ]
 [-0.80480212  0.27301877]
 [-0.30964085 -0.5677824 ]]


### Training the Random Forest Classification model on the Training set

In [16]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=5, criterion='entropy', random_state=0)
classifier.fit(X_train, Y_train)

RandomForestClassifier(criterion='entropy', n_estimators=5, random_state=0)

### Predict a New Result

In [17]:
print(classifier.predict(sc.transform([[30, 87000]])))

[0]


### Predict the Test Set Result

In [18]:
Y_pred = classifier.predict(X_test)
print(np.concatenate((Y_pred.reshape(len(Y_pred),1), Y_test.reshape(len(Y_test), 1)),1))

[[0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]]


### Making the Confusion Matrix

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
accuracy_score(Y_test, Y_pred)

[[62  6]
 [ 3 29]]


0.91

### Visualising the Training Set Results

In [20]:
import warnings
warnings.filterwarnings('ignore')

In [21]:
"""from matplotlib.colors import ListedColormap
X_set, Y_set = sc.inverse_transform(X_train), Y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min()-10, stop=X_set[:, 0].max() + 10, step = 0.25), 
                    np.arange(start = X_set[:,1].min()-1000, stop=X_set[:, 1].max() + 1000, step = 0.25))
plt.contourf(X1, X2, classifier.predict(sc.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
            alpha = 0.75, cmap= ListedColormap(('red', 'Green')))
plt.xlim(X1.min(), X1.max())
plt.xlim(X2.min(), X2.max())
for i, j in enumerate(np.unique(Y_set)):
    plt.scatter(X_set[Y_set == j, 0], X_set[Y_set==j, 1], c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Random Forest Classification (Training Set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()"""

"from matplotlib.colors import ListedColormap\nX_set, Y_set = sc.inverse_transform(X_train), Y_train\nX1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min()-10, stop=X_set[:, 0].max() + 10, step = 0.25), \n                    np.arange(start = X_set[:,1].min()-1000, stop=X_set[:, 1].max() + 1000, step = 0.25))\nplt.contourf(X1, X2, classifier.predict(sc.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),\n            alpha = 0.75, cmap= ListedColormap(('red', 'Green')))\nplt.xlim(X1.min(), X1.max())\nplt.xlim(X2.min(), X2.max())\nfor i, j in enumerate(np.unique(Y_set)):\n    plt.scatter(X_set[Y_set == j, 0], X_set[Y_set==j, 1], c=ListedColormap(('red', 'green'))(i), label=j)\nplt.title('Random Forest Classification (Training Set)')\nplt.xlabel('Age')\nplt.ylabel('Estimated Salary')\nplt.legend()\nplt.show()"

### Visualising the Test set results

In [22]:
from matplotlib.colors import ListedColormap
X_set, Y_set = sc.inverse_transform(X_test), Y_test
X1, X2 = np.meshgrid(np.arange(start=X_set[:,0].min()-10, stop=X_set[:,0].max()+10, step=0.25),
                    (np.arange(start=X_set[:,1].min()-1000, stop=X_set[:,1].max()+1000, step=0.25))
plt.contour(X1, X2, classifier.predict(sc.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape[X1.shape],
           alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[Y_set==j, 0], X_set[Y_set==j, 1], c= ListedColormap(('red', 'green'))(i), label=j)
plt.title('Random Forest Classification (Test Set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

"from matplotlib.colors import ListedColormap\nX_set, Y_set = sc.inverse_transform(X_test), Y_test\nX1, X2 = np.meshgrid(np.arange(start=X_set[:,0].min()-10, stop=X_set[:,0].max()+10, step=0.25),\n                    (np.arange(start=X_set[:,1].min()-1000, stop=X_set[:,1].max()+1000, step=0.25))\nplt.contour(X1, X2, classifier.predict(sc.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape[X1.shape],\n           alpha=0.75, cmap=ListedColormap(('red', 'green')))\nplt.xlim(X1.min(), X1.max())\nplt.ylim(X2.min(), X2.max())\nfor i, j in enumerate(np.unique(y_set)):\n    plt.scatter(X_set[Y_set==j, 0], X_set[Y_set==j, 1], c= ListedColormap(('red', 'green'))(i), label=j)\nplt.title('Random Forest Classification (Test Set)')\nplt.xlabel('Age')\nplt.ylabel('Estimated Salary')\nplt.legend()\nplt.show()"

In [23]:
"""from sklearn.tree import export_graphviz
from IPython import display
from sklearn.ensemble import RandomForestRegressor

m = RandomForestRegressor(n_estimators=5, max_depth=3, bootstrap=False, n_jobs=-1)
m.fit(X_train, Y_train)

str_tree = export_graphviz(m,
                          out_file=None,
                          feature_names=dataset.coluns,
                          filled=True,
                          special_characters=True,
                          rotate=True,
                          precision=1)
display.display(str_tree)"""

'from sklearn.tree import export_graphviz\nfrom IPython import display\nfrom sklearn.ensemble import RandomForestRegressor\n\nm = RandomForestRegressor(n_estimators=5, max_depth=3, bootstrap=False, n_jobs=-1)\nm.fit(X_train, Y_train)\n\nstr_tree = export_graphviz(m,\n                          out_file=None,\n                          feature_names=dataset.coluns,\n                          filled=True,\n                          special_characters=True,\n                          rotate=True,\n                          precision=1)\ndisplay.display(str_tree)'

In [26]:
"""X_train.columns"""

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [27]:
dataset.head()

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


### Visualizing the Tree

In [31]:
import PIL
import pydotplus
from glob import glob
from IPython.display import display, Image
from sklearn.tree import export_graphviz

In [32]:
def save_tree_as_png(clf, iteration, feature_name, target_name):
    file_name='Purchased_' +str(iteration)+'png'
    dot_data = export_graphviz(clf,
                              out_file=None,
                              feature_names=feature_name,
                              class_names=target_name,
                              rounded=True,
                              proportion=False,
                              precision=2,
                              filled=True,
                              )
    graph= pydotplus.graph_from_dot_data(dot_data)
    graph.write_png(file_name)
    print('Decision Tree {} saved as png file'.format(iteration + 1))

In [33]:
col = dataset.columns.tolist()
feature_names=col[:2]
target_names=col[2]

In [34]:
target_names

'Purchased'

In [35]:
classifier.estimators_

[DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=209652396),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=398764591),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=924231285),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1478610112),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=441365315)]

In [36]:
classifier.estimators_[0]

DecisionTreeClassifier(criterion='entropy', max_features='auto',
                       random_state=209652396)

In [37]:
for i in range(len(classifier.estimators_)):
    save_tree_as_png(classifier.estimators_[i], i, feature_names, target_names)

Decision Tree 1 saved as png file
Decision Tree 2 saved as png file
Decision Tree 3 saved as png file
Decision Tree 4 saved as png file
Decision Tree 5 saved as png file


In [38]:
images = [PIL.Image.open(f) for f in glob('/*.png')]

In [41]:
images

[]

In [42]:
for im in images:
    display(Image(filename=im.filename, retina=True))