# Classification using Random Forests

Using sklearn
-----------------------

To understand the parameters of the random forest classifier, see the scikit-learn documentation:

http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html



Source: http://dataaspirant.com/2017/06/26/random-forest-classifier-python-scikit-learn/

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import os

In [2]:
# File Paths
# Data file that the notebook loads with pandas.
INPUT_PATH = "./random_forest/inputs/breast-cancer-wisconsin.data"
# Destination of the CSV copy written by data_file_to_csv(); note that it
# lives in the same inputs/ directory as the file it is derived from.
OUTPUT_PATH = "./random_forest/inputs/breast-cancer-wisconsin.csv"

def data_file_to_csv():
    """
    Load the data file at INPUT_PATH, attach the column headers and save the
    result as a CSV file at OUTPUT_PATH.

    :return: None, writes the CSV file and prints a confirmation message
    """

    # Headers
    headers = ["CodeNumber", "ClumpThickness", "UniformityCellSize", "UniformityCellShape", "MarginalAdhesion",
               "SingleEpithelialCellSize", "BareNuclei", "BlandChromatin", "NormalNucleoli", "Mitoses",
               "CancerType"]
    # Load the dataset into Pandas data frame.
    # The previous version called read_data(), which is not defined anywhere
    # in this notebook and raised NameError; pd.read_csv is used directly,
    # the same way the notebook loads the file further down.
    # NOTE(review): read_csv takes the first line of the file as the header
    # row by default and add_headers() then overwrites those names. If the
    # file has no header line its first record is lost -- confirm, and pass
    # header=None if that is the case.
    dataset = pd.read_csv(INPUT_PATH)
    # Add the headers to the loaded dataset
    dataset = add_headers(dataset, headers)
    # Save the loaded dataset into csv format (without the index column)
    dataset.to_csv(OUTPUT_PATH, index=False)
    print("File saved ...!")

def add_headers(dataset, headers):
    """
    Rename the columns of a data frame.

    The frame is modified in place: the same object that was passed in is
    handed back, now carrying the new column labels.

    :param dataset: Pandas dataframe whose columns are relabelled
    :param headers: column labels to assign, one per column
    :return: the same dataframe object, with ``headers`` as its columns
    """
    # Plain attribute assignment -- no copy of the frame is made.
    dataset.columns = headers
    return dataset

In [3]:
def dataset_statistics(dataset):
    """
    Print the summary statistics of a data frame.

    :param dataset: Pandas dataframe to summarise
    :return: None, the output of ``dataset.describe()`` is printed
    """
    summary = dataset.describe()
    print(summary)

In [4]:
# Column names, in file order; later cells index into this list.
headers = [
    "CodeNumber",
    "ClumpThickness",
    "UniformityCellSize",
    "UniformityCellShape",
    "MarginalAdhesion",
    "SingleEpithelialCellSize",
    "BareNuclei",
    "BlandChromatin",
    "NormalNucleoli",
    "Mitoses",
    "CancerType",
]

# Read the data file into a data frame.
dataset = pd.read_csv(INPUT_PATH)

In [5]:
# Wrap the loaded data in a data frame and report its dimensions:
# (rows, columns) first, then the total number of cells.
df_cancer = pd.DataFrame(data=dataset)
print(df_cancer.shape, df_cancer.size, sep="\n")
# Show six randomly chosen rows.
df_cancer.sample(n=6)

(698, 11)
7678


Unnamed: 0,CodeNumber,ClumpThickness,UniformityCellSize,UniformityCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,CancerType
58,1113906,9,5,5,2,2,2,5,1,1,4
447,1182404,1,1,1,1,1,1,1,1,1,2
374,636437,1,1,1,1,2,1,1,1,1,2
17,1050670,10,7,7,6,4,10,4,1,2,4
509,1298360,1,1,1,1,2,1,1,1,1,2
516,1320077,1,1,1,1,1,1,2,1,1,2


In [6]:
# Print the summary statistics of the freshly loaded dataset.
dataset_statistics(dataset)

         CodeNumber  ClumpThickness  UniformityCellSize  UniformityCellShape  \
count  6.980000e+02      698.000000          698.000000           698.000000   
mean   1.071807e+06        4.416905            3.137536             3.210602   
std    6.175323e+05        2.817673            3.052575             2.972867   
min    6.163400e+04        1.000000            1.000000             1.000000   
25%    8.702582e+05        2.000000            1.000000             1.000000   
50%    1.171710e+06        4.000000            1.000000             1.000000   
75%    1.238354e+06        6.000000            5.000000             5.000000   
max    1.345435e+07       10.000000           10.000000            10.000000   

       MarginalAdhesion  SingleEpithelialCellSize  BlandChromatin  \
count        698.000000                698.000000      698.000000   
mean           2.809456                  3.217765        3.438395   
std            2.856606                  2.215408        2.440056   
min

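The BareNuclei column is missing from the summary above. The data file contains `?` in some entries of this column, so pandas does not read it as a numeric column and `describe()` leaves it out.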

In [7]:
def handle_missing_values(dataset, missing_values_header, missing_label):
    """
    Drop the rows in which a given column holds the missing-value marker.

    :param dataset: Pandas dataframe to filter
    :param missing_values_header: name of the column to inspect
    :param missing_label: value that marks a missing entry in that column
    :return: the rows of ``dataset`` whose ``missing_values_header`` value
             differs from ``missing_label``; the input frame is not modified
    """
    # Boolean mask: True for every row that should be kept.
    keep_mask = dataset[missing_values_header].ne(missing_label)
    return dataset.loc[keep_mask]

We remove the rows whose BareNuclei value is missing (marked with `?`).

In [8]:
# Drop the rows whose BareNuclei entry (headers[6]) is the '?' placeholder.
dataset = handle_missing_values(dataset, headers[6], '?')
# Filtering alone leaves the remaining BareNuclei values as text (the column
# was not numeric because of the '?' entries). Convert them explicitly so the
# model receives a real numeric feature; to_numeric raises if any other
# non-numeric value is still present, instead of letting it slip through.
dataset = dataset.assign(**{headers[6]: pd.to_numeric(dataset[headers[6]])})
df_cancer = pd.DataFrame(dataset)
# Dimensions after the clean-up: (rows, columns), then total number of cells.
print(df_cancer.shape)
print(df_cancer.size)
# Show six randomly chosen rows.
df_cancer.sample(6)

(682, 11)
7502


Unnamed: 0,CodeNumber,ClumpThickness,UniformityCellSize,UniformityCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,CancerType
220,1225799,10,6,4,3,10,10,9,10,1,4
398,1206314,1,2,3,1,2,1,1,1,1,2
639,1285722,4,1,1,3,2,1,1,1,1,2
543,1186936,2,1,3,2,2,1,2,1,1,2
18,1050718,6,1,1,1,2,1,3,1,1,2
36,1081791,6,2,1,1,1,1,7,1,1,2


In [9]:
# Features: every column except the first and the last.
# Target: the last column.
# 30% of the rows are held out for testing; the seed keeps the split fixed.
train_x, test_x, train_y, test_y = train_test_split(
    df_cancer[df_cancer.columns[1:-1]],
    df_cancer[df_cancer.columns[-1]],
    test_size=0.3,
    random_state=42,
)

In [10]:
# Sanity check: print the shapes of the four pieces returned by train_test_split.
print("Train_x Shape :: ", train_x.shape)
print("Train_y Shape :: ", train_y.shape)
print("Test_x Shape :: ", test_x.shape)
print("Test_y Shape :: ", test_y.shape)
 

Train_x Shape ::  (477, 9)
Train_y Shape ::  (477,)
Test_x Shape ::  (205, 9)
Test_y Shape ::  (205,)


In [11]:
# A random forest is a stochastic model: without a fixed seed every run of
# this cell builds a different forest and the accuracy reported below changes.
# Seed it (same value as the train/test split) so the notebook is reproducible.
classifier = RandomForestClassifier(random_state=42)
# fit() returns the fitted estimator, so trained_model is the same object as classifier.
trained_model = classifier.fit(train_x, train_y)

In [12]:
predictions = trained_model.predict(test_x)
# Preview the first ten test rows (fewer if the test set is smaller).
# Pairing the values with zip avoids rebuilding list(test_y) on every
# iteration and cannot run past the end of a short test set.
for actual, predicted in zip(test_y.iloc[:10], predictions):
    print("Actual outcome :: {} and Predicted outcome :: {}".format(actual, predicted))

Actual outcome :: 4 and Predicted outcome :: 4
Actual outcome :: 2 and Predicted outcome :: 2
Actual outcome :: 4 and Predicted outcome :: 4
Actual outcome :: 2 and Predicted outcome :: 2
Actual outcome :: 4 and Predicted outcome :: 4
Actual outcome :: 2 and Predicted outcome :: 2
Actual outcome :: 2 and Predicted outcome :: 2
Actual outcome :: 2 and Predicted outcome :: 2
Actual outcome :: 2 and Predicted outcome :: 2
Actual outcome :: 2 and Predicted outcome :: 2


In [13]:
# Accuracy on the training rows (predictions are recomputed here) ...
print("Train Accuracy :: ", accuracy_score(train_y, trained_model.predict(train_x)))
# ... versus accuracy on the held-out test rows.
print("Test Accuracy  :: ", accuracy_score(test_y, predictions))
# Confusion matrix for the test set; in scikit-learn's layout the rows are the
# true classes and the columns the predicted classes.
print(" Confusion matrix \n", confusion_matrix(test_y, predictions))

Train Accuracy ::  0.9958071278825996
Test Accuracy  ::  0.9609756097560975
 Confusion matrix 
 [[134   2]
 [  6  63]]
