# Random Forests Example

Example file showing random forest implementation for our data set

## Classifier training and testing

In [43]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import KFold

import immas
from immas import MammogramImage
from immas import get_img_features
from immas.basic_functions import show_image_plt
from immas import get_dataset_features

In [44]:
# Import the data
data = pd.read_csv("classifier-train-data.csv")
data.sample(5)

Unnamed: 0.1,Unnamed: 0,perimeter,area,circularity,ac,mean_intensity,standard_deviation,smoothness,skewness,class_id
1867,1868,899.095453,14885.5,4.321538,3444.491487,33043.33714,18935.918275,1.0,12635300000.0,-1.0
3390,3391,172.727922,1420.5,1.671375,849.898855,34477.479416,18878.9737,1.0,-311762500000.0,-1.0
1588,1589,452.526911,6216.5,2.621398,2371.444238,33082.684734,18999.99636,1.0,116301300000.0,-1.0
2380,2381,167.656854,1114.0,2.007925,554.801491,33239.54591,19124.833629,1.0,-142521600000.0,-1.0
1444,1445,248.727922,2068.5,2.380037,869.104166,32677.747719,18878.957259,1.0,-123699700000.0,-1.0


In [45]:
# Separate the features(X) from the labels(y)
X_all = data.drop(['class_id', 'Unnamed: 0'], axis=1)
y_all = data['class_id']

# Train 80% of data, test 20%
num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)

In [27]:
# Fit and tune the algorithm

# Choose the type of classifier. 
clf = RandomForestClassifier()

# Choose some parameter combinations to try
parameters = {'n_estimators': [4, 6, 9], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=3, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=4, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
predictions = clf.predict(X_test)
print(accuracy_score(y_test, predictions))

0.9894736842105263


In [29]:
# Validate with k-fold
def run_kfold(clf):
    kf = KFold(891, n_folds=10)
    outcomes = []
    fold = 0
    for train_index, test_index in kf:
        fold += 1
        X_train, X_test = X_all.values[train_index], X_all.values[test_index]
        y_train, y_test = y_all.values[train_index], y_all.values[test_index]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))     
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome)) 

run_kfold(clf)

Fold 1 accuracy: 1.0
Fold 2 accuracy: 1.0
Fold 3 accuracy: 0.9887640449438202
Fold 4 accuracy: 1.0
Fold 5 accuracy: 0.9662921348314607
Fold 6 accuracy: 0.9887640449438202
Fold 7 accuracy: 1.0
Fold 8 accuracy: 0.9887640449438202
Fold 9 accuracy: 0.9887640449438202
Fold 10 accuracy: 0.9775280898876404
Mean Accuracy: 0.9898876404494381


In [30]:
# Let's see how many masses we managed to detect out of all regions
x_test2 = X_test[y_test == 1]
y_test2 = y_test[y_test == 1]

print(f"Length of test data {len(X_test)}")
print(f"Number of masses in the test data {len(x_test2)}")

predictions = clf.predict(x_test2)

print(predictions)
print(f"Detected percentage of masses {accuracy_score(y_test2, predictions)}")

Length of test data 855
Number of masses in the test data 26
[ 1. -1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1. -1.  1.
  1. -1. -1.  1.  1.  1.  1.  1.]
Detected percentage of masses 0.8461538461538461


In [31]:
# let's test performance on the initial data
x_test_all = X_all[y_all == 1]
y_test_all = y_all[y_all == 1]

print(f"Length of test data {len(X_all)}")
print(f"Number of masses in the test data {len(x_test_all)}")

predictions = clf.predict(x_test_all)
print(f"Detected percentage of masses {accuracy_score(y_test_all, predictions)}")

Length of test data 4273
Number of masses in the test data 114
Detected percentage of masses 0.7719298245614035


## Example of saving classifier to the disk and loading it back

Let's save our classifier

In [41]:
import pickle

# save the classifier
with open("./models/rf_classifier.pkl", "wb") as fid:
    pickle.dump(clf, fid)    

Now we will load classifier and test it again

In [46]:
# load it again
with open("./models/rf_classifier.pkl", "rb") as fid:
    rf_classifier_loaded = pickle.load(fid)
    
# let's test performance on the initial data
print(f"Length of test data {len(X_all)}")
print(f"Number of masses in the test data {len(x_test_all)}")

predictions_loaded = rf_classifier_loaded.predict(x_test_all)
print(f"Detected percentage of masses {accuracy_score(y_test_all, predictions_loaded)}")

Length of test data 4273
Number of masses in the test data 114
Detected percentage of masses 0.7719298245614035
