# Random_Forests_Example

Example file showing random forest implementation for our data set

In [1]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import KFold

import immas
from immas import MammogramImage
from immas import get_img_features
from immas.basic_functions import show_image_plt
from immas import get_dataset_features



In [2]:
# Import test data
data_train = pd.read_csv("sample-features-train.csv")
data_test = pd.read_csv("sample-features-test.csv")
data_train.sample(5)

Unnamed: 0.1,Unnamed: 0,perimeter,area,circularity,ac,mean_intensity,standard_deviation,smoothness,skewness,class_id
6,6,295.698484,3319.5,2.0961176738068854,1583.642007,32924.413395,18998.025002,1.0,-128801500000.0,-1.0
249,249,81.414214,372.5,1.4160034868367417,263.064324,32501.533816,18632.243054,1.0,444261200000.0,-1.0
141,141,101.414214,630.5,1.2980837080346426,485.71598,32660.953079,19341.114835,1.0,208462500000.0,-1.0
235,235,80.0,399.0,1.2764306212883838,312.590433,31901.018182,18736.335489,1.0,-214072400000.0,-1.0
174,174,90.0,504.0,1.2789236498455876,394.081382,31874.674545,19177.687846,1.0,218761200000.0,-1.0


In [3]:
# Separate the features(X) from the labels(y)
X_all = data_train.drop(['class_id', 'Unnamed: 0'], axis=1)
y_all = data_train['class_id']

# Train 80% of data, test 20%
num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)

In [4]:
# Fit and tune the algorithm

# Choose the type of classifier. 
clf = RandomForestClassifier()

# Choose some parameter combinations to try
parameters = {'n_estimators': [4, 6, 9], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
clf.fit(X_train, y_train)



ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [29]:
predictions = clf.predict(X_test)
print(accuracy_score(y_test, predictions))

1.0


In [32]:
# Validate with k-fold
def run_kfold(clf):
    kf = KFold(891, n_folds=10)
    outcomes = []
    fold = 0
    for train_index, test_index in kf:
        fold += 1
        X_train, X_test = X_all.values[train_index], X_all.values[test_index]
        y_train, y_test = y_all.values[train_index], y_all.values[test_index]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))     
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome)) 

run_kfold(clf)

IndexError: index 90 is out of bounds for axis 0 with size 12