In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [2]:
def load_data():
    
    ### import training data from csv files
    main_file = "labels.csv"
    main_label = np.loadtxt(main_file, delimiter = ",")
    main_label = main_label.reshape((main_label.shape[0],1))
    m = main_label.shape[0]
    
    # changing the labels to digits starting from 0
    labels = np.zeros((m,1), dtype = "int")
    num_class = int(0)
    current_bus = main_label[0]
    bus_reference = {str(num_class): current_bus} # dictionary containing original labels

    for i in range(m):
        if main_label[i] != current_bus:
            num_class += int(1)
            current_bus = main_label[i]
            bus_reference[str(num_class)] = current_bus
    
        labels[i] = num_class
    labels = labels.astype(int)
    
    main_file = "features.csv"
    main_features = np.loadtxt(main_file, delimiter=",")
    
    DATA = {'labels': labels,
            'main_features': main_features}
    num_class+=1
    
    return DATA, bus_reference, num_class

In [3]:
def select_features(selected_buses, DATA):
    
    # U can select what features to use for training
    main_features = DATA['main_features']
    m = len(selected_buses)
    n = DATA['main_features'].shape[0]
    features = main_features[:,selected_buses]
    
    return features

In [4]:
def split_dataset(features, labels, test_percentage):
    
    #Split data to training/crossval/test sets
    indices = np.random.permutation(features.shape[0])
    num_test = int(features.shape[0]*test_percentage)
    num_test2 = num_test // 2
    crossval_index, test_index, train_index = indices[:num_test2], indices[num_test2:num_test], indices[num_test:]
    
    crossval_features, test_features, train_features = features[crossval_index,:], features[test_index,:], features[train_index,:]
    crossval_labels, test_labels, train_labels = labels[crossval_index,:], labels[test_index,:], labels[train_index,:]
    
    dataset={'train_labels':       train_labels.reshape(train_labels.shape[0],),
             'train_features':     train_features,
             'test_labels':        test_labels.reshape(test_labels.shape[0],),
             'test_features':      test_features,
             'crossval_labels':    crossval_labels.reshape(crossval_labels.shape[0],),
             'crossval_features':  crossval_features}
    
    return dataset

In [5]:
DATA, bus_reference, num_class = load_data()

selected_buses = [1, 2, 3, 11, 27, 29, 50, 57, 74, 89]
features = select_features(selected_buses, DATA)

labels = DATA['labels']

dataset = split_dataset(features, labels, 0.2)

train_labels      = dataset['train_labels']
train_features    = dataset['train_features']
test_labels       = dataset['test_labels']
test_features     = dataset['test_features']
crossval_labels   = dataset['crossval_labels']
crossval_features = dataset['crossval_features']

In [6]:
# hyperparameter optimization using crossval set

arr1 = range(10,100,5); arr2 = range(5,20)
n_est_best = 0; n_depth_best = 0; max_acc = 0

for i in arr1:
    for j in arr2:
        model = RandomForestClassifier(n_estimators=i, max_depth=j, random_state=0).fit(crossval_features, crossval_labels)
        acc = model.score(crossval_features, crossval_labels)
        if acc>max_acc:
            max_acc = acc; n_est_best = i; n_depth_best = j
            
print("best acc: %f, n_estimators: %i, max_depth: %i" %(max_acc, n_est_best, n_depth_best))

best acc: 1.000000, n_estimators: 35, max_depth: 19


In [7]:
# training and testing the model

model = RandomForestClassifier(n_estimators=n_est_best, max_depth=n_depth_best, random_state=0).fit(train_features, train_labels)
acc = model.score(train_features, train_labels)
print("training accuracy: %f" %acc)

model = RandomForestClassifier(n_estimators=n_est_best, max_depth=n_depth_best, random_state=0).fit(test_features, test_labels)
acc = model.score(test_features, test_labels)
print("test accuracy: %f" %acc)

training accuracy: 0.995096
test accuracy: 0.998462
