This notebook prepares data extracted by the  01_get_training_data notebook from the HDF5 file contaning simulation data for ML, and trains and evaluates several ML models (decision trees, random forest, polynomial SVM and MLP).

Author: Mariia Lundvall (lundvm@uw.edu) <br>
Date: 01/21/2019

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from pandas import Series
from matplotlib import pyplot
from tqdm import tqdm_notebook as tqdm

<b> NOTE: Update the paths prior to running this notebook.  <b>

In [2]:
# path to temporal training data
# these files are created by the 01_get_training_data notebook
temp_path = '/home/NETID/lundvm/data/train_data/temp_'
# path to spacial training data
# these files are created by the 01_get_training_data notebook
space_path = '/home/NETID/lundvm/data/train_data/space_'

In [3]:
def prepare_data(path, window_size, data_type):
    """
    Prepares data for training. Creates training and test sets (80%/20%). 

    Args:
        path(str): path to the csv containing data
        window_size(int): window size
        data_type(str): data type, must be 'temp' for temporal and 'space' for spacial.

    Returns:
        X_train, X_test, y_train, y_test: training and test sets for sklearn machine learning models.
    """

    file_name = path + str(window_size) + '.csv'
    data = pd.read_csv(file_name, header=None, dtype='int')
    if data_type == 'space':
        window_size = window_size*2
    y = data[window_size]
    X = data.drop(window_size, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    return X_train, X_test, y_train, y_test

In [4]:
def train_model(model, X_train, X_test, y_train, y_test):
    """
    Trains and tests the given model on the given data. Returns the mean of 
    cross-validation scores and the test score. 

    Args:
        model(sklearn classifier): model to train and test
        X_train, X_test, y_train, y_test: training and test data

    Returns:
        cv_score(float): mean of the model cross-validation scores (10-fold)
        test_score(float): model test score
    """
    cv_score = cross_val_score(model, X_train, y_train, cv=10).mean()
    model.fit(X_train, y_train)
    y_pred_ = model.predict(X_train)
    test_score = model.score(X_test, y_test)

    return cv_score, test_score

In [5]:
def run_ML(path, windows, models, data_type):
    """
    Runs ML. 

    Args:
        path(str): path to the csv containing data
        windows(list of ints): window sizes
        models(list of sklearn classifiers): list of models to train
        data_type(str): data type, must be 'temp' for temporal and 'space' for spacial.

    Returns:
        none
    """

    for w in tqdm(windows):

        results = pd.DataFrame(index=['Desicion Tree', 'Random Forest', 'Poly SVM', 'MLP'], columns=[
                               'CV Score', 'Test Score'])
        cv_scores = []
        test_scores = []
        X_train, X_test, y_train, y_test = prepare_data(path, w, data_type)

        for m in models:
            cv_score, test_score = train_model(m, X_train, X_test, y_train, y_test)
            cv_scores.append(cv_score)
            test_scores.append(test_score)

        results['CV Score'] = cv_scores
        results['Test Score'] = test_scores

        print("Window size: " + str(w))
        print(results)
        print('')

In [None]:
print('Running ML on temporal data\n')
windows=[5, 10, 50, 100, 500]
models=[DecisionTreeClassifier(), RandomForestClassifier(max_depth=2, random_state=0, n_estimators=10), 
        SVC(degree=2, gamma='scale'), MLPClassifier(hidden_layer_sizes=(100), alpha=0.0001, random_state=0)]
run_ML(temp_path, windows, models, 'temp') 

Running ML on temporal data



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Window size: 5
               CV Score  Test Score
Desicion Tree  0.979247    0.977835
Random Forest  0.979634    0.977577
Poly SVM       0.980987    0.977320
MLP            0.980923    0.977835

Window size: 10
               CV Score  Test Score
Desicion Tree  0.988270    0.988402
Random Forest  0.990332    0.990722
Poly SVM       0.990203    0.990722
MLP            0.989946    0.989948

Window size: 50
               CV Score  Test Score
Desicion Tree  0.984274    0.983505
Random Forest  0.983179    0.981701
Poly SVM       0.989559    0.990206
MLP            0.988270    0.991237

Window size: 100
               CV Score  Test Score
Desicion Tree  0.987304    0.987371
Random Forest  0.974542    0.970619
Poly SVM       0.989752    0.987371
MLP            0.948811    0.987887

Window size: 500
               CV Score  Test Score
Desicion Tree  0.991171    0.992268
Random Forest  0.932972    0.937113
Poly SVM       0.989817    0.991753
MLP            0.907518    0.975515




In [None]:
print('Running ML on spacial data\n')
windows=[5, 10, 50, 100, 500]
models=[DecisionTreeClassifier(), RandomForestClassifier(max_depth=2, random_state=0, n_estimators=10), 
        SVC(degree=2, gamma='scale'), MLPClassifier(hidden_layer_sizes=(100), alpha=0.0001, random_state=0)]
run_ML(space_path, windows, models, 'space') 

Running ML on spacial data



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))



Window size: 5
               CV Score  Test Score
Desicion Tree  0.943160    0.947165
Random Forest  0.876907    0.880670
Poly SVM       0.942257    0.945619
MLP            0.955209    0.939691

Window size: 10
               CV Score  Test Score
Desicion Tree  0.955983    0.956186
Random Forest  0.912354    0.918814
Poly SVM       0.901269    0.897423
MLP            0.969453    0.970876

Window size: 50
               CV Score  Test Score
Desicion Tree  0.948766    0.954124
Random Forest  0.932783    0.931701
Poly SVM       0.701168    0.687887
MLP            0.968358    0.968299

Window size: 100
               CV Score  Test Score
Desicion Tree  0.945544    0.949742
Random Forest  0.932911    0.940206
Poly SVM       0.624477    0.636856
MLP            0.963782    0.969072

