In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from pandas import Series
from matplotlib import pyplot
from tqdm import tqdm_notebook as tqdm

In [2]:
def prepare_data(path, window_size, data_type, random_state=None):
    """
    Prepares data for training. Creates training and test sets (80%/20%).

    The csv is read from ``path + str(window_size) + '.csv'`` without a header
    row and with every value parsed as an integer. One column holds the label
    and all remaining columns are used as features.

    Args:
        path(str): path prefix of the csv containing data (the window size and
            '.csv' are appended to it)
        window_size(int): window size
        data_type(str): data type, must be 'temp' for temporal and 'space' for spacial.
        random_state(int or None): seed forwarded to train_test_split. The
            default (None) keeps the original unseeded split; pass an int to
            get the same split on every run.

    Returns:
        X_train, X_test, y_train, y_test: training and test sets for sklearn machine learning models.

    Raises:
        ValueError: if data_type is neither 'temp' nor 'space'.
    """
    # Fail early on a mistyped data_type instead of silently treating it as
    # temporal data and picking the wrong label column.
    if data_type not in ('temp', 'space'):
        raise ValueError(
            "data_type must be 'temp' or 'space', got {!r}".format(data_type))

    file_name = path + str(window_size) + '.csv'
    data = pd.read_csv(file_name, header=None, dtype='int')

    # The label sits at column index window_size for temporal data and at
    # 2 * window_size for spacial data.
    label_column = window_size * 2 if data_type == 'space' else window_size

    y = data[label_column]
    X = data.drop(label_column, axis=1)

    return train_test_split(X, y, test_size=0.2, random_state=random_state)

In [3]:
def train_model(model, X_train, X_test, y_train, y_test):
    """
    Trains and tests the given model on the given data. Returns the mean of
    cross-validation scores and the test score.

    Cross-validation only ever sees the training split; the test split is
    used once, after the final fit, to compute the test score.

    Args:
        model(sklearn classifier): model to train and test
        X_train, X_test, y_train, y_test: training and test data

    Returns:
        cv_score(float): mean of the model cross-validation scores (10-fold)
        test_score(float): model test score
    """
    # Keep the local name identical to the returned name: the previous
    # mismatch (cv_scores vs cv_score) raised NameError on a fresh kernel.
    cv_score = cross_val_score(model, X_train, y_train, cv=10).mean()

    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)

    return cv_score, test_score

In [4]:
def run_ML(path, windows, models, data_type, model_names=None):
    """
    Trains and evaluates every model on every window size and prints one
    table of scores per window size.

    Args:
        path(str): path prefix of the csv files; prepare_data appends the
            window size and '.csv' to it
        windows(list of ints): window sizes to evaluate, one csv per size
        models(list of sklearn classifiers): models to train and test on each
            window size
        data_type(str): data type, must be 'temp' for temporal and 'space' for spacial.
        model_names(list of str, optional): row labels of the printed table,
            one per model. When omitted, the original four labels are used if
            exactly four models are passed; otherwise each model is labelled
            with its class name.

    Returns:
        none

    Raises:
        ValueError: if model_names is given and its length differs from the
            number of models.
    """
    models = list(models)

    if model_names is None:
        # Legacy labels (spelling included) are kept so that tables printed
        # for the usual four-model call match earlier runs.
        legacy_names = ['Desicion Tree', 'Random Forest', 'Poly SVM', 'MLP']
        if len(models) == len(legacy_names):
            model_names = legacy_names
        else:
            model_names = [type(m).__name__ for m in models]
    elif len(model_names) != len(models):
        raise ValueError(
            "model_names has {} entries but {} models were given".format(
                len(model_names), len(models)))

    for w in tqdm(windows):

        # One split per window size, shared by all models so their scores
        # are comparable.
        X_train, X_test, y_train, y_test = prepare_data(path, w, data_type)

        cv_scores = []
        test_scores = []
        for m in models:
            cv_score, test_score = train_model(
                m, X_train, X_test, y_train, y_test)
            cv_scores.append(cv_score)
            test_scores.append(test_score)

        # Build the table once from the collected scores instead of filling
        # a pre-allocated empty frame.
        results = pd.DataFrame(
            {'CV Score': cv_scores, 'Test Score': test_scores},
            index=model_names, columns=['CV Score', 'Test Score'])

        print("Window size: " + str(w))
        print(results)
        print('')

In [5]:
# Evaluate the four classifiers on the temporal datasets, one csv per window size.
print('Running ML on temporal data\n')
run_ML(
    path='train_data/temp_',
    windows=[5, 10, 50, 100, 500],
    models=[
        DecisionTreeClassifier(),
        RandomForestClassifier(max_depth=2, random_state=0, n_estimators=10),
        # NOTE(review): no kernel is passed, so SVC uses its default RBF
        # kernel, for which `degree` is ignored. Add kernel='poly' if the
        # 'Poly SVM' row label is meant literally. Left unchanged here so the
        # recorded scores stay valid.
        SVC(degree=2, gamma='scale'),
        # (100,) is a one-element tuple: a single hidden layer of 100 units.
        # The previous (100) was just the int 100.
        MLPClassifier(hidden_layer_sizes=(100,), alpha=0.0001, random_state=0),
    ],
    data_type='temp')

Running ML on temporal data



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Window size: 5
               CV Score  Test Score
Desicion Tree  0.976571    0.977858
Random Forest  0.975863    0.976056
Poly SVM       0.977987    0.977858
MLP            0.978309    0.977343

Window size: 10
               CV Score  Test Score
Desicion Tree  0.987641    0.989186
Random Forest  0.989572    0.991504
Poly SVM       0.989766    0.990731
MLP            0.989315    0.990216

Window size: 50
               CV Score  Test Score
Desicion Tree  0.984874    0.984809
Random Forest  0.976699    0.978373
Poly SVM       0.989380    0.987642
MLP            0.989701    0.987899

Window size: 100
               CV Score  Test Score
Desicion Tree  0.987191    0.988157
Random Forest  0.965049    0.964470
Poly SVM       0.989187    0.989701
MLP            0.985582    0.988671

Window size: 500
               CV Score  Test Score
Desicion Tree  0.988929    0.987642
Random Forest  0.914973    0.913749
Poly SVM       0.986483    0.988414
MLP            0.966464    0.901905




In [6]:
# Evaluate the four classifiers on the spacial datasets, one csv per window size.
print('Running ML on spacial data\n')
run_ML(
    path='train_data/space_',
    windows=[5, 10, 50, 100, 500],
    models=[
        DecisionTreeClassifier(),
        RandomForestClassifier(max_depth=2, random_state=0, n_estimators=10),
        # NOTE(review): no kernel is passed, so SVC uses its default RBF
        # kernel, for which `degree` is ignored. Add kernel='poly' if the
        # 'Poly SVM' row label is meant literally. Left unchanged here so the
        # recorded scores stay valid.
        SVC(degree=2, gamma='scale'),
        # (100,) is a one-element tuple: a single hidden layer of 100 units.
        # The previous (100) was just the int 100.
        MLPClassifier(hidden_layer_sizes=(100,), alpha=0.0001, random_state=0),
    ],
    data_type='space')

Running ML on spacial data



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Window size: 5
               CV Score  Test Score
Desicion Tree  0.947540    0.950322
Random Forest  0.881760    0.864093
Poly SVM       0.957646    0.960618
MLP            0.960415    0.961133

Window size: 10
               CV Score  Test Score
Desicion Tree  0.952368    0.960618
Random Forest  0.919092    0.914286
Poly SVM       0.912268    0.907336
MLP            0.966980    0.964479

Window size: 50
               CV Score  Test Score
Desicion Tree  0.952175    0.944402
Random Forest  0.937630    0.932561
Poly SVM       0.681385    0.684685
MLP            0.964472    0.969112

Window size: 100
               CV Score  Test Score
Desicion Tree  0.946061    0.941055
Random Forest  0.934797    0.926898
Poly SVM       0.603052    0.602574
MLP            0.962151    0.966023

Window size: 500
               CV Score  Test Score
Desicion Tree  0.937757    0.939768
Random Forest  0.924882    0.927671
Poly SVM       0.508432    0.508623
MLP            0.959708    0.955985


