# Machine Learning Models
The following classification models are trained and evaluated in this notebook:
    1. Random Forest
    2. Support Vector Machines
    3. Feed-forward Neural Networks

In [47]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
import sklearn.multioutput
import sklearn.neural_network
import sklearn.preprocessing
import sklearn.svm

%matplotlib inline

# 1. Load in data

In [17]:
# Read the 2016 and 2017 model-input tables: the X_* files hold the trip
# features and the y_* files hold the encoded trip purpose.
X_16, y_16 = (
    pd.read_csv('../../../Data/model_inputs/gdf_2016_X.csv'),
    pd.read_csv('../../../Data/model_inputs/gdf_2016_y.csv'),
)
X_17, y_17 = (
    pd.read_csv('../../../Data/model_inputs/gdf_2017_X.csv'),
    pd.read_csv('../../../Data/model_inputs/gdf_2017_y.csv'),
)

In [18]:
X_16.head()

Unnamed: 0,id_trip,mode_f,duration,distance_m,magnitude,carddir_f,start_down,end_downto,weekday,temporal_c,precip,temperatur,startrush,endrush,thrurush,startclust,endclust,land_use_s_f,land_use_e_f
0,1724206,0,460,415.23633,0.227492,0,1,1,1,3,2e-06,28.012522,0,0,0,9,9,0,0
1,1889461,1,447,1843.264582,0.470022,0,1,1,1,5,0.000134,25.844886,1,1,1,0,0,1,1
2,1724219,2,591,2657.42183,0.303495,1,1,1,1,3,0.00024,25.389363,0,0,0,0,9,1,0
3,2071991,3,844,2761.792383,0.223787,2,1,1,1,5,0.001427,24.93072,1,1,1,9,9,0,4
4,1667922,3,1211,1068.301088,0.293601,3,1,1,1,4,0.001429,21.769356,0,0,0,0,0,0,0


In [19]:
y_16.head()

Unnamed: 0,purpose_f
0,0
1,1
2,0
3,2
4,2


In [20]:
y_16['purpose_f'].value_counts()

0    15554
4    14981
1     7430
2     5790
3     5682
5     2473
6     2262
7     2168
Name: purpose_f, dtype: int64

# 2. Setup model

### 2.1 Encode model inputs

In [21]:
def encode_model_inputs(data, col, encoder=None):
    """Encode a single column of ``data`` and return the result as a dense array.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that may contain ``col``.
    col : str
        Name of the column to encode.
    encoder : object, optional
        Any object with a ``fit_transform`` method whose result exposes
        ``toarray()``. When omitted, the notebook-level ``enc`` is used, so
        ``enc`` must already be defined at call time.

    Returns
    -------
    numpy.ndarray or str
        ``encoder.fit_transform(...).toarray()`` for the column reshaped to
        ``(n_rows, 1)``, or the empty string ``''`` when ``col`` is not a
        column of ``data``.

    Notes
    -----
    The encoder is re-fitted on every call, so after the call it only
    remembers the categories of the most recently encoded column.
    The output is densified with ``toarray()``; with a one-hot encoder a
    column with many distinct values therefore yields a very wide array.
    """
    if col not in data.columns:
        # Kept for backward compatibility: a missing column yields '' (not None).
        return ''
    if encoder is None:
        encoder = enc
    # The encoder is given a 2-D (n_rows, 1) array rather than a flat vector.
    column_values = data[col].values.reshape(-1, 1)
    return encoder.fit_transform(column_values).toarray()

In [22]:
# Shared one-hot encoder used by encode_model_inputs(). With
# handle_unknown='ignore', categories not seen during fitting are ignored at
# transform time instead of raising an error.
enc = sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore')

In [23]:
# Encode each categorical feature of the 2016 inputs, keyed by column name.
encoded_inputs = {
    name: encode_model_inputs(X_16, name)
    for name in ("mode_f", "carddir_f", "startclust", "endclust",
                 "temporal_c", "land_use_s_f", "land_use_e_f")
}

# The target codes and the trip identifiers are encoded the same way.
encoded_inputs['y_codes'] = encode_model_inputs(y_16, 'purpose_f')
# NOTE(review): encoding `id_trip` produces a dense array with one column per
# distinct trip id, which may be very large - confirm this is needed.
encoded_inputs['unique_ids'] = encode_model_inputs(X_16, 'id_trip')

In [24]:
encoded_inputs['unique_ids']

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### 2.2 Normalise and split training and testing

In [34]:
def normalise(X):
    """Scale every column of ``X`` by that column's maximum value.

    Parameters
    ----------
    X : numpy.ndarray or pandas.DataFrame
        Numeric data; the maxima are taken along axis 0 (one per column).

    Returns
    -------
    Same container type as ``X``
        ``X`` divided column-wise by its maxima. A column whose maximum is
        exactly 0 is left unchanged instead of being divided by zero.

    Notes
    -----
    This is division by the maximum only (no shift by the minimum), so
    columns with negative values are not mapped onto ``[0, 1]``.
    """
    col_max = np.amax(X, axis=0)
    # Dividing by a zero maximum would turn the whole column into NaN/inf;
    # substituting 1 as the divisor leaves such a column as it is.
    safe_max = np.where(col_max == 0, 1, col_max)
    return X / safe_max


def model_setup(X, y, test_size=0.33, norm=False, random_state=42):
    """Optionally normalise ``X`` and split the data into train and test sets.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature table.
    y : array-like or pandas.DataFrame
        Target values; also used to stratify the split.
    test_size : float, default 0.33
        Passed to ``train_test_split`` as ``test_size``.
    norm : bool, default False
        When True, ``X`` is passed through ``normalise`` before splitting.
    random_state : int or None, default 42
        Seed passed to ``train_test_split``; the default reproduces the
        previously hard-coded value.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    if norm:
        X = normalise(X)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    return X_train, X_test, y_train, y_test

In [35]:
feature_list = list(X_16.columns)
feature_list[:3]

['id_trip', 'mode_f', 'duration']

In [36]:
# Split the 2016 data into training and test sets without normalisation.
# NOTE(review): X_16 still contains the `id_trip` identifier column (it is the
# first entry of feature_list) and y_16 is a single-column DataFrame - confirm
# both are intended as model inputs.
X_train, X_test, y_train, y_test  = model_setup(X_16, y_16, norm=False)

# 3. Begin Modelling
#### Model functions:
- `run_rf` == run the random forest classification model.
- `run_svc` == run the support vector machine classification model.
- `run_ann` == run the multi-layer perceptron classification model.  
- `run_mcrf` == *Experimental* run the multi-output random forest classification model.

#### Notes:
- Each model function has an optional cross-validation step. To enable it, pass `cv=True` and set `cv_val` to the number of folds (default `cv_val=5`).

In [52]:
# All Model functions
def run_rf(X_train, X_test, y_train, y_test, n_estimators=10, cv=False, cv_val=5,
           random_state=None):
    """Fit a random forest classifier and report its score on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test targets.
    n_estimators : int, default 10
        Number of trees in the forest.
    cv : bool, default False
        When True, cross-validation scores are computed on the training data
        before the final fit.
    cv_val : int, default 5
        Passed to ``cross_val_score`` as ``cv``.
    random_state : int or None, default None
        Seed for the forest. ``None`` keeps the previous unseeded behaviour;
        pass an integer for reproducible results.

    Returns
    -------
    tuple
        ``(score, preds, cv_scores)``: the value of ``clf.score`` on the test
        split, the predictions for ``X_test``, and the cross-validation scores
        (an empty list when ``cv`` is False).
    """
    clf = sklearn.ensemble.RandomForestClassifier(
        n_estimators=n_estimators, n_jobs=-1, random_state=random_state
    )
    cv_scores = []
    if cv:
        cv_scores = sklearn.model_selection.cross_val_score(clf, X_train, y_train, cv=cv_val)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    preds = clf.predict(X_test)
    print("Random Forest classification accuracy:", score)
    return score, preds, cv_scores


def run_svc(X_train, X_test, y_train, y_test, gamma_val=0.01, C_val=0.1, cv=False, cv_val=5):
    """Fit a support vector classifier and report its score on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test targets.
    gamma_val : float, default 0.01
        Passed to ``SVC`` as ``gamma``.
    C_val : float, default 0.1
        Passed to ``SVC`` as ``C``.
    cv : bool, default False
        When True, cross-validation scores are computed on the training data
        before the final fit.
    cv_val : int, default 5
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    tuple
        ``(score, preds, cv_scores)``: the value of ``clf.score`` on the test
        split, the predictions for ``X_test``, and the cross-validation scores
        (an empty list when ``cv`` is False).
    """
    # 'ovr' (one-vs-rest) replaces the previous 'ova', which is not one of the
    # values SVC accepts ('ovo' / 'ovr') and is rejected by scikit-learn
    # releases that validate constructor parameters.
    clf = sklearn.svm.SVC(gamma=gamma_val, C=C_val, decision_function_shape='ovr')
    cv_scores = []
    if cv:
        # cross_val_score fits clones of the estimator, so the same unfitted
        # instance can be reused for the final fit below.
        cv_scores = sklearn.model_selection.cross_val_score(clf, X_train, y_train, cv=cv_val)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    preds = clf.predict(X_test)
    print("Support Vector classification accuracy:", score)
    return score, preds, cv_scores


def run_ann(X_train, X_test, y_train, y_test, alpha_val=0.01, C_val=0.1, cv=False, cv_val=5):
    """Fit a multi-layer perceptron classifier and report its score on the test split.

    The network uses the 'lbfgs' solver, three hidden layers of 50 units,
    ``random_state=1`` and at most 500 iterations.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test targets.
    alpha_val : float, default 0.01
        Passed to ``MLPClassifier`` as ``alpha``.
    C_val : float, default 0.1
        Unused; retained only so existing calls keep working.
    cv : bool, default False
        When True, cross-validation scores are computed on the training data
        before the final fit.
    cv_val : int, default 5
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    tuple
        ``(score, preds, cv_scores)``: the value of ``clf.score`` on the test
        split, the predictions for ``X_test``, and the cross-validation scores
        (an empty list when ``cv`` is False).
    """
    clf = sklearn.neural_network.MLPClassifier(
        solver='lbfgs',
        alpha=alpha_val,
        hidden_layer_sizes=(50, 50, 50),
        random_state=1,
        max_iter=500,
    )
    cv_scores = []
    if cv:
        # cross_val_score fits clones of the estimator, so the same unfitted
        # instance can be reused for the final fit below.
        cv_scores = sklearn.model_selection.cross_val_score(clf, X_train, y_train, cv=cv_val)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    preds = clf.predict(X_test)
    print("Neural Network classification accuracy:", score)
    return score, preds, cv_scores


## Experimental
def run_mcrf(X_train, X_test, y_train, y_test, n_estimators=10, cv=False, cv_val=5):
    """Fit a multi-output random forest and report its score on the test split.

    A ``RandomForestClassifier`` (entropy criterion, ``random_state=42``) is
    wrapped in a ``MultiOutputClassifier``. When ``cv`` is True,
    cross-validation scores over ``cv_val`` folds are computed on the training
    data before the final fit.

    Returns ``(score, preds, cv_scores)``; ``cv_scores`` is an empty list when
    ``cv`` is False.
    """
    base_forest = sklearn.ensemble.RandomForestClassifier(
        criterion='entropy',
        n_estimators=n_estimators,
        random_state=42,
    )
    model = sklearn.multioutput.MultiOutputClassifier(base_forest, n_jobs=-1)

    if cv:
        cv_scores = sklearn.model_selection.cross_val_score(model, X_train, y_train, cv=cv_val)
    else:
        cv_scores = []

    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    preds = model.predict(X_test)
    print("Multi-output Random Forest classification accuracy:", score)
    return score, preds, cv_scores

In [53]:
run_mcrf(X_train, X_test, y_train, y_test)

Multi-output Random Forest Score: 0.4202119077072016


(0.4202119077072016, array([[0],
        [1],
        [0],
        ...,
        [0],
        [0],
        [0]]), [])

In [None]:
def calc_feature_imp(clf, feature_list, threshold=0.05):
    """Pair feature names with a fitted model's importances and rank them.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``feature_importances_``.
    feature_list : sequence of str
        Feature names, in the same order as ``clf.feature_importances_``.
        Pairing uses ``zip``, so any surplus entries on either side are
        silently dropped.
    threshold : float, default 0.05
        Minimum (rounded) importance for a feature to be kept by name.

    Returns
    -------
    feature_importances : list of (str, float)
        ``(feature, importance)`` tuples, importance rounded to 5 decimals,
        sorted from most to least important.
    above_005 : list of str
        One entry per feature, in the same order: the feature name when its
        importance is at least ``threshold``, otherwise the placeholder
        ``"purpose_f"``. The placeholder can therefore appear several times,
        and does not appear at all when every feature passes the threshold.
    """
    importances = list(clf.feature_importances_)
    feature_importances = [
        (feature, round(importance, 5))
        for feature, importance in zip(feature_list, importances)
    ]
    feature_importances = sorted(feature_importances, key=lambda pair: pair[1], reverse=True)
    above_005 = [
        name if importance >= threshold else "purpose_f"
        for name, importance in feature_importances
    ]
    return feature_importances, above_005


def subset_to_imp_features(data, above_005):
    """Keep only the columns of ``data`` named in ``above_005``, plus ``id_trip``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to subset.
    above_005 : list of str
        Column names to keep. The list is not modified.

    Returns
    -------
    pandas.DataFrame
        ``data`` restricted to the wanted columns, in their original order.
        Names that are not columns of ``data`` are ignored.
    """
    # Work on a copy: appending to the caller's list would add another
    # "id_trip" entry to it on every call.
    wanted = [*above_005, "id_trip"]
    keep_mask = [col in wanted for col in data.columns]
    return data[data.columns[keep_mask]]


def plot_feature_imp(feature_importances, title, labels=None, threshold=0.05):
    """Draw a horizontal bar chart of feature importances.

    Parameters
    ----------
    feature_importances : list of (str, float)
        ``(feature, importance)`` tuples, e.g. the first value returned by
        ``calc_feature_imp``. The ``id_trip`` entry is excluded from the plot.
    title : str
        Axes title.
    labels : mapping, optional
        Maps feature names to display labels. When omitted, the notebook-level
        ``new_column_labels`` is used if it exists. Features without an entry
        are shown under their raw name.
    threshold : float, default 0.05
        x position of the dashed red reference line.

    Returns
    -------
    matplotlib.axes.Axes
        The axes holding the chart.
    """
    if labels is None:
        # Fall back to the notebook-level mapping without failing when it has
        # not been defined.
        labels = globals().get("new_column_labels", {})

    def _display_name(name):
        try:
            return labels[name]
        except KeyError:
            return name

    fig, ax = plt.subplots(1, figsize=(10, 6))
    # Column names follow the (feature, importance) order of the tuples.
    feat_imp = pd.DataFrame(feature_importances, columns=['feature', 'importance'])
    feat_imp = feat_imp.loc[feat_imp.feature != 'id_trip']
    feat_imp.plot(kind='barh', ax=ax, legend=False)
    ax.set_yticklabels([_display_name(name) for name in feat_imp.feature.values], size=16)
    plt.xticks(size=16)
    ax.set_xlim(0, 0.2)
    # axvline's ymin/ymax are fractions of the axes height; the defaults (0, 1)
    # already span the full height.
    ax.axvline(threshold, color='r', linestyle='--')
    ax.set_xlabel("Feature Importance", size=20)
    ax.set_ylabel("Feature", size=20)
    ax.set_title("{0}".format(title), size=22)
    return ax

## Model reports

In [None]:
# TODO: placeholder - this line only references the function and computes
# nothing. Call it with the true test labels and a model's predictions to
# produce a per-class report.
sklearn.metrics.classification_report

## Grid Search

In [None]:
# TODO: placeholder - this line only references the class and runs no search.
# Instantiate it with an estimator and a parameter grid, then fit it on the
# training data.
sklearn.model_selection.GridSearchCV