# Machine Learning Models
The following classification models are set up and evaluated here:
    1. Random Forest
    2. Support Vector Machines
    3. Feed-forward Neural Networks

In [3]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import cross_val_score


import sklearn
import shapely.wkt
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.neural_network import MLPClassifier
import mplleaflet
from scipy.spatial.distance import pdist, squareform, euclidean, directed_hausdorff
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


## Load in data

In [11]:
# Load the pre-built model inputs from CSV. The paths are relative, so they
# resolve against the notebook's working directory.
# The *_X files hold the feature table and the *_y files the target column
# (`purpose_f`); the 16/17 suffix follows the 2016/2017 in the file names.
X_16 = pd.read_csv('../../../Data/model_inputs/gdf_2016_X.csv')
y_16 = pd.read_csv('../../../Data/model_inputs/gdf_2016_y.csv')
X_17 = pd.read_csv('../../../Data/model_inputs/gdf_2017_X.csv')
y_17 = pd.read_csv('../../../Data/model_inputs/gdf_2017_y.csv')

In [14]:
X_16.head()

Unnamed: 0,id_trip,mode_f,duration,distance_m,magnitude,carddir_f,start_down,end_downto,weekday,temporal_c,precip,temperatur,startrush,endrush,thrurush,startclust,endclust,land_use_s_f,land_use_e_f
0,1724206,0,460,415.23633,0.227492,0,1,1,1,3,2e-06,28.012522,0,0,0,9,9,0,0
1,1889461,1,447,1843.264582,0.470022,0,1,1,1,5,0.000134,25.844886,1,1,1,0,0,1,1
2,1724219,2,591,2657.42183,0.303495,1,1,1,1,3,0.00024,25.389363,0,0,0,0,9,1,0
3,2071991,3,844,2761.792383,0.223787,2,1,1,1,5,0.001427,24.93072,1,1,1,9,9,0,4
4,1667922,3,1211,1068.301088,0.293601,3,1,1,1,4,0.001429,21.769356,0,0,0,0,0,0,0


In [18]:
y_16.head()

Unnamed: 0,purpose_f
0,0
1,1
2,0
3,2
4,2


In [17]:
y_16['purpose_f'].value_counts()

0    15554
4    14981
1     7430
2     5790
3     5682
5     2473
6     2262
7     2168
Name: purpose_f, dtype: int64

## Setup model

In [None]:
def model_setup(data, norm=False, oversample=False):
    """Build features and target from raw trip data and split into train/test.

    Factorised versions of the categorical columns are added to ``data`` in
    place (``land_use_start_f``, ``land_use_end_f``, ``purpose_f``, ``mode_f``
    and ``carddir_f``), so the caller's frame gains these columns.

    Relies on names defined outside this function: ``factorisation`` (applied
    element-wise to each categorical column), ``ros`` (only used when
    ``oversample`` is true) and ``normalise`` (only used when ``norm`` is true).

    Parameters
    ----------
    data : pandas.DataFrame
        Trip table containing the raw categorical columns and every column
        named in the feature selection below.
    norm : bool, default False
        If true, pass the feature matrix through ``normalise`` before the split.
    oversample : bool, default False
        If true, resample features and target with ``ros.fit_resample`` before
        the split.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Output of ``train_test_split`` with ``test_size=0.33`` and
        ``random_state=42``.
    feature_list : list of str
        Names of the feature columns, in column order (includes ``id_trip``).
    """
    data['land_use_start_f'] = data['land_use_start'].apply(factorisation, land_use=True)
    data['land_use_end_f'] = data['land_use_end'].apply(factorisation, land_use=True)
    data['purpose_f'] = data['purpose'].apply(factorisation, purpose=True)
    data['mode_f'] = data['mode'].apply(factorisation, mode=True)
    data['carddir_f'] = data['carddir'].apply(factorisation, carddir=True)

    feature_columns = [
        'id_trip', "mode_f", "seconds", "distance_m", "magnitude", "carddir_f",
        "start_down", "end_downto", "weekday", "temporal_c", "precip",
        "temperatur", "startrush", "endrush", "thrurush", "startclust",
        "endclust", "land_use_start_f", "land_use_end_f",
    ]
    # Explicit copy: the weekday cast below must write to an independent frame,
    # not to a selection of `data` (which triggers SettingWithCopyWarning).
    X = data[feature_columns].copy()
    y = data[["purpose_f"]]  # purpose labels
    X["weekday"] = X["weekday"].astype(int)
    feature_list = list(X.columns)
    if oversample:
        X, y = ros.fit_resample(X, y)
    if norm:
        X = normalise(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    return X_train, X_test, y_train, y_test, feature_list
    
def normalise(X):
    """Scale each column of ``X`` by that column's maximum value.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Numeric data; the maximum is taken along axis 0.

    Returns
    -------
    Same kind of object as ``X / array`` yields (DataFrame in, DataFrame out;
    ndarray in, ndarray out), with every column divided by its maximum.
    Columns whose maximum is exactly 0 are returned unchanged instead of being
    turned into NaN/inf by a division by zero.
    """
    col_max = np.asarray(np.amax(X, axis=0))
    # A zero maximum would give 0/0 = NaN (or -inf for negative entries);
    # dividing those columns by 1 leaves them as they are.
    divisor = np.where(col_max == 0, 1, col_max)
    return X / divisor

def subset_to_imp_features(data, above_005):
    """Keep only the columns of ``data`` named in ``above_005``, plus ``id_trip``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to subset.
    above_005 : iterable of str
        Column names to keep. It is not modified; previously ``"id_trip"`` was
        appended to the caller's list on every call.

    Returns
    -------
    pandas.DataFrame
        The selected columns, in their original column order.
    """
    # Build the keep-set locally so the caller's list is left untouched.
    keep = set(above_005) | {"id_trip"}
    mask = [col in keep for col in data.columns]
    return data.loc[:, mask]
    

def run_rf(data, X_train=None, X_test=None, y_train=None, y_test=None, feature_list=None, norm=False, cv=False, oversample=False):
    """Train and score a 256-tree random forest classifier.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw trip table; only used (via ``model_setup``) when ``X_train`` is
        not supplied.
    X_train, X_test, y_train, y_test : optional
        Pre-split data. When ``X_train`` is ``None`` all four, together with
        ``feature_list``, are produced by ``model_setup``.
    feature_list : list of str, optional
        Feature names, returned unchanged when a pre-split is supplied.
    norm, oversample : bool
        Forwarded to ``model_setup``.
    cv : bool, default False
        If true, report 5-fold cross-validation scores on the training data;
        the returned classifier is then NOT fitted. Otherwise fit on the
        training data and report the score on the test data.

    Returns
    -------
    clf : RandomForestClassifier
    feature_list : list of str or None
    scores : ndarray (``cv=True``) or float (``cv=False``)
    """
    # `not X_train` raises ValueError for DataFrames/arrays (ambiguous truth
    # value), so the "nothing supplied" case must be an identity check.
    if X_train is None:
        X_train, X_test, y_train, y_test, feature_list = model_setup(data, norm=norm, oversample=oversample)
    try:
        # Non-inplace fill: leaves the caller's frames untouched.
        X_train = X_train.fillna(0)
        X_test = X_test.fillna(0)
    except AttributeError:
        # Plain arrays have no fillna; nan_to_num replaces NaN with 0 as well.
        X_train = np.nan_to_num(X_train)
        X_test = np.nan_to_num(X_test)
    clf = RandomForestClassifier(n_estimators=256, n_jobs=-1)
    if cv:
        scores = cross_val_score(clf, X_train, y_train, cv=5)
    else:
        clf.fit(X_train, y_train)
        scores = clf.score(X_test, y_test)
    print("Score:", scores)
    return clf, feature_list, scores

def calc_feature_imp(clf, feature_list):
    """Rank features by importance and flag those at or above 0.05.

    Parameters
    ----------
    clf : fitted estimator exposing ``feature_importances_``
    feature_list : sequence of str
        Feature names, in the same order as ``clf.feature_importances_``.

    Returns
    -------
    feature_importances : list of (str, float)
        ``(name, importance)`` pairs, importance rounded to 5 decimals,
        sorted from most to least important.
    above_005 : list of str
        One entry per ranked feature: the feature name when its rounded
        importance is >= 0.05, otherwise the placeholder ``"purpose_f"``.
    """
    # Pair every feature name with its rounded importance.
    ranked = []
    for name, weight in zip(feature_list, list(clf.feature_importances_)):
        ranked.append((name, round(weight, 5)))
    # Most important first.
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    above_005 = []
    for name, weight in ranked:
        if weight >= 0.05:
            above_005.append(name)
        else:
            above_005.append("purpose_f")
    return ranked, above_005

def plot_feature_imp(feature_importances, title):
    """Draw a horizontal bar chart of feature importances.

    Parameters
    ----------
    feature_importances : list of (str, float)
        ``(feature name, importance)`` pairs, e.g. the first value returned
        by ``calc_feature_imp``. The ``id_trip`` entry is left out of the plot.
    title : object
        Plot title (formatted with ``str.format``).

    Returns
    -------
    matplotlib.axes.Axes
        The axes the chart was drawn on.

    Notes
    -----
    Tick labels are looked up in ``new_column_labels``, a mapping from column
    name to display label that is defined outside this function.
    """
    _, ax = plt.subplots(1, figsize=(10, 6))
    # Column labels follow the tuple order (name first, value second); they
    # were previously swapped, which made the filtering below read backwards.
    feat_imp = pd.DataFrame(feature_importances, columns=['feature', 'importance'])
    feat_imp = feat_imp.loc[feat_imp['feature'] != 'id_trip']
    # Only the numeric importance column is drawn; the names become tick labels.
    feat_imp.plot(kind='barh', ax=ax, legend=False)
    ax.set_yticklabels([new_column_labels[name] for name in feat_imp['feature'].values], size=16)
    plt.xticks(size=16)
    ax.set_xlim(0, 0.2)
    # Dashed red line at the 0.05 importance level.
    ax.axvline(0.05, -10, 40, color='r', linestyle='--')
    ax.set_xlabel("Feature Importance", size=20)
    ax.set_ylabel("Feature", size=20)
    ax.set_title("{0}".format(title), size=22)
    return ax

def _one_hot(values):
    """Return the dense one-hot encoding of a single pandas column."""
    enc = OneHotEncoder(handle_unknown='ignore')
    return enc.fit_transform(values.values.reshape(-1, 1)).toarray()


def encode_inputs(data):
    """Turn a trip table into a model input matrix and a one-hot target.

    Every categorical column that is present (``mode_f``, ``carddir_f``,
    ``startclust``, ``endclust``, ``land_use_start_f``, ``land_use_end_f``) is
    one-hot encoded. Previously an ``if/elif`` chain encoded only the first of
    these that was found and silently dropped the others. All remaining
    columns except ``purpose_f`` and ``id_trip`` are treated as numeric and
    divided by their column maximum.

    ``data`` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``purpose_f`` and ``id_trip``; the categorical columns
        and ``weekday`` are optional.

    Returns
    -------
    other_inputs : numpy.ndarray
        Columns in this order: scaled numeric features, one-hot blocks for
        mode, carddir, startclust, endclust, land_use_start, land_use_end,
        and finally the raw ``id_trip`` values as the last column.
    y_codes : numpy.ndarray
        One-hot encoding of ``purpose_f``.
    """
    # Columns that must not enter the numeric block: the categoricals (which
    # are one-hot encoded instead), the target and the trip id.
    non_code_columns = ["mode_f", "carddir_f", "startclust", "endclust",
                        "land_use_start_f", "land_use_end_f", "purpose_f", "id_trip"]
    empty = np.array([])

    # Each categorical column is checked independently, so any combination
    # of them can be present.
    mode_codes = _one_hot(data['mode_f']) if 'mode_f' in data.columns else empty
    carddir_codes = _one_hot(data['carddir_f']) if 'carddir_f' in data.columns else empty
    stcl_codes = _one_hot(data['startclust']) if 'startclust' in data.columns else empty
    encl_codes = _one_hot(data['endclust']) if 'endclust' in data.columns else empty
    land_use_start_codes = (_one_hot(data['land_use_start_f'].astype(int))
                            if 'land_use_start_f' in data.columns else empty)
    land_use_end_codes = (_one_hot(data['land_use_end_f'].astype(int))
                          if 'land_use_end_f' in data.columns else empty)
    y_codes = _one_hot(data['purpose_f'])

    unique_ids = data['id_trip'].values.reshape(-1, 1)

    numeric = data[[col for col in data.columns if col not in non_code_columns]]
    if 'weekday' in numeric.columns:
        # assign() returns a new frame, so the caller's data keeps its dtype.
        numeric = numeric.assign(weekday=numeric['weekday'].astype(int))
    non_codes = numeric.values
    # Normalise by column maximum; a zero maximum is replaced by 1 so an
    # all-zero column stays 0 instead of becoming NaN.
    col_max = np.amax(non_codes, axis=0)
    non_codes = non_codes / np.where(col_max == 0, 1, col_max)

    # Concatenate only the blocks that actually hold data.
    blocks = [non_codes, mode_codes, carddir_codes, stcl_codes, encl_codes,
              land_use_start_codes, land_use_end_codes, unique_ids]
    other_inputs = np.concatenate([cds for cds in blocks if len(cds) != 0], axis=1)
    return other_inputs, y_codes

def run_mcrf(all_input, y_codes, cv=False, X_train=None, X_test=None, y_train=None, y_test=None):
    """Train and score a multi-output random forest on one-hot targets.

    Parameters
    ----------
    all_input : array-like
        Feature matrix; only used when ``X_train`` is not supplied.
    y_codes : array-like
        One-hot encoded target; only used when ``X_train`` is not supplied.
    cv : bool, default False
        If true, additionally compute 5-fold cross-validation scores on the
        training data.
    X_train, X_test, y_train, y_test : optional
        Pre-split data. When ``X_train`` is ``None`` the split is made here
        with ``test_size=0.33`` and ``random_state=42``.

    Returns
    -------
    cv_scores : ndarray, or ``[]`` when ``cv`` is false
    score : float
        ``score`` of the fitted model on the test data.
    preds : ndarray
        Predictions for the test data.
    """
    # `type(X_train) == None` is never true, so the default split was skipped
    # and the model was fitted on None; test identity with None instead.
    if X_train is None:
        X_train, X_test, y_train, y_test = train_test_split(all_input, y_codes, test_size=0.33, random_state=42)
    clf = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)
    multi_target_forest = MultiOutputClassifier(clf, n_jobs=-1)
    cv_scores = []
    if cv:
        # cross_val_score fits clones of the estimator, so the same unfitted
        # instance can be fitted on the full training set afterwards.
        cv_scores = cross_val_score(multi_target_forest, X_train, y_train, cv=5)

    multi_target_forest.fit(X_train, y_train)
    score = multi_target_forest.score(X_test, y_test)
    preds = multi_target_forest.predict(X_test)
    return cv_scores, score, preds

def run_svc(X_train, X_test, y_train, y_test, cv=False):
    """Train and score a support vector classifier (``gamma=0.01``, ``C=0.1``).

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test labels.
    cv : bool, default False
        If true, additionally compute 5-fold cross-validation scores on the
        training data.

    Returns
    -------
    cv_scores : ndarray, or ``[]`` when ``cv`` is false
    score : float
        Mean accuracy on the test data.
    preds : ndarray
        Predicted labels for the test data.
    """
    # 'ova' is not an accepted decision_function_shape ('ovo' or 'ovr');
    # 'ovr' is the one-vs-rest ("one-vs-all") option. The setting only shapes
    # decision_function output, not predict/score.
    clf = svm.SVC(gamma=0.01, C=0.1, decision_function_shape='ovr')
    cv_scores = []
    if cv:
        # cross_val_score fits clones, leaving clf itself unfitted.
        cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    preds = clf.predict(X_test)
    return cv_scores, score, preds


def run_ann(X_train, X_test, y_train, y_test, cv=False):
    """Train and score a feed-forward neural network classifier.

    The network is an ``MLPClassifier`` with three hidden layers of 50 units,
    the ``lbfgs`` solver, ``alpha=0.01``, ``max_iter=500`` and
    ``random_state=1``.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test labels.
    cv : bool, default False
        If true, additionally compute 5-fold cross-validation scores on the
        training data.

    Returns
    -------
    cv_scores : ndarray, or ``[]`` when ``cv`` is false
    score : float
        ``score`` of the fitted model on the test data.
    preds : ndarray
        Predictions for the test data.
    """
    clf = MLPClassifier(solver='lbfgs', alpha=0.01, hidden_layer_sizes=(50, 50, 50), random_state=1, max_iter=500)
    cv_scores = []
    if cv:
        # cross_val_score fits clones, so no extra fit is needed here; the
        # previous additional fit in this branch was thrown away immediately.
        cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    preds = clf.predict(X_test)
    return cv_scores, score, preds