In [138]:
# stacked generalization with linear meta model on blobs dataset
from sklearn.datasets import make_blobs
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from keras.models import load_model
from numpy import dstack
from keras.utils import np_utils
import time
import numpy as np
import random
import pickle
from sklearn.model_selection import train_test_split

In [3]:
# load models from file
def load_all_models(n_models):
	all_models = list()
	for i in range(n_models):
		# define filename for this ensemble
		filename = 'models/model_CNN_' + str(i + 1) + '.h5'
		# load model from file
		model = load_model(filename)
		# add to list of members
		all_models.append(model)
		print('>loaded %s' % filename)
	return all_models

# create stacked model input dataset as outputs from the ensemble
def stacked_dataset(members, inputX_M,inputX_G):
    stackX = None
    for i in range(len(members)):
        model=members[i]
        if i<7:
            # make prediction
            yhat = model.predict(inputX_M, verbose=0)
            # stack predictions into [rows, members, probabilities]
            if stackX is None:
                stackX = yhat
            else:
                stackX = dstack((stackX, yhat))
        else:
            # make prediction
            yhat = model.predict(inputX_G, verbose=0)
            # stack predictions into [rows, members, probabilities]
            if stackX is None:
                stackX = yhat
            else:
                stackX = dstack((stackX, yhat))            
                
    # flatten predictions to [rows, members x probabilities]
    stackX = stackX.reshape((stackX.shape[0], stackX.shape[1]*stackX.shape[2]))
    return stackX

# fit a model based on the outputs from the ensemble members
def fit_stacked_model(members, inputX, inputy):
    # create dataset using ensemble
    inputX_M=inputX[:, :, :, :4]
    inputX_G=inputX[:, :, :, 4:]
    stackedX = stacked_dataset(members, inputX_M,inputX_G)
    # fit standalone model
    model = LogisticRegression(max_iter=6500)
    model.fit(stackedX, inputy)
    return model

# make a prediction with the stacked model
def stacked_prediction(members, model, inputX):
    # create dataset using ensemble
    inputX_M=inputX[:, :, :, :4]
    inputX_G=inputX[:, :, :, 4:]
    stackedX = stacked_dataset(members, inputX_M,inputX_G)
    # make a prediction
    yhat = model.predict(stackedX)
    return yhat


start_time = time.clock()
np.random.seed(7)
random.seed(7)

filename = 'Combined Trajectory_Label_Geolife/Revised_KerasData_Smoothing_GIS_new_8.pickle'

with open(filename, mode='rb') as f:
    TotalInput, FinalLabel = pickle.load(f, encoding='latin1')  # Also can use the encoding 'iso-8859-1'

    
filename = 'Data/Revised_KerasData_Smoothing_8_60_final.pickle'

with open(filename, mode='rb') as f:
    TotalInput_UN, FinalStage_UN = pickle.load(f, encoding='latin1')  # Also can use the encoding 'iso-8859-1'
    
    
NoClass = len(list(set(np.ndarray.flatten(FinalLabel))))
Threshold = len(TotalInput[0, 0, :, 0])

# Making training and test data: 80% Training, 20% Test
Train_X, Test_X, Train_Y, Test_Y_ori = train_test_split(TotalInput, FinalLabel, test_size=0.20, random_state=7)

Test_M_X=Test_X[:, :, :, :4]
Test_G_X=Test_X[:, :, :, 4:]


# load all models
n_members = 14
members = load_all_models(n_members)
print('Loaded %d models' % len(members))
# evaluate standalone models on test dataset
for i in range(len(members)):
    model=members[i]
    if i <7:
        testy_enc = np_utils.to_categorical(Test_Y_ori, num_classes=NoClass)
        _, acc = model.evaluate(Test_M_X, testy_enc, verbose=0)
        print('Model Accuracy: %.3f' % acc)
    else:
        testy_enc = np_utils.to_categorical(Test_Y_ori, num_classes=NoClass)
        _, acc = model.evaluate(Test_G_X, testy_enc, verbose=0)
        print('Model Accuracy: %.3f' % acc)
# fit stacked model using the ensemble
model = fit_stacked_model(members, Test_X, Test_Y_ori)
# evaluate model on test set
yhat = stacked_prediction(members, model, Test_X)
acc = accuracy_score(Test_Y_ori, yhat)
print('Stacked Test Accuracy: %.3f' % acc)



>loaded models/model_CNN_1.h5
>loaded models/model_CNN_2.h5
>loaded models/model_CNN_3.h5
>loaded models/model_CNN_4.h5
>loaded models/model_CNN_5.h5
>loaded models/model_CNN_6.h5
>loaded models/model_CNN_7.h5
>loaded models/model_CNN_8.h5
>loaded models/model_CNN_9.h5
>loaded models/model_CNN_10.h5
>loaded models/model_CNN_11.h5
>loaded models/model_CNN_12.h5
>loaded models/model_CNN_13.h5
>loaded models/model_CNN_14.h5
Loaded 14 models
Model Accuracy: 0.801
Model Accuracy: 0.801
Model Accuracy: 0.804
Model Accuracy: 0.815
Model Accuracy: 0.814
Model Accuracy: 0.815
Model Accuracy: 0.815
Model Accuracy: 0.568
Model Accuracy: 0.613
Model Accuracy: 0.630
Model Accuracy: 0.668
Model Accuracy: 0.623
Model Accuracy: 0.673
Model Accuracy: 0.679
Stacked Test Accuracy: 0.836


In [4]:
#loading segments for classification 
filename = 'Data/Revised_KerasData_Smoothing_8_60_final_segmentID.pickle'

with open(filename, mode='rb') as f:
    TotalInput_UN, FinalStage_UN = pickle.load(f, encoding='latin1')  # Also can use the encoding 'iso-8859-1'

In [6]:
#selecting validation sample
Train_X, Test_X, Train_Y, Test_Y_ori = train_test_split(TotalInput_UN, FinalStage_UN, test_size=0.2, random_state=7)

In [7]:
%%time
#classifiaction
yhat = stacked_prediction(members, model, Test_X)

CPU times: user 18.7 s, sys: 565 ms, total: 19.2 s
Wall time: 6.58 s


In [9]:
#validation sample size
len(yhat)

1488

In [10]:
#loading segments GPS points for validation and plotting
filename = 'Data/Proximity+200_Unlabeled_60_final.pickle'
with open(filename, 'rb') as f:
    Bus_All_Segment,SegmentID,Rail_All_Segment, Traffic_All_Segment, Stage, Data_All_Segment, SegmentNumber = pickle.load(f, encoding='latin1')

In [11]:
%%time
# assigning segments ID to predicted modes
import pandas as pd
segments_index=[i for i,x in enumerate(SegmentID) if x in Test_Y_ori]

df_pred = pd.DataFrame(columns = ['lat','lon','timestamp','segmentID','stageID','pred_mode'])
for i in range(len(segments_index)):
    df = pd.DataFrame(Data_All_Segment[segments_index[i]])
    df=df.rename(columns={0:'lat',1:'lon',2:'timestamp'})
    df['segmentID']=Test_Y_ori[i]
    df['stageID']=Stage[segments_index[i]]
    df['pred_mode']=yhat[i]   
    df_pred=pd.concat([df_pred,df]).reset_index(drop=True)

In [13]:
df_pred

Unnamed: 0,lat,lon,timestamp,segmentID,stageID,pred_mode
0,51.506708,-0.106041,44333.643808,1332,6.0,4
1,51.506749,-0.105608,44333.644028,1332,6.0,4
2,51.506749,-0.105608,44333.644248,1332,6.0,4
3,51.506737,-0.105656,44333.644294,1332,6.0,4
4,51.506721,-0.105683,44333.644329,1332,6.0,4
...,...,...,...,...,...,...
74808,51.586065,-0.061815,44338.424688,5737,16078.0,3
74809,51.586073,-0.061813,44338.424711,5737,16078.0,3
74810,51.586115,-0.061833,44338.424734,5737,16078.0,3
74811,51.586109,-0.061843,44338.424757,5737,16078.0,3


In [139]:
#saving data frame with predicted modes
#df_pred.to_pickle('Data/segments_60_1_percent_predictions.pkl')
df_pred = pd.read_pickle('Data/segments_60_1_percent_predictions.pkl')

### Validation process

In [276]:
# selecting segments with modes 0,1,2,3 or 4 (walk,bike,bus,driving or train)
df_pred[df_pred.pred_mode==1].segmentID.unique()

array([1315, 4840, 5911, 1017, 6827, 4430, 164, 4084, 7326, 6009, 3770,
       1961, 1805, 920, 2128, 6056, 2044, 803, 5003, 5740, 4117, 3690,
       315, 1057, 6028, 5461, 5481, 2984, 2714, 7100], dtype=object)

In [339]:
#selecing a segment for validation
import skmob
import folium

df_pred[df_pred.segmentID==5740]

In [341]:
#plotting selected segment for validation

data= skmob.TrajDataFrame(
df_pred[df_pred.segmentID==5740],
latitude= "lat",
longitude="lon",
datetime='timestamp',
parameters= {1:'segmentID'})

#plotting
data.plot_trajectory(zoom=12, weight=10, opacity=0.9, tiles='OpenStreetMap'
                     ,start_end_markers=True,dashArray='5, 5')