In [1]:
import pandas as pd
from ast import literal_eval
import gmplot
from fastdtw import fastdtw
from math import radians, cos, sin, asin, sqrt
from scipy.spatial.distance import euclidean
import numpy as np
from collections import Counter
import matplotlib

def haversine(A, B):
    """
    Great-circle distance, in kilometers, between the points A and B.

    Each point is an indexable pair read as (longitude, latitude),
    both expressed in decimal degrees.
    """
    # pull the four coordinates out of the points and switch to radians
    lon_a, lat_a, lon_b, lat_b = [radians(v) for v in (A[0], A[1], B[0], B[1])]

    # haversine formula
    delta_lon = lon_b - lon_a
    delta_lat = lat_b - lat_a
    half_chord_sq = sin(delta_lat/2)**2 + cos(lat_a) * cos(lat_b) * sin(delta_lon/2)**2
    central_angle = 2 * asin(sqrt(half_chord_sq))

    earth_radius = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return central_angle * earth_radius


class Knn:
    """
    k-nearest-neighbour classifier for trajectories.

    The distance between a query and each stored training item is computed
    with fastdtw, using haversine as the point-to-point metric. The predicted
    label is the most frequent label among the k closest training items.
    """
    def __init__(self, k = 5):
        # k: number of neighbours that take part in the vote
        self.k = k

    def fit(self, X, y):
        """
        Remember the training data; nothing is computed here.

        X -- iterable of training items, each handed unchanged to fastdtw
        y -- labels, indexable by integer position and aligned with X
        """
        self.X_train = X
        self.y_train = y
        return

    def find_Knn(self, item):
        """
        Return the majority label among the k training items nearest to item.

        When fewer than k training items are stored, all of them vote.
        A tie in the vote goes to the tied label whose closest member is
        nearest to item, so the result does not depend on partition order.

        Raises ValueError if k is smaller than 1 or the training set is empty.
        """
        if self.k < 1:
            raise ValueError("k must be at least 1")

        # fastdtw returns (distance, warping path); only the distance is needed
        distances = [fastdtw(item, d, dist=haversine)[0] for d in self.X_train]
        if not distances:
            raise ValueError("cannot classify: the training set is empty")

        # never ask for more neighbours than there are training items
        k = min(self.k, len(distances))

        # stable sort: neighbours come nearest first, and items at the same
        # distance keep their training-set order
        nearest = np.argsort(distances, kind='mergesort')[:k]
        labels = [self.y_train[i] for i in nearest]

        votes = Counter(labels)
        top_count = votes.most_common(1)[0][1]
        # labels is ordered nearest first, so the first label reaching the
        # top count is the tie-break winner
        for label in labels:
            if votes[label] == top_count:
                return label

    def predict(self, X_test):
        """Return a list with the predicted label of every item in X_test, in order."""
        return [self.find_Knn(item) for item in X_test]

In [2]:
# Load the training data; every "Trajectory" cell is parsed from its text
# form into a Python object by literal_eval.
train_set = pd.read_csv('datasets/train_set.csv', # replace with the correct path
                        converters={"Trajectory": literal_eval})
# Keep only the first seventh of the rows (presumably to keep the DTW-based
# k-NN tractable -- confirm). Floor division keeps the slice bound an integer
# on Python 3 as well; true division would produce a float and fail.
train_set = train_set[0:(len(train_set) // 7)]

In [3]:
# Class labels of the training rows, as a plain list in row order.
Ids = list(train_set['journeyPatternId'])

In [4]:
# Load the test data; "Trajectory" cells are parsed by literal_eval,
# the same way as for the training set.
test_set = pd.read_csv(
    'datasets/test_set_a1.csv',  # replace with the correct path
    converters={"Trajectory": literal_eval},
)

In [None]:
def _lon_lat_points(trajectory):
    """
    Return columns 1 and 2 of a trajectory as a 2-D numpy array.

    Downstream code reads the first kept column as longitude and the second
    as latitude (see haversine). Column 0 is dropped.
    """
    # assumes column 0 is a timestamp -- TODO confirm against the dataset
    return np.asarray(trajectory)[:, [1, 2]]


# One (n_points, 2) array per row, in the same order as the data frames.
Trainnp = [_lon_lat_points(trajectory) for trajectory in train_set["Trajectory"]]
Testnp = [_lon_lat_points(trajectory) for trajectory in test_set["Trajectory"]]

In [None]:
#10-fold cross-validation 
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from time import time
#Split the train set by preserving the percentage of samples for each class.
n_folds = 10 
folds = StratifiedKFold(n_splits = n_folds)
# NOTE: despite its name, scores collects the predicted labels of each fold
scores = list()
clf = Knn()
total_accuracy = 0
# The trajectories may differ in length. Build a 1-D object array with one
# trajectory per element instead of calling np.asarray on the list, which
# recent NumPy versions reject for ragged input. Fancy indexing with the fold
# index arrays works the same way on this array.
X = np.empty(len(Trainnp), dtype=object)
for idx, trajectory in enumerate(Trainnp):
    X[idx] = trajectory
y = np.asarray(Ids)
t1 = time()
for i, (train, test) in enumerate(folds.split(X, y)):
    # single-argument print(...) emits the same text on Python 2 and 3
    print("Fold %d :" % i)
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
    clf.fit(X_train,y_train)
    print("Done fitting the classifier!")
    #Predict the test split
    y_pred = clf.predict(X_test)

    #Keep this fold's predictions and score them once
    scores.append(y_pred)
    fold_accuracy = accuracy_score(y_test, y_pred)
    total_accuracy += fold_accuracy
    print("\tAccuracy: %s" % fold_accuracy)
print("\nFinished %d fold cross validation for Knn" % n_folds)
print("Average Accuracy:  %s" % (total_accuracy/n_folds))
print("Elapsed time : %s  s." % (time()-t1))




Fold 0 :
Done fitting the classifier!
	Accuracy: 0.509615384615
Fold 1 :
Done fitting the classifier!
	Accuracy: 0.755555555556
Fold 2 :
Done fitting the classifier!
	Accuracy: 0.868421052632
Fold 3 :
Done fitting the classifier!
	Accuracy: 0.909090909091
Fold 4 :
Done fitting the classifier!
	Accuracy: 0.965517241379
Fold 5 :
Done fitting the classifier!


In [None]:
# Fit on the whole (sub-sampled) training set and label every test trajectory.
clf = Knn()
clf.fit(Trainnp, Ids)
y_pred = clf.predict(Testnp)
# parenthesised form prints the same text on Python 2 and also runs on Python 3
print(y_pred)

In [None]:
def _split_lon_lat(points):
    """Split a sequence of (longitude, latitude) points into two lists."""
    return [p[0] for p in points], [p[1] for p in points]


# Test trajectory number 4, drawn in blue; the map is centred on its first point.
longitudes, latitudes = _split_lon_lat(Testnp[4])
gmap = gmplot.GoogleMapPlotter(latitudes[0],longitudes[0],16)
gmap.plot(latitudes, longitudes, 'cornflowerblue', edge_width=10)

jrns = train_set['journeyPatternId']
# Find the first training trajectory labelled '01300001' (presumably the label
# predicted for test trajectory 4 -- confirm). Iterating by position starts at
# row 0 and stays aligned with Trainnp, which is in row order.
# NOTE(review): this rebinds the global y that the cross-validation cell uses
# for the label array.
y = None
for position, journey_id in enumerate(jrns):
    if journey_id == '01300001':
        y = Trainnp[position]
        break
if y is None:
    raise ValueError("no training trajectory with journeyPatternId '01300001'")

# Matching training trajectory, drawn in red on the same map.
longitudes, latitudes = _split_lon_lat(y)
gmap.plot(latitudes, longitudes, 'red', edge_width=8)
map_name="matched.html"
gmap.draw(map_name)

In [None]:
fig, ax = plt.subplots(2, 3, sharex='col', sharey='row'