In [14]:
import pandas as pd
from ast import literal_eval
import gmplot
from fastdtw import fastdtw
from math import radians, cos, sin, asin, sqrt
from scipy.spatial.distance import euclidean
import numpy as np
from collections import Counter 
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from time import time

def haversine(A, B):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    A, B : indexable
        Points whose element 0 is the longitude and element 1 is the
        latitude, both in decimal degrees.

    Returns
    -------
    float
        Distance in kilometers (earth radius taken as 6371 km).
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [A[0], A[1], B[0], B[1]])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can leave `a` a hair above 1.0 for (near-)antipodal
    # points, which would make asin() raise "math domain error"; clamp it.
    a = min(1.0, a)
    c = 2 * asin(sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r


class Knn:
    """k-nearest-neighbour classifier driven by a pairwise sample distance.

    By default the distance between two samples is the cost returned by
    ``fastdtw`` with ``haversine`` as the point-to-point metric.

    Parameters
    ----------
    k : int, default 5
        Number of nearest training samples that vote on a prediction.
    distance : callable or None, default None
        Optional ``distance(a, b) -> number`` used instead of the default
        FastDTW/haversine cost.
    """

    def __init__(self, k = 5, distance = None):
        self.k = k
        self.distance = distance

    def fit(self, X, y):
        """Store the training samples ``X`` and their labels ``y``.

        Both are indexed by position (``X[i]`` is labelled ``y[i]``).
        Returns ``self`` so calls can be chained.
        """
        self.X_train = X
        self.y_train = y
        return self

    def _distance(self, a, b):
        """Return the distance between samples ``a`` and ``b``."""
        if self.distance is not None:
            return self.distance(a, b)
        # fastdtw returns (cost, warping path); only the cost is needed.
        cost, _path = fastdtw(a, b, dist=haversine)
        return cost

    def find_Knn(self, item):
        """Return the majority label among the ``k`` training samples nearest to ``item``.

        Only training samples with a finite distance are considered, so when
        fewer than ``k`` of them exist the vote is taken over those that do
        (no padding entries take part in the vote).  Distance ties are broken
        towards the lower training index and vote ties towards the label of
        the nearest neighbour.

        Raises
        ------
        ValueError
            If no neighbour is available (empty training set, ``k < 1`` or
            no finite distance).
        """
        infinity = float('inf')
        candidates = []
        for i in range(len(self.X_train)):
            cost = self._distance(item, self.X_train[i])
            # `cost < inf` is False for both inf and NaN, so such samples are skipped.
            if cost < infinity:
                candidates.append((cost, i))
        # Sorting (cost, index) pairs puts the nearest samples first.
        candidates.sort()
        labels = [self.y_train[i] for _cost, i in candidates[:self.k]]
        if not labels:
            raise ValueError("no neighbours available: empty training set, "
                             "k < 1, or no finite distance")
        votes = Counter(labels)
        best = max(votes.values())
        # `labels` is ordered nearest-first, so the first label reaching the
        # top vote count wins a tie deterministically.
        for label in labels:
            if votes[label] == best:
                return label

    def predict(self, X_test):
        """Return a list with the predicted label of every sample in ``X_test``."""
        return [self.find_Knn(item) for item in X_test]

In [15]:
# The "Trajectory" column is stored as text; literal_eval turns each cell back
# into the Python literal it represents while the CSV is being read.
train_set = pd.read_csv(
    'datasets/train_set.csv',
    converters={"Trajectory": literal_eval},
)
# The test file is tab-separated.
test_set = pd.read_csv(
    'datasets/test_set_a1.csv',
    sep='\t',
    converters={"Trajectory": literal_eval},
)

In [16]:
# Turn every trajectory into a 2-D array and keep only columns 1 and 2 of each
# point (presumably longitude and latitude, which is the order haversine reads
# them in -- confirm against the dataset layout).
Trainnp = [np.asarray(points)[:, [1, 2]] for points in train_set["Trajectory"]]

# Training labels, in the same order as Trainnp.
Ids = list(train_set['journeyPatternId'])

Testnp = [np.asarray(points)[:, [1, 2]] for points in test_set["Trajectory"]]

In [17]:
# Predict the Test_set using Knn with k = 5 (default)
clf = Knn()
t1 = time()
clf.fit(Trainnp, Ids)
y_pred = clf.predict(Testnp)
# A single pre-formatted argument prints the same text under Python 2 and
# also runs under Python 3 (where the print statement is a syntax error).
print("Finished the prediction in : %s  s." % (time() - t1))

Finished the prediction in : 123.740674973  s.


In [18]:
#Create the testSet_JourneyPatternIDs.csv from the predicted data
# Number the test trips 0..len(y_pred)-1 in prediction order.  A dedicated name
# is used so the training labels held in `Ids` are not overwritten; otherwise
# re-running the prediction cell after this one would fit on the wrong labels.
trip_ids = list(range(len(y_pred)))
raw_data = {
    'Test_Trip_ID': trip_ids,
    'Predicted_JourneyPatternID': y_pred
    }
# `columns` fixes the column order of the output file.
df = pd.DataFrame(raw_data, columns = ['Test_Trip_ID','Predicted_JourneyPatternID'])
df.to_csv('testSet_JourneyPatternIDs.csv',sep='\t',index = False)

In [19]:
#10-fold cross-validation

#Reduce the size of the train_set to 1/10 of the original
# Floor division keeps the slice bound an integer on Python 3 as well.
tmp = train_set[0:len(train_set) // 10]
# Same per-trajectory conversion as for the full sets: keep columns 1 and 2.
Tmpnp = [np.asarray(points)[:, [1, 2]] for points in tmp["Trajectory"]]
Idstmp = list(tmp['journeyPatternId'])

#Split the train set by preserving the percentage of samples for each class.
n_folds = 10
# Build a 1-D object array with one trajectory per element.  np.asarray on a
# list of different-length trajectories is rejected by recent NumPy, and would
# silently become a 3-D array if every trajectory had the same length.
X = np.empty(len(Tmpnp), dtype=object)
for pos, trajectory in enumerate(Tmpnp):
    X[pos] = trajectory
y = np.asarray(Idstmp)
folds = StratifiedKFold(n_splits = n_folds)
# NOTE: despite its name, `scores` collects the predictions of every fold.
scores = list()
clf = Knn()
i = 0
total_accuracy = 0
t1 = time()
for train,test in folds.split(X, y):
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
    clf.fit(X_train,y_train)
    #Predict the test split
    y_pred = clf.predict(X_test)

    #Get scores for this fold (computed once and reused)
    scores.append(y_pred)
    fold_accuracy = accuracy_score(y_test, y_pred)
    total_accuracy += fold_accuracy
    # Single pre-formatted arguments print the same text under Python 2 and
    # also run under Python 3.
    print("Fold %d :  Accuracy: %s" % (i, fold_accuracy))
    i += 1
print("\nFinished 10 fold cross validation for Knn")
print("Average Accuracy:  %s" % (total_accuracy/n_folds))
print("Elapsed time : %s  s." % (time() - t1))


Fold 0 :  Accuracy: 0.542253521127
Fold 1 :  Accuracy: 0.776595744681
Fold 2 :  Accuracy: 0.878048780488
Fold 3 :  Accuracy: 0.942857142857
Fold 4 :  Accuracy: 0.983870967742
Fold 5 :  Accuracy: 0.980392156863
Fold 6 :  Accuracy: 0.976744186047
Fold 7 :  Accuracy: 1.0
Fold 8 :  Accuracy: 0.972222222222
Fold 9 :  Accuracy: 1.0

Finished 10 fold cross validation for Knn
Average Accuracy:  0.905298472203
Elapsed time : 1715.05784202  s.
