In [14]:
import numpy as np
import os as os
import re as re
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

######## SU's code for sampling #############

# read directory of files and return a list of all driverIDs from csv's insid directory
def get_drivers(dirpath):
    """
    :param dirpath: string, path to directory containing driver csv's
    :return: list, contains all driverIDs as strings
    """
    try:
        allfiles = os.listdir(dirpath)
        drivers = [re.sub(r'[^0-9]', '', i) for i in allfiles]
        drivers.remove('')
        return drivers
    except Exception as e:
        print e

# produces random samples of driverIDs and tripIDs in two separate lists
def random_samples(targ_driv, driv_list, K=200):
    """
    :param targ_driv: str, driverID we want to make false trips for
    :param driv_list: list, list of all drivers, produced by get_drivers()
    :param K: number of trips we want to make for targ_driv
    :return: tuple of lists, first list is random driverIDs, second list is list of tripIDs, both are strings
    """
    try:
        driv_list.remove(targ_driv) #removes the target driver from list of drivers to sample from
        drivers = np.random.choice(driv_list, K, True)
        trips = np.random.choice(np.arange(1,K+1).astype(str), K, True)
        return (drivers, trips)
    except Exception as e:
        print e

# reads directory of files and returns RDD of observations from trips in the sample (driverID, tripID combo)
# NOTE: this function is VERY SLOW, it is what slows the entire workflow down
def sample_data(path, driverIDs, tripIDs):
    """
    :param path: string, path to directory containing driver.csv's
    :param driverIDs: list, list of randomly sampled driverIDs as strings, produced by random_sample()
    :param tripIDs: list, list of randomly sampled tripIDs as strings, produced by random_samples()
        NOTE: the above two zip into a list of (driverID, tripID) tuples, with each tuple being a single item in the
        sample
    :return: RDD, contains only observations from the sample
    """
    try:
        combos = zip(driverIDs, tripIDs)
        samplefiles = [path + '/' + 'driver_' + i + '.csv' for i in driverIDs]
        samplefiles = ','.join(set(samplefiles))  #### NOTE: this set() action is a hack for small num. files
        RDD = sc.textFile(samplefiles)   #### NOTE: with large num. files, might need to set num. partitions
        RDDsplit = RDD.map(lambda x: x.split(','))
        RDDsamples = RDDsplit.filter(lambda x: (x[2],x[3]) in combos)
        RDDsamples.cache()
        return RDDsamples
    except Exception as e:
        print e

# takes RDD of samples and assigns new driverID and tripID to observations in a new RDD
def ID_Data(targ_driver, RDD, K = 200):
    """
    :param targ_driver: string, target driver we used to generate samples
    :param RDD: RDD, trip data RDD produced by sample_data(), format will be original form (x,y,driverID,tripID,step)
    :param K: int, number of trips we sampled
    :return: RDD, in original format, but with driverID and tripID changed to look like new observations of the target
    driver
    """
    try:
        newID1 = [targ_driver] * K
        newID2 = np.arange(200, 201+K).astype(str)
        newID = zip(newID1, newID2)
        oldID = RDD.map(lambda x: (x[2],x[3])).distinct().collect()
        glossary = sc.parallelize(zip(oldID, newID))
        newRDD = RDD.map(lambda x: ((x[2],x[3]), ([x[0],x[1],x[4]]))).join(glossary)
        newID_RDD = newRDD.map(lambda x: (x[1][0][0], x[1][0][1], x[1][1][0], x[1][1][1], x[1][0][2]))
        return newID_RDD
    except Exception as e:
        print e


# takes RDD in original form and converts it into key-value tuple with values being x,y,step,label
def processRDD(RDD, label):
    """
    :param RDD: RDD in original format (x,y,driverID,tripID,step)
    :param label: category of observation, 1 for positive, 0 for negative
    # note, not sure if it needs to be int or float
    :return: RDD, RDD returned in new key/value format: (driverID, tripID), (x, y, step, label)
    # note, x, y, step, and label will be floats
    """
    try:
        newRDD = RDD.map(lambda x: ((x[2],x[3]),(float(x[0]),float(x[1]),float(x[4]),label)))
        return newRDD
    except Exception as e:
        print e

# takes a driver to target, path to directory of driver.csv's, and returns an RDD labeled with
# (driverID, tripID),(x,y,step,label), where a label 1 is from an actual trip, and label 0 is from
# a trip randomly sampled from other drivers
def labelRDDs(targ_driv, path, K=200):
    """
    :param targ_driv: string, driver we want to create positive and negative labeled data for
    :param path: string, path to directory where driver.csvs are stored
    :param K: int, number of negative (manufactured) trips to sample
    :return: RDD with key, value tuple where key is (driverID, tripID) and value is (x,y,step,label)
    """
    try:
        full_path = path + '/' + 'driver_' + targ_driv + '.csv'
        #print full_path
        target = sc.textFile(path + '/' + 'driver_' + targ_driv + '.csv') #load target driver's data
        target2 = target.map(lambda x: x.split(',')) #convert from string to list of strings
        positives = processRDD(target2, 1.0) #label target driver's RDD
        driv_lis = get_drivers(path) #get python list of all possible drivers to sample from
        #print driv_lis
        sampdriv, samptrip = random_samples(targ_driv, driv_lis, K) #generate random samples of drivers and tripIDs
        samples = sample_data(path, sampdriv, samptrip) #generate RDD of random samples
        #print "GETS HERE"
        samplesRDD = ID_Data(targ_driv, samples, K) #relabel samples to look like target driver's trips
        #print "GETS HERE TOO"
        negatives = processRDD(samplesRDD, 0.0) #label samples
        finalRDD = positives.union(negatives).cache() #join target driver and samples together
        return finalRDD
    except Exception as e:
        print e

In [15]:

path = '/Users/mayankkedia/code/kaggle/axa_telematics/sample_drivers'
driver = '1'
driver_1_RDD = labelRDDs('1', path)

In [16]:
import numpy as np
import math
from pyspark.mllib.regression import LabeledPoint

"""
Consists of functions to help us build features on RDDs.

"""


def vectorRDD(RDD):
    """
    :param RDD: RDD, created from labelRDDs in 'sampling.py'
    :return: RDD with driver_id, trip_id, vectorized x and y coordinates,
    step number, and label
    """

    # The RDD created from labelRDDs is first mapped into the key, value pair
    # ((driver_id, trip_id), (x, y, step)). Each element in the value is a
    # list of one element. This is so we can create a list of each x, y, and
    # step coordinate when reducing. We then reduce by key creating a list of
    # x and y coordinates, and a list of trip step number. This will create an
    # RDD of the form ((driver_id, trip_id), ([x coordinates],
    # [y coordinates], [trip steps])). Finally, we map this into the key, value
    # pair ((driver_id, trip_id), ([x cooridinates], [y coordinates],
    # [trip steps], label)). This RDD is the returned.

    vectorRDD = RDD.map(lambda x: (x[0], ([x[1][0]], [x[1][1]],
        [x[1][2]])))\
    .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2]))\
    .map(lambda x: (x[0], (x[1][0], x[1][1], x[1][2], 1)) if int(x[0][1])\
        < 201 else (x[0], (x[1][0], x[1][1], x[1][2], 0)))
    return vectorRDD


def get_polars(RDD):
    """
    :param RDD: RDD, created from vectorRDD
    "return: RDD, same as vectorRDD but also with polar coordinates
    """

    # The RDD created from vectorRDD is mapped into the key, value pair
    # ((driver_id, trip_id), ([x coordinates], [y coordinates],
    # [r coordinates], [theta coordinates], [step numbers], label)). Here,
    # the r coordinate is the radial coordinate from the origin, computed using
    # Euclidean distance, i.e. the first inner map statement. The theta
    # coordinate is the angular polar coordinate, computed as the arctan of
    #  y / x, i.e. the second inner map statement.

    polars = RDD.map(lambda x: (x[0], (x[1][0], x[1][1],
        map(lambda x: (x[0] ** 2 + x[1] ** 2) ** 0.5, zip(x[1][0], x[1][1])),
        map(lambda x: math.atan2(x[1], x[0]), zip(x[1][0], x[1][1])),
         x[1][2], x[1][3])))
    return polars


def step_level_features(polarRDD):
    """
    :param RDD: RDD, created from get_polars
    :return: RDD with speed and acceleration at each stage of the trip
    """

    # First, the RDD created from get_polars is mapped to the key, value pair
    # ((driver_id, trip_id), ([x coordinates], [y coordinates],
    # [r coordinates], [theta coordinates], [v coordinates], [step numbers],
    # label)). v is the current speed (m/s), which is computed from the
    # euclidean, distance between the current point and the previous point,
    # i.e. the two innermost maps square the individual coordinate differences.
    # These are then added and square rooted for speed. The second step maps
    # this key, value pair into the key, value pair ((driver_id, trip_id),
    # ([x coordinates], [y coordinates], [r coordinates], [theta coordinates],
    # [v coordinates], [a coordinates], label)). a is the current acceleration,
    # which is computed as the difference between the current speed and the
    # previous speed

    step_lv = polarRDD.map(lambda x: (x[0], (x[1][0], x[1][1], x[1][2],
        x[1][3], map(lambda x: (x[0] + x[1]) ** 0.5,
            zip(map(lambda x: (x[0] - x[1]) ** 2,
            zip(x[1][0], [0.0] + x[1][0][:-1])),
            map(lambda x: (x[0] - x[1]) ** 2,
                zip(x[1][1], [0.0] + x[1][1][:-1])))), x[1][4], x[1][5])))\
    .map(lambda x: (x[0], (x[1][0], x[1][1], x[1][2], x[1][3], x[1][4],
        map(lambda x: x[0] - x[1],
            zip(x[1][4], [0.0] + x[1][4][:-1])), x[1][5], x[1][6])))

    return step_lv


def get_percentiles(vector):
    """
    Generates the percentiles for 5, 10, 15 ... 95 for a given vector
    """
    return np.percentile(vector, range(5, 100, 5))


def trip_features(x):
    """
    Calculates the features of the trip from a row which is of the form
    ((driver_id, trip_id), ([x coordinates], [y coordinates],
    [r coordinates], [theta coordinates], [v coordinates], [step numbers], label))
    This is the form of the rows of the output from step_level_features.

    :@param x:
    """
    theta = x[1][3]
    v = x[1][4]
    a = x[1][5]
    a_pos = a[a>=0]
    a_neg = a[a<0]
    min_v = min(v)
    max_v = max(v)
    min_a = min(a)
    max_a = max(a)
    trip_length = len(x[1][0])
    mean_v = np.mean(v)
    std_v = np.std(v)
    mean_a = np.mean(a)
    std_a = np.std(a)
    mean_pos_a = np.mean(a_pos)
    mean_neg_a = np.mean(a_neg)
    std_pos_a = np.mean(a_pos)
    std_neg_a = np.mean(a_neg)
    time_stop = sum([elem < 0.5 for elem in x[1][4]])
    label = x[1][7]

    numerical_features = (min_v, max_v,
                   min_a, max_a,
                   trip_length,
                   mean_v, std_v,
                   mean_a, std_a,
                   mean_pos_a, std_pos_a,
                   mean_neg_a, std_neg_a,
                   time_stop,
                   label)

    v_percentiles = get_percentiles(v)
    a_percentiles = get_percentiles(a)
    a_pos_percentiles = get_percentiles(a_pos)
    a_neg_percentiles = get_percentiles(a_neg)
    percentiles = np.append(v_percentiles, a_percentiles)
    percentiles = np.append(percentiles, a_pos_percentiles)
    percentiles = np.append(percentiles, a_neg_percentiles)
    second_tuple = np.append(percentiles, numerical_features).tolist()

    return x[0], second_tuple


def trip_level_features(RDD):
    """
    :param RDD: RDD, created from step_level_features
    :return: RDD with features, aggregated over the trip
    """

    # The RDD created from step_level_features is mapped to the key, value
    # pair ((driver_id, trip_id), (min(v), max(v), min(a), max(a), trip
    # length, trip distance, mean(v), stddev(v), mean(a), stddev(a),
    # length of time stopped, label). Trip length (s) is the number of
    # coordinates. Trip distance (m) is computed as the sum of the speeds as
    # they are computed per second. Length of time stopped is computed as the
    # number of seconds where the speed is less than 0.5 m/s.

    trip_lv = RDD.map(trip_features)

    return trip_lv

def create_labelled_vectors(x):
     vector = list(x[1])
     l = len(vector) -1
     label = float(vector.pop(l))
     return LabeledPoint(label, vector)

In [17]:
polar_RDD = get_polars(vectorRDD(driver_1_RDD))

In [18]:
row = polar_RDD.take(1)[0]

In [20]:
x = row[1][0]
y = row[1][1]

In [25]:
x2 = [(x_new - x_old)**2 for x_new, x_old in zip(x, [0.0] + x[:-1])]
y2 = [(y_new - y_old)**2 for y_new, y_old in zip(y, [0.0] + y[:-1])]



In [28]:
(row[0], (x,y))


(('1', '210'),
 ([0.0,
   5.3,
   11.1,
   17.2,
   23.5,
   30.6,
   38.0,
   45.7,
   53.7,
   61.8,
   69.4,
   76.2,
   82.3,
   87.9,
   92.6,
   96.5,
   100.1,
   103.1,
   106.7,
   111.1,
   116.3,
   122.5,
   129.7,
   137.2,
   145.1,
   152.5,
   159.4,
   165.4,
   171.1,
   176.4,
   181.4,
   186.0,
   190.3,
   193.1,
   194.4,
   195.1,
   195.2,
   195.2,
   195.6,
   195.7,
   195.8,
   195.9,
   196.0,
   196.1,
   196.1,
   196.1,
   196.1,
   196.1,
   196.2,
   196.2,
   196.2,
   196.1,
   196.0,
   196.0,
   195.9,
   195.7,
   195.6,
   195.4,
   195.4,
   195.3,
   195.3,
   195.2,
   195.2,
   195.0,
   195.0,
   195.0,
   195.0,
   195.0,
   195.6,
   197.6,
   201.1,
   205.6,
   210.6,
   216.6,
   223.4,
   230.8,
   239.1,
   248.2,
   257.5,
   267.3,
   277.0,
   286.8,
   296.4,
   306.2,
   315.6,
   324.8,
   333.5,
   341.7,
   349.6,
   357.2,
   364.2,
   371.0,
   377.8,
   384.4,
   391.2,
   397.5,
   403.8,
   410.2,
   416.6,
   423.1,
   

In [5]:
feature_RDD = step_level_features(get_polars(vectorRDD(driver_1_RDD)))
trip_features = trip_level_features(feature_RDD)

In [7]:
a = feature_RDD.take(1)

In [10]:
######## HARRY's code for basic feature generation ##########

def vectorRDD(RDD):
    vectorRDD = RDD.map(lambda x: (x[0], ([x[1][0]], [x[1][1]], x[1][2], (x[1][3], 1))))\
                    .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))\
                    .map(lambda x: (x[0], (x[1][0], x[1][1], 1)) if int(x[0][1])\
                           < 201 else (x[0], (x[1][0], x[1][1], 0)))
    return vectorRDD

VectorRDD = vectorRDD(driver_1_RDD).map(lambda x: (x[0], (zip(x[1][0], [0.0] + x[1][0][:-1]), 
                                zip(x[1][1], [0.0] + x[1][1][:-1]))))\
         .map(lambda x: (x[0], zip(map(lambda x: (x[0] - x[1]) ** 2, x[1][0]), 
                                   map(lambda x: (x[0] - x[1]) ** 2, x[1][1]))))\
         .map(lambda x: (x[0], map(lambda x: (x[0] + x[1]) ** 0.5, x[1])))\
         .map(lambda x: (x[0], zip(x[1], [0.0] + x[1][:-1])))\
         .map(lambda x: (x[0], map(lambda x: ([x[0]], [x[0] - x[1]]), x[1])))\
         .map(lambda x: (x[0], reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]), x[1])))\
         .map(lambda x: (x[0], (min(x[1][0]), max(x[1][0]), min(x[1][1]), max(x[1][1]))))
# computes previous points of each x, y coordinate
# computes distance in each x, y direction squared from previous point
# computes euclidean distance from previous point (also speed as in m/s)
# computes previous speed
# computes speed difference (acceleration)
# IF REDUCING A LIST OF TUPLES, MUST USE BOTH TUPLE ELEMENTS!!!



In [18]:
####### MODELING using Gradient boosted trees with only a very minimal set of features ##########

def create_labelled_vectors(x):
    label = 1.0
    if int(x[0][1]) > 200:
        label = 0.0
    return LabeledPoint(label, x[1])
total_data = VectorRDD.map(create_labelled_vectors)
(trainingData, testData) = total_data.randomSplit([0.7, 0.3])
model = GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo={}, numIterations=3)

In [25]:
predictions = model.predict(testData.map(lambda x:x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v,p): v!=p).count()/float(testData.count())
print 'Test Error = {}'.format(testErr)


Test Error = 0.282442748092


In [89]:
###### Code to create spark data frame from Su's sampled RDD #############

def combine_tuples(row):
    data = row[1]
    x = []
    y = []
    steps = []
    label = None
    for d in data:
        x.append(d[0])
        y.append(d[1])
        steps.append(d[2])
        label = d[3]
    return Row(label=label,signature={"x":x, "y":y, "steps":steps})
path = '/Users/mayankkedia/code/kaggle/axa_telematics/sample_drivers'
driver = '1'
driver_1_RDD = labelRDDs('1', path).groupByKey().map(lambda x: combine_tuples(x))
driver_1_df = sqlContext.createDataFrame(driver_1_RDD)

In [92]:
driver_1_df.take(1)

[Row(label=0.0, signature={u'y': [0.0, 5.6, 10.6, 15.2, 23.4, 32.4, 43.9, 55.4, 66.9, 79.8, 93.2, 106.0, 119.2, 133.3, 148.8, 162.7, 177.0, 192.7, 209.2, 225.6, 241.3, 260.1, 279.4, 296.1, 311.5, 329.5, 347.7, 362.5, 374.9, 388.3, 403.4, 421.7, 437.6, 456.4, 468.8, 482.9, 486.7, 495.8, 506.5, 515.8, 526.8, 540.1, 554.5, 566.8, 578.3, 590.9, 602.2, 614.5, 626.8, 638.8, 650.8, 663.2, 674.6, 685.2, 700.5, 714.4, 727.9, 741.6, 756.1, 770.1, 784.1, 798.0, 811.5, 825.2, 838.1, 851.6, 862.9, 874.0, 887.0, 899.2, 912.1, 924.6, 937.7, 950.8, 965.4, 979.6, 993.6, 1006.3, 1016.1, 1025.2, 1030.3, 1034.2, 1039.6, 1045.7, 1052.6, 1060.0, 1067.5, 1075.9, 1084.8, 1092.9, 1099.8, 1105.5, 1109.4, 1111.8, 1113.4, 1114.6, 1117.9, 1122.6, 1128.9, 1135.6, 1141.7, 1146.5, 1149.9, 1152.5, 1153.5, 1153.8, 1153.8, 1154.0, 1154.0, 1154.1, 1154.3, 1154.3, 1154.5, 1154.8, 1154.7, 1154.6, 1154.7, 1154.9, 1154.8, 1155.0, 1154.7, 1154.7, 1154.4, 1154.4, 1154.4, 1154.3, 1154.3, 1154.3, 1154.3, 1154.0, 1154.0, 1154.0, 