In [22]:
import numpy as np
import os as os
import re as re
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

######## SU's code for sampling #############

# read directory of files and return a list of all driverIDs from csv's insid directory
def get_drivers(dirpath):
    """
    :param dirpath: string, path to directory containing driver csv's
    :return: list, contains all driverIDs as strings
    """
    try:
        allfiles = os.listdir(dirpath)
        drivers = [re.sub(r'[^0-9]', '', i) for i in allfiles]
        drivers.remove('')
        return drivers
    except Exception as e:
        print e

# produces random samples of driverIDs and tripIDs in two separate lists
def random_samples(targ_driv, driv_list, K=200):
    """
    :param targ_driv: str, driverID we want to make false trips for
    :param driv_list: list, list of all drivers, produced by get_drivers()
    :param K: number of trips we want to make for targ_driv
    :return: tuple of lists, first list is random driverIDs, second list is list of tripIDs, both are strings
    """
    try:
        driv_list.remove(targ_driv) #removes the target driver from list of drivers to sample from
        drivers = np.random.choice(driv_list, K, True)
        trips = np.random.choice(np.arange(1,K+1).astype(str), K, True)
        return (drivers, trips)
    except Exception as e:
        print e

# reads directory of files and returns RDD of observations from trips in the sample (driverID, tripID combo)
# NOTE: this function is VERY SLOW, it is what slows the entire workflow down
def sample_data(path, driverIDs, tripIDs):
    """
    :param path: string, path to directory containing driver.csv's
    :param driverIDs: list, list of randomly sampled driverIDs as strings, produced by random_sample()
    :param tripIDs: list, list of randomly sampled tripIDs as strings, produced by random_samples()
        NOTE: the above two zip into a list of (driverID, tripID) tuples, with each tuple being a single item in the
        sample
    :return: RDD, contains only observations from the sample
    """
    try:
        combos = zip(driverIDs, tripIDs)
        samplefiles = [path + '/' + 'driver_' + i + '.csv' for i in driverIDs]
        samplefiles = ','.join(set(samplefiles))  #### NOTE: this set() action is a hack for small num. files
        RDD = sc.textFile(samplefiles)   #### NOTE: with large num. files, might need to set num. partitions
        RDDsplit = RDD.map(lambda x: x.split(','))
        RDDsamples = RDDsplit.filter(lambda x: (x[2],x[3]) in combos)
        RDDsamples.cache()
        return RDDsamples
    except Exception as e:
        print e

# takes RDD of samples and assigns new driverID and tripID to observations in a new RDD
def ID_Data(targ_driver, RDD, K = 200):
    """
    :param targ_driver: string, target driver we used to generate samples
    :param RDD: RDD, trip data RDD produced by sample_data(), format will be original form (x,y,driverID,tripID,step)
    :param K: int, number of trips we sampled
    :return: RDD, in original format, but with driverID and tripID changed to look like new observations of the target
    driver
    """
    try:
        newID1 = [targ_driver] * K
        newID2 = np.arange(200, 201+K).astype(str)
        newID = zip(newID1, newID2)
        oldID = RDD.map(lambda x: (x[2],x[3])).distinct().collect()
        glossary = sc.parallelize(zip(oldID, newID))
        newRDD = RDD.map(lambda x: ((x[2],x[3]), ([x[0],x[1],x[4]]))).join(glossary)
        newID_RDD = newRDD.map(lambda x: (x[1][0][0], x[1][0][1], x[1][1][0], x[1][1][1], x[1][0][2]))
        return newID_RDD
    except Exception as e:
        print e


# takes RDD in original form and converts it into key-value tuple with values being x,y,step,label
def processRDD(RDD, label):
    """
    :param RDD: RDD in original format (x,y,driverID,tripID,step)
    :param label: category of observation, 1 for positive, 0 for negative
    # note, not sure if it needs to be int or float
    :return: RDD, RDD returned in new key/value format: (driverID, tripID), (x, y, step, label)
    # note, x, y, step, and label will be floats
    """
    try:
        newRDD = RDD.map(lambda x: ((x[2],x[3]),(float(x[0]),float(x[1]),float(x[4]),label)))
        return newRDD
    except Exception as e:
        print e

# takes a driver to target, path to directory of driver.csv's, and returns an RDD labeled with
# (driverID, tripID),(x,y,step,label), where a label 1 is from an actual trip, and label 0 is from
# a trip randomly sampled from other drivers
def labelRDDs(targ_driv, path, K=200):
    """
    :param targ_driv: string, driver we want to create positive and negative labeled data for
    :param path: string, path to directory where driver.csvs are stored
    :param K: int, number of negative (manufactured) trips to sample
    :return: RDD with key, value tuple where key is (driverID, tripID) and value is (x,y,step,label)
    """
    try:
        full_path = path + '/' + 'driver_' + targ_driv + '.csv'
        #print full_path
        target = sc.textFile(path + '/' + 'driver_' + targ_driv + '.csv') #load target driver's data
        target2 = target.map(lambda x: x.split(',')) #convert from string to list of strings
        positives = processRDD(target2, 1.0) #label target driver's RDD
        driv_lis = get_drivers(path) #get python list of all possible drivers to sample from
        #print driv_lis
        sampdriv, samptrip = random_samples(targ_driv, driv_lis, K) #generate random samples of drivers and tripIDs
        samples = sample_data(path, sampdriv, samptrip) #generate RDD of random samples
        #print "GETS HERE"
        samplesRDD = ID_Data(targ_driv, samples, K) #relabel samples to look like target driver's trips
        #print "GETS HERE TOO"
        negatives = processRDD(samplesRDD, 0.0) #label samples
        finalRDD = positives.union(negatives).cache() #join target driver and samples together
        return finalRDD
    except Exception as e:
        print e

In [6]:

path = '/Users/mayankkedia/code/kaggle/axa_telematics/sample_drivers'
driver = '1'
driver_1_RDD = labelRDDs('1', path)

In [10]:
######## HARRY's code for basic feature generation ##########

def vectorRDD(RDD):
    vectorRDD = RDD.map(lambda x: (x[0], ([x[1][0]], [x[1][1]], x[1][2], (x[1][3], 1))))\
                    .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))\
                    .map(lambda x: (x[0], (x[1][0], x[1][1], 1)) if int(x[0][1])\
                           < 201 else (x[0], (x[1][0], x[1][1], 0)))
    return vectorRDD

VectorRDD = vectorRDD(driver_1_RDD).map(lambda x: (x[0], (zip(x[1][0], [0.0] + x[1][0][:-1]), 
                                zip(x[1][1], [0.0] + x[1][1][:-1]))))\
         .map(lambda x: (x[0], zip(map(lambda x: (x[0] - x[1]) ** 2, x[1][0]), 
                                   map(lambda x: (x[0] - x[1]) ** 2, x[1][1]))))\
         .map(lambda x: (x[0], map(lambda x: (x[0] + x[1]) ** 0.5, x[1])))\
         .map(lambda x: (x[0], zip(x[1], [0.0] + x[1][:-1])))\
         .map(lambda x: (x[0], map(lambda x: ([x[0]], [x[0] - x[1]]), x[1])))\
         .map(lambda x: (x[0], reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]), x[1])))\
         .map(lambda x: (x[0], (min(x[1][0]), max(x[1][0]), min(x[1][1]), max(x[1][1]))))
# computes previous points of each x, y coordinate
# computes distance in each x, y direction squared from previous point
# computes euclidean distance from previous point (also speed as in m/s)
# computes previous speed
# computes speed difference (acceleration)
# IF REDUCING A LIST OF TUPLES, MUST USE BOTH TUPLE ELEMENTS!!!



In [18]:
####### MODELING using Gradient boosted trees with only a very minimal set of features ##########

def create_labelled_vectors(x):
    label = 1.0
    if int(x[0][1]) > 200:
        label = 0.0
    return LabeledPoint(label, x[1])
total_data = VectorRDD.map(create_labelled_vectors)
(trainingData, testData) = total_data.randomSplit([0.7, 0.3])
model = GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo={}, numIterations=3)

In [25]:
predictions = model.predict(testData.map(lambda x:x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v,p): v!=p).count()/float(testData.count())
print 'Test Error = {}'.format(testErr)


Test Error = 0.282442748092


In [89]:
###### Code to create spark data frame from Su's sampled RDD #############

def combine_tuples(row):
    data = row[1]
    x = []
    y = []
    steps = []
    label = None
    for d in data:
        x.append(d[0])
        y.append(d[1])
        steps.append(d[2])
        label = d[3]
    return Row(label=label,signature={"x":x, "y":y, "steps":steps})
path = '/Users/mayankkedia/code/kaggle/axa_telematics/sample_drivers'
driver = '1'
driver_1_RDD = labelRDDs('1', path).groupByKey().map(lambda x: combine_tuples(x))
driver_1_df = sqlContext.createDataFrame(driver_1_RDD)

In [92]:
driver_1_df.take(1)

[Row(label=0.0, signature={u'y': [0.0, 5.6, 10.6, 15.2, 23.4, 32.4, 43.9, 55.4, 66.9, 79.8, 93.2, 106.0, 119.2, 133.3, 148.8, 162.7, 177.0, 192.7, 209.2, 225.6, 241.3, 260.1, 279.4, 296.1, 311.5, 329.5, 347.7, 362.5, 374.9, 388.3, 403.4, 421.7, 437.6, 456.4, 468.8, 482.9, 486.7, 495.8, 506.5, 515.8, 526.8, 540.1, 554.5, 566.8, 578.3, 590.9, 602.2, 614.5, 626.8, 638.8, 650.8, 663.2, 674.6, 685.2, 700.5, 714.4, 727.9, 741.6, 756.1, 770.1, 784.1, 798.0, 811.5, 825.2, 838.1, 851.6, 862.9, 874.0, 887.0, 899.2, 912.1, 924.6, 937.7, 950.8, 965.4, 979.6, 993.6, 1006.3, 1016.1, 1025.2, 1030.3, 1034.2, 1039.6, 1045.7, 1052.6, 1060.0, 1067.5, 1075.9, 1084.8, 1092.9, 1099.8, 1105.5, 1109.4, 1111.8, 1113.4, 1114.6, 1117.9, 1122.6, 1128.9, 1135.6, 1141.7, 1146.5, 1149.9, 1152.5, 1153.5, 1153.8, 1153.8, 1154.0, 1154.0, 1154.1, 1154.3, 1154.3, 1154.5, 1154.8, 1154.7, 1154.6, 1154.7, 1154.9, 1154.8, 1155.0, 1154.7, 1154.7, 1154.4, 1154.4, 1154.4, 1154.3, 1154.3, 1154.3, 1154.3, 1154.0, 1154.0, 1154.0, 