In [1]:
import pandas
import dask.array as da
import numpy as np
import struct
from haversine import haversine, Unit
from math import sqrt
from scipy.signal import medfilt
from datetime import datetime, timedelta
from timezonefinder import TimezoneFinder
import pytz

In [2]:
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml import Pipeline

MAX_MEMORY = '5g'
spark = SparkSession \
        .builder \
        .appName("FitRec") \
        .config("spark.executor.memory", MAX_MEMORY) \
        .config("spark.driver.memory", MAX_MEMORY) \
        .getOrCreate()

sc = spark.sparkContext

In [3]:
data = spark.read.json('endomondoHR_proper.json')
data.printSchema()

root
 |-- altitude: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- gender: string (nullable = true)
 |-- heart_rate: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- id: long (nullable = true)
 |-- latitude: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- longitude: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- speed: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- sport: string (nullable = true)
 |-- timestamp: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- url: string (nullable = true)
 |-- userId: long (nullable = true)



In [4]:
data.count()

167783

In [5]:
sample = data.rdd.takeSample(False, 100, 123)

In [6]:
sample = sc.parallelize(sample)

In [7]:
def get_time(timestamp, long, lat):
    utc_time = datetime.fromtimestamp(timestamp)
#     tf = TimezoneFinder()
#     timezone = tf.timezone_at(lng=long, lat=lat)
#     try:
#         offset = pytz.timezone(timezone).utcoffset(utc_time)
#     except:
#         offset = timedelta(hours=0)
    true_time = utc_time - timedelta(hours=7)
    return true_time

In [14]:
def derive_feature(row):
    """
    Calculate distance
    """
    lat = medfilt(row.latitude, 3).tolist()
    long = medfilt(row.longitude, 3).tolist()
    
    alt = [0.0001893939*i for i in row.altitude] #Convert from ft to mile
    indices = range(1,len(lat))
    
    diff_alt = [0.0]
    diff_alt += [alt[i] - alt[i-1] for i in indices]
    
    diff_time = [0.0]
    diff_time += [row.timestamp[i] - row.timestamp[i-1] for i in indices]
    
    #Calculate different of heart rate between 2 consecutive timestamp
    diff_heart = [0.0]
    diff_heart = [row.heart_rate[i] - row.heart_rate[i-1] for i in indices]
    
    #Calculate distance derive between 2 consecutive timestamps.
    #Unit: mile
    distance = [0.0]
    distance += [haversine((lat[i-1],long[i-1]), (lat[i], long[i]), unit=Unit.MILES) for i in indices]
    d_distance = [sqrt(d**2 + a**2) for d, a in zip(distance, diff_alt)]
    
    #Calculate average derived speed between 2 consecutive timestamps.
    #Unit: MPH
    d_speed = [0.0]
    try:
        d_speed += [dist/time*3600 for dist, time in zip(d_distance[1:], diff_time[1:])]
    except:
        d_speed = [0.0] * len(row.timestamp)
        
    #Get local hours
    hours = []
    minutes = []
    for (lg, lt, ts) in zip(row.longitude, row.latitude, row.timestamp):
        local_time = get_time(ts,lg,lt)
        hours.append(local_time.hour)
        minutes.append(local_time.minute)
        
    return Row(altitude = row.altitude,\
               gender = row.gender,\
               heart_rate = row.heart_rate,\
               id = row.id,\
               latitude = row.latitude,\
               longitude = row.longitude,\
               speed = row.speed,\
               sport = row.sport,\
               timestamp = row.timestamp,\
               url = row.url,\
               userId = row.userId,\
               distance = d_distance,\
               derive_speed = d_speed,\
               diff_time = diff_time,\
               diff_heart_rate = diff_heart,\
               hours = hours,\
               minutes = minutes)

In [74]:
def transform(row, lag=3):
    """
    Transform a workout session to multiples window frames.
    """
    prefix = [row.id, row.url, row.userId, row.sport, row.gender]
    flatted = []
    speed = row.speed if row.speed is not None else row.derive_speed
    a_features = [row.longitude, row.latitude, row.hours]
    b_features = [speed, row.distance, row.diff_time]
    c_features = [row.heart_rate, row.diff_heart_rate]
    for idx in range(len(row.timestamp)):
        a_row = []
        b_row = []
        c_row = []
        if idx < lag:
            mask = [0.0] * (lag-idx-1)
            for a in a_features:
                a_row += mask + a[0:idx+1]
            for b in b_features:
                roller = mask + b[0:idx+1]
                b_row += roller + [float(np.min(roller)), float(np.max(roller)), float(np.mean(roller)),\
                          float(np.std(roller))]
            for c in c_features:
                roller = mask + c[0:idx+1]
                b_row += roller + [float(np.min(roller[:-1])), float(np.max(roller[:-1])), \
                                   float(np.mean(roller[:-1])), float(np.std(roller[:-1]))]
        else:
            for a in a_features:
                a_row += a[idx+1-lag:idx+1]
            for b in b_features:
                roller = b[idx+1-lag:idx+1]
#                 print(len(roller), idx)
                b_row += roller + [float(np.min(roller)), float(np.max(roller)), float(np.mean(roller)),\
                          float(np.std(roller))]
            for c in c_features:
                roller = c[idx+1-lag:idx+1]
                c_row += roller + [float(np.min(roller[:-1])), float(np.max(roller[:-1])), \
                                   float(np.mean(roller[:-1])), float(np.std(roller[:-1]))]
                
        if len(prefix) + len(a_row) + len(b_row) + len(c_row) != 49:
            print(len(prefix) + len(a_row) + len(b_row) + len(c_row))
            continue
        flatted.append(prefix + a_row + b_row + c_row)
#         assert len(prefix) == 5
#         assert len(a_row) == 16, print(a_row)
#         assert len(b_row) == 35, print(b_row)
        for x in prefix + a_row + b_row + c_row:
            if x is None:
                print('aaa')
    return flatted

In [70]:
agg_name = ['min', 'max', 'mean', 'std']
a_name = ['longitude', 'latitude', 'hours']
b_name = ['speed', 'distance', 'diff_time', 'heart_rate', 'diff_heart_rate']
column = ['id', 'url', 'userId', 'sport', 'gender']

for name in a_name:
    column += [name + '_{}'.format(i) for i in range(2, -1, -1)]
for name in b_name:
    column += [name + '_{}'.format(i) for i in range(2, -1, -1)]
    column += [name + '_{}'.format(i) for i in agg_name]
print(column)

['id', 'url', 'userId', 'sport', 'gender', 'longitude_2', 'longitude_1', 'longitude_0', 'latitude_2', 'latitude_1', 'latitude_0', 'hours_2', 'hours_1', 'hours_0', 'speed_2', 'speed_1', 'speed_0', 'speed_min', 'speed_max', 'speed_mean', 'speed_std', 'distance_2', 'distance_1', 'distance_0', 'distance_min', 'distance_max', 'distance_mean', 'distance_std', 'diff_time_2', 'diff_time_1', 'diff_time_0', 'diff_time_min', 'diff_time_max', 'diff_time_mean', 'diff_time_std', 'heart_rate_2', 'heart_rate_1', 'heart_rate_0', 'heart_rate_min', 'heart_rate_max', 'heart_rate_mean', 'heart_rate_std', 'diff_heart_rate_2', 'diff_heart_rate_1', 'diff_heart_rate_0', 'diff_heart_rate_min', 'diff_heart_rate_max', 'diff_heart_rate_mean', 'diff_heart_rate_std']


In [77]:
df = sample.map(derive_feature).flatMap(transform).toDF(column)

df.rdd.take(100)

[Row(id=108239842, url='https://www.endomondo.com/users/7313790/workouts/108239842', userId=7313790, sport='run', gender='male', longitude_2=0.0, longitude_1=0.0, longitude_0=-0.5706459, latitude_2=0.0, latitude_1=0.0, latitude_0=44.8498437, hours_2=0.0, hours_1=0.0, hours_0=4, speed_2=0.0, speed_1=0.0, speed_0=0.0, speed_min=0.0, speed_max=0.0, speed_mean=0.0, speed_std=0.0, distance_2=0.0, distance_1=0.0, distance_0=0.0, distance_min=0.0, distance_max=0.0, distance_mean=0.0, distance_std=0.0, diff_time_2=0.0, diff_time_1=0.0, diff_time_0=0.0, diff_time_min=0.0, diff_time_max=0.0, diff_time_mean=0.0, diff_time_std=0.0, heart_rate_2=0.0, heart_rate_1=0.0, heart_rate_0=102, heart_rate_min=0.0, heart_rate_max=0.0, heart_rate_mean=0.0, heart_rate_std=0.0, diff_heart_rate_2=0.0, diff_heart_rate_1=0.0, diff_heart_rate_0=14, diff_heart_rate_min=0.0, diff_heart_rate_max=0.0, diff_heart_rate_mean=0.0, diff_heart_rate_std=0.0),
 Row(id=108239842, url='https://www.endomondo.com/users/7313790/wor

In [26]:
df = sample.map(derive_feature).flatMap(transform).toDF(column)
category_cols = ['sport', 'gender']

stages =[]
for category_col in category_cols:
    str_indexer = StringIndexer(inputCol = category_col, outputCol = category_col + '_index')
    encoder = OneHotEncoderEstimator(inputCols=[str_indexer.getOutputCol()]\
                                     , outputCols=[category_col + "_vec"])
    stages += [str_indexer, encoder]
numeric_cols = [x for x in column[5:] if x != 'diff_heart_rate_0' and x != 'heart_rate_0']
assembler_input = [c + "_vec" for c in category_cols] + numeric_cols
assembler = VectorAssembler(inputCols=assembler_input, outputCol='features', handleInvalid='skip')
stages += [assembler]

partial_pipeline = Pipeline().setStages(stages)
pipeline_model = partial_pipeline.fit(df)
prepared_df = pipeline_model.transform(df)

In [50]:
df.rdd.takeSample(False, 100, 0)

[Row(id=350635600, url='https://www.endomondo.com/users/982359/workouts/350635600', userId=982359, sport='bike (transport)', gender='male', longitude_2=None, longitude_1=None, longitude_0=9.913743417710066, latitude_2=None, latitude_1=None, latitude_0=57.008927101269364, hours_2=13, hours_1=13, hours_0=13, minutes_2=9, minutes_1=9, minutes_0=9, speed_2=None, speed_1=None, speed_0=17.89962880641459, speed_min=17.89962880641459, speed_max=18.417598194862713, speed_mean=18.10994629456908, speed_std=0.2223788999016237, heart_rate_2=124, heart_rate_1=124, heart_rate_0=125, heart_rate_min=124.0, heart_rate_max=125.0, heart_rate_mean=124.33333333333333, heart_rate_std=0.4714045207910317, distance_2=None, distance_1=None, distance_0=0.009944238225785884, distance_min=0.005115999498572976, distance_max=0.010007006601349967, distance_mean=0.00835574810856961, distance_std=0.0022909915261336193, diff_heart_rate_2=0, diff_heart_rate_1=1, diff_heart_rate_0=1, diff_heart_rate_min=0.0, diff_heart_rat

In [30]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.regression import RandomForestRegressor

train, test = prepared_df.randomSplit([0.8, 0.2], seed=123)
rf = RandomForestRegressor(labelCol="diff_heart_rate_0", featuresCol="features")
rf_model = rf.fit(train)

predict = rf_model.transform(test)

AttributeError: 'DataFrame' object has no attribute 'ctx'

In [38]:
predict.count()

25

In [41]:
value_and_pred = predict.rdd.map(lambda x: (float(x.diff_heart_rate_0), float(x.prediction)))
metrics = RegressionMetrics(value_and_pred)

print("MSE = %s" % metrics.meanSquaredError)
print("RMSE = %s" % metrics.rootMeanSquaredError)

MSE = 4.7942839843914165
RMSE = 2.189585345308882


In [34]:
predict.select('prediction').show(1)

+------------------+
|        prediction|
+------------------+
|6.2211580086580085|
+------------------+
only showing top 1 row

