# 1. Load Dependencies

In [1]:
import pandas as pd
import numpy as np
import time
import datetime
from sys import getsizeof
import ast
import os
import joblib

!pip install -q -U torch watermark
%reload_ext watermark

!pip install scikit-tensor-py3
from sktensor import dtensor, cp_als

from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
%matplotlib inline

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%watermark -v -p pandas,numpy,sklearn,matplotlib,seaborn,sktensor

CPython 3.6.9
IPython 5.5.0

pandas 1.1.4
numpy 1.16.6
sklearn 0.0
matplotlib 3.2.2
seaborn 0.11.0
sktensor 0.4.1


# 2. Helper Functions

In [3]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


def show_memory(unit='MB', threshold='MB'):
    '''check memory usage

    :param unit: memory unit, `B`,`KB`,`MB`,`GB`
    :param threshold: only show memory usage greater than the threshold
    '''

    scale = {'B': 1, 'KB': 1024, 'MB': 1048576, 'GB': 1073741824}
    for i in list(globals().keys()):
        memory = eval('getsizeof({})'.format(i))
        if memory >= scale[threshold]:
            print(i, str(memory//scale[unit]) + ' ' + unit)

# 3. Load Data

In [5]:
# Read data

file_path_train = '/content/drive/My Drive/Colab Notebooks/DATA5703/Data/Dataset/TrainData_1021.csv'
file_path_test = '/content/drive/My Drive/Colab Notebooks/DATA5703/Data/Dataset/TestData_1021.csv'

train_df = pd.read_csv(file_path_train)
test_df = pd.read_csv(file_path_test)

data_df = pd.concat([train_df, test_df], ignore_index=True)

data_df.head(1)

Unnamed: 0,id,userId,gender,sport,duration,calories,distance,avg_heart_rate,longitude,latitude,altitude,timestamp,heart_rate,speed,url,derived_distance,derived_speed,time_elapsed,validate,avg_alti,change_alti,max_alti,min_alti,diff_alti,avg_speed,Cluster,Route,Route_id
0,611012078,2568526,male,run,3158,830.588,10.02,154.914,"[7.099486151710153, 7.0994688011705875, 7.0993...","[43.68301374837756, 43.683006623759866, 43.682...","[137.8, 137.8, 138.2, 138.8, 138.8, 138.6, 139...","[1443653973, 1443653974, 1443653978, 144365398...","[140, 140, 141, 149, 149, 150, 153, 157, 160, ...",,https://www.endomondo.com/users/2568526/workou...,"[0.0016049429213742246, 0.01260242289257531, 0...","[5.777794516947209, 10.005378447215753, 10.428...","[1, 4, 5, 1, 5, 6, 11, 4, 6, 7, 8, 7, 7, 9, 5,...",True,87.8552,756.8,139.4,76.0,63.4,11.574343,1,"('run', 1)",9


In [6]:
print(data_df.shape)
show_memory()

(62831, 28)
train_df 2492 MB
test_df 623 MB
data_df 3116 MB


In [7]:
# sequence data was stored in string formatted list
# convert sequence data to array

def str_to_arr(str_ls):
    return np.fromstring(str_ls[1:-1], dtype=np.float, sep=',')


start = time.time()

data_df['heart_rate'] = data_df.apply(
    lambda x: str_to_arr(x['heart_rate']), axis=1)
data_df['altitude'] = data_df.apply(
    lambda x: str_to_arr(x['altitude']), axis=1)
data_df['derived_speed'] = data_df.apply(
    lambda x: str_to_arr(x['derived_speed']), axis=1)
data_df['derived_distance'] = data_df.apply(
    lambda x: str_to_arr(x['derived_distance']), axis=1)
data_df['timestamp'] = data_df.apply(
    lambda x: str_to_arr(x['timestamp']), axis=1)

elapsed = format_time(time.time() - start)
print(elapsed)

data_df.describe()

0:00:40


Unnamed: 0,id,userId,duration,calories,distance,avg_heart_rate,avg_alti,change_alti,max_alti,min_alti,diff_alti,avg_speed,Cluster,Route_id
count,62831.0,62831.0,62831.0,62831.0,62831.0,62831.0,62831.0,62831.0,62831.0,62831.0,62831.0,62831.0,62831.0,62831.0
mean,367986000.0,4437249.0,4237.423151,838.722699,20.815504,142.371731,153.780615,397.032384,189.831314,124.942693,64.888621,17.216757,1.731438,6.528768
std,155600400.0,3856116.0,2227.567507,514.669974,16.925158,16.1414,254.752939,351.424533,271.461019,245.407699,76.921453,7.959321,1.050199,4.18458
min,1346435.0,69.0,499.0,100.0,1.11,53.778,-284.4912,0.2,-261.0,-500.0,0.2,5.010181,0.0,0.0
25%,264050200.0,1367691.0,2680.0,503.0,9.95,132.332,18.558,166.6,40.8,1.0,24.0,10.794519,1.0,2.0
50%,361235000.0,3301823.0,3751.0,733.0,14.46,143.044,57.5928,285.0,89.0,32.8,43.6,12.894028,2.0,9.0
75%,483681900.0,6692748.0,5292.0,1026.0,26.45,153.184,178.99,509.8,221.2,149.05,74.4,24.923234,2.0,10.0
max,667596500.0,15481420.0,17848.0,5569.0,131.1,210.194,1996.0816,2982.0,2405.2,1987.4,999.2,34.989718,5.0,13.0


# 4. Min-Max scaling

In [10]:
# Load Min-max scaler models

scaler_dic = dict()
features = ['calories', 'distance', 'duration', 'heart_rate',
            'time_elapsed', 'altitude', 'derived_distance', 'speed']

dir = '/content/drive/My Drive/Colab Notebooks/DATA5703/scaler_model/'

for feature in features:
    path = os.path.join(dir, 'scaler_'+feature+'_2.m')
    scaler_dic[feature] = joblib.load(path)

scaler_dic

{'altitude': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'calories': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'derived_distance': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'distance': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'duration': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'heart_rate': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'speed': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'time_elapsed': MinMaxScaler(copy=True, feature_range=(0, 1))}

In [11]:
sequence_df = data_df[['altitude', 'heart_rate',
                       'derived_distance', 'derived_speed']].copy()
sequence_df.rename(columns={'derived_speed': 'speed'}, inplace=True)

context_df = data_df[['userId', 'gender', 'sport',
                      'duration', 'calories', 'distance', 'Route_id']].copy()

display(sequence_df.head(1))
display(context_df.head(1))

Unnamed: 0,altitude,heart_rate,derived_distance,speed
0,"[137.8, 137.8, 138.2, 138.8, 138.8, 138.6, 139...","[140.0, 140.0, 141.0, 149.0, 149.0, 150.0, 153...","[0.0016049429213742246, 0.01260242289257531, 0...","[5.777794516947209, 10.005378447215753, 10.428..."


Unnamed: 0,userId,gender,sport,duration,calories,distance,Route_id
0,2568526,male,run,3158,830.588,10.02,9


In [12]:
# Apply min max scaler for each feature

start = time.time()

# for features in sequence_df
for feature in sequence_df.columns:
    # Reshape data using array.reshape(-1, 1) if data has a single feature
    sequence_df[feature] = sequence_df.apply(lambda x: np.concatenate(
        scaler_dic[feature].transform(x[feature].reshape(-1, 1)), axis=0)[0:499], axis=1)

# for features in context_df
for feature in context_df[context_df.columns.difference(['userId', 
                                                         'gender', 
                                                         'sport', 
                                                         'Route_id'])].columns:
    # Reshape data using array.reshape(-1, 1) if data has a single feature
    context_df[feature] = scaler_dic[feature].transform(
        context_df[feature].to_numpy().reshape(-1, 1))


elapsed = format_time(time.time() - start)
print(elapsed)

display(sequence_df.head(1))
display(context_df.head(1))

0:00:58


Unnamed: 0,altitude,heart_rate,derived_distance,speed
0,"[0.21051532033426185, 0.21051532033426185, 0.2...","[0.5833333333333334, 0.5833333333333334, 0.587...","[0.0008705472935918672, 0.006835760322515244, ...","[0.05777888189652973, 0.10005540659781222, 0.1..."


Unnamed: 0,userId,gender,sport,duration,calories,distance,Route_id
0,2568526,male,run,0.176939,0.149145,0.07643,9


In [13]:
data_df = pd.concat([context_df, sequence_df, data_df[['timestamp']]], axis=1)

data_df.head(1)

Unnamed: 0,userId,gender,sport,duration,calories,distance,Route_id,altitude,heart_rate,derived_distance,speed,timestamp
0,2568526,male,run,0.176939,0.149145,0.07643,9,"[0.21051532033426185, 0.21051532033426185, 0.2...","[0.5833333333333334, 0.5833333333333334, 0.587...","[0.0008705472935918672, 0.006835760322515244, ...","[0.05777888189652973, 0.10005540659781222, 0.1...","[1443653973.0, 1443653974.0, 1443653978.0, 144..."


# 5. Process data to construct User-Workout Route-Context Tensor

In [14]:
# Construct user profile, which will be used in embedding visualisation

# Count workout numbers and workout frequency for each user

def get_freq_from_ts(ts_list):

    deltas = []

    ts_list.sort()
    for i, ts in enumerate(ts_list):
        if i == 0:
            pass
        else:
            ts_prev = ts_list[i-1]

            dt_prev = datetime.datetime.fromtimestamp(ts_prev).date()
            dt = datetime.datetime.fromtimestamp(ts).date()

            deltas.append(abs(dt-dt_prev).days)
    try:
        if (len(deltas) == 1) & (sum(deltas) == 0):
            return 0
        else:
            return 1./(sum(deltas)/len(deltas))
    except:
        print('deltas: ', deltas)
        raise

start = time.time()

userlist = data_df.userId.unique().tolist()
print('There are {} users.'.format(len(userlist)))

user_profile = []

for user in userlist:
    time_stamps_list = data_df[data_df.userId == user].timestamp.tolist()
    time_stamp_list = [time_stamps[0] for time_stamps in time_stamps_list]
    gender = data_df[data_df.userId == user].gender.iloc[0]
    records = len(time_stamp_list)
    if records > 1:
        user_profile.append([user,
                             records, 
                             get_freq_from_ts(time_stamp_list)])
    else:
        user_profile.append([user, 
                             1,
                             0])

user_profile_df = pd.DataFrame(user_profile, columns = ['userId', 
                                                        'workout_records', 
                                                        'workout_freq'])

elapsed = format_time(time.time() - start)
print(elapsed)

user_profile_df.head(1)

There are 929 users.
0:00:02


Unnamed: 0,userId,workout_records,workout_freq
0,2568526,225,0.24086


In [15]:
# Compute avg_calories, avg_speed, avg_distance for each user

user_profile_list = []

for user in userlist:

    avg_calories = data_df[data_df.userId == user].calories.mean()
    avg_distance = data_df[data_df.userId == user].distance.mean()
    avg_speed = data_df[data_df.userId == user].apply(
        lambda x: x.speed.mean(), axis=1).mean()

    user_arr = np.array([user,
                         avg_calories,
                         avg_speed,
                         avg_distance])

    user_profile_list.append(user_arr)

user_df = pd.DataFrame(user_profile_list, columns=['userId',
                                                   'avg_calories',
                                                   'avg_speed',
                                                   'avg_distance'])
user_df.head(1)

Unnamed: 0,userId,avg_calories,avg_speed,avg_distance
0,2568526.0,0.172433,0.160723,0.176215


In [16]:
# Add additional features to user_profile_df

user_profile_df = user_profile_df.join(
    user_df.set_index('userId'), how='left', on='userId')

user_profile_df.head(1)

Unnamed: 0,userId,workout_records,workout_freq,avg_calories,avg_speed,avg_distance
0,2568526,225,0.24086,0.172433,0.160723,0.176215


In [17]:
# Add workout numbers and workout frequency to data_df
start = time.time()

data_df['workout_records'] = data_df.apply(
    lambda x: user_profile_df[user_profile_df.userId == x.userId].workout_records.values[0], axis=1)
data_df['workout_freq'] = data_df.apply(
    lambda x: user_profile_df[user_profile_df.userId == x.userId].workout_freq.values[0], axis=1)

elapsed = format_time(time.time() - start)
print(elapsed)

data_df.head(1)

0:01:11


Unnamed: 0,userId,gender,sport,duration,calories,distance,Route_id,altitude,heart_rate,derived_distance,speed,timestamp,workout_records,workout_freq
0,2568526,male,run,0.176939,0.149145,0.07643,9,"[0.21051532033426185, 0.21051532033426185, 0.2...","[0.5833333333333334, 0.5833333333333334, 0.587...","[0.0008705472935918672, 0.006835760322515244, ...","[0.05777888189652973, 0.10005540659781222, 0.1...","[1443653973.0, 1443653974.0, 1443653978.0, 144...",225,0.24086


In [18]:
# Scale workout_records and workout_freq

scaler_dic = {}

scaler_workout_records = MinMaxScaler()
scaler_workout_freq = MinMaxScaler()

scaler_workout_records.fit(data_df.workout_records.to_numpy().reshape(-1, 1))
scaler_workout_freq.fit(data_df.workout_freq.to_numpy().reshape(-1, 1))

data_df['workout_records'] = scaler_workout_records.transform(
    data_df.workout_records.to_numpy().reshape(-1, 1)).reshape(1, -1)[0]
data_df['workout_freq'] = scaler_workout_freq.transform(
    data_df.workout_freq.to_numpy().reshape(-1, 1)).reshape(1, -1)[0]

In [19]:
# Compute some parameters from sequence data

start = time.time()

data_df['min_altitude'] = data_df.apply(lambda x: np.amin(x.altitude), axis=1)
data_df['avg_altitude'] = data_df.apply(lambda x: x.altitude.mean(), axis=1)
data_df['max_altitude'] = data_df.apply(lambda x: np.amax(x.altitude), axis=1)

data_df['min_heart_rate'] = data_df.apply(
    lambda x: np.amin(x.heart_rate), axis=1)
data_df['avg_heart_rate'] = data_df.apply(
    lambda x: x.heart_rate.mean(), axis=1)
data_df['max_heart_rate'] = data_df.apply(
    lambda x: np.amax(x.heart_rate), axis=1)

data_df['min_distance'] = data_df.apply(
    lambda x: np.amin(x.derived_distance), axis=1)
data_df['avg_distance'] = data_df.apply(
    lambda x: x.derived_distance.mean(), axis=1)
data_df['max_distance'] = data_df.apply(
    lambda x: np.amax(x.derived_distance), axis=1)

data_df['min_speed'] = data_df.apply(lambda x: np.amin(x.speed), axis=1)
data_df['avg_speed'] = data_df.apply(lambda x: x.speed.mean(), axis=1)
data_df['max_speed'] = data_df.apply(lambda x: np.amax(x.speed), axis=1)

elapsed = format_time(time.time() - start)
print(elapsed)

0:00:14


In [20]:
# Construct route profile, which will be used in embedding visualisation

route_list = data_df.Route_id.unique()
route_profile_list = []

for route in route_list:
    tmp_df = data_df[data_df.Route_id == route].copy()
    avg_distance_route = tmp_df.avg_distance.mean()
    avg_speed_route = tmp_df.avg_speed.mean()
    avg_calories_route = tmp_df.calories.mean()
    sport = tmp_df.sport.iloc[0]

    route_arr = np.array([route,
                          sport,
                          avg_calories_route,
                          avg_speed_route,
                          avg_distance_route])

    route_profile_list.append(route_arr)

route_profile_df = pd.DataFrame(route_profile_list, columns=['Route_id',
                                                             'sport',
                                                             'avg_calories',
                                                             'avg_speed',
                                                             'avg_distance'])

route_profile_df['Route_id'] = route_profile_df['Route_id'].astype(int)
route_profile_df[['avg_calories', 'avg_speed', 'avg_distance']] = route_profile_df[[
    'avg_calories', 'avg_speed', 'avg_distance']].apply(pd.to_numeric)
route_profile_df.head(1)

Unnamed: 0,Route_id,sport,avg_calories,avg_speed,avg_distance
0,9,run,0.180111,0.108633,0.015417


In [21]:
# Encode gender feature

conditions = [
    (data_df['gender'] == 'male'),
    (data_df['gender'] == 'female'),
    (data_df['gender'] == 'unknown')]

choices = [0, 1, 2]

data_df['genderId'] = np.select(conditions, choices, default=0)

In [22]:
# Encode sport feature

conditions = [
    (data_df['sport'] == 'run'),
    (data_df['sport'] == 'bike'),
    (data_df['sport'] == 'mountain bike')]

choices = [0, 1, 2]

data_df['sportId'] = np.select(conditions, choices, default=0)

In [23]:
data_df.head(1)

Unnamed: 0,userId,gender,sport,duration,calories,distance,Route_id,altitude,heart_rate,derived_distance,speed,timestamp,workout_records,workout_freq,min_altitude,avg_altitude,max_altitude,min_heart_rate,avg_heart_rate,max_heart_rate,min_distance,avg_distance,max_distance,min_speed,avg_speed,max_speed,genderId,sportId
0,2568526,male,run,0.176939,0.149145,0.07643,9,"[0.21051532033426185, 0.21051532033426185, 0.2...","[0.5833333333333334, 0.5833333333333334, 0.587...","[0.0008705472935918672, 0.006835760322515244, ...","[0.05777888189652973, 0.10005540659781222, 0.1...","[1443653973.0, 1443653974.0, 1443653978.0, 144...",0.270531,0.333499,0.188997,0.193092,0.211072,0.316667,0.645316,0.783333,0.000871,0.01089,0.024076,0.057779,0.115745,0.156878,0,0
