In [144]:
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

from math import radians, sin, cos, sqrt, atan2

In [145]:
df_train = pd.read_csv('dataset_train.csv')
df_test = pd.read_csv('dataset_eval.csv')

In [146]:
df_train.head()

Unnamed: 0,Activity ID,Activity Date,Distance,Elapsed Time,Moving Time,Starting Latitude,Starting Longitude,Finish Latitude,Finish Longitude,Label
0,15561313085,"Aug 23, 2025, 4:55:36 PM",11.19,4332,2900.0,44.38688,38.845067,44.38666,38.845216,PURE_LEISURE
1,16321410676,"Nov 1, 2025, 10:58:07 AM",10.51,9262,2724.0,44.387036,38.843094,44.403151,38.781384,PURE_LEISURE
2,12867426384,"Nov 10, 2024, 10:18:40 AM",19.1,8214,5130.0,44.386809,38.845026,44.386922,38.845124,PURE_LEISURE
3,11903829345,"Jul 16, 2024, 4:02:30 PM",15.31,5822,4259.0,44.387054,38.811028,44.386923,38.845091,LEISURELY_COMMUTE
4,14323523606,"Apr 29, 2025, 3:48:33 PM",4.46,1383,1194.0,44.38741,38.811053,44.386965,38.845283,COMMUTE


In [147]:
df_train.isnull().sum()

Activity ID           0
Activity Date         0
Distance              0
Elapsed Time          0
Moving Time           0
Starting Latitude     0
Starting Longitude    0
Finish Latitude       0
Finish Longitude      0
Label                 0
dtype: int64

In [148]:
df_train['Month'] = df_train['Activity Date'].str[:3]

In [149]:
df_train['Month']

0      Aug
1      Nov
2      Nov
3      Jul
4      Apr
      ... 
224    Aug
225    Sep
226    Jun
227    Oct
228    Jul
Name: Month, Length: 229, dtype: object

In [150]:
df_train['Speed'] = df_train['Distance'] / (df_train['Moving Time'] / 3600)

In [151]:
df_train['Speed']

0      13.891034
1      13.889868
2      13.403509
3      12.941066
4      13.447236
         ...    
224    14.075221
225    15.589867
226    10.572646
227    13.303651
228    11.845671
Name: Speed, Length: 229, dtype: float64

In [152]:
monthly = df_train.groupby('Month').agg({
    'Distance': 'sum',
    'Moving Time': 'sum'
}).reset_index()


In [153]:
monthly

Unnamed: 0,Month,Distance,Moving Time
0,Apr,56.5,16271.0
1,Aug,205.82,54911.0
2,Dec,186.79,53667.0
3,Feb,24.78,8061.0
4,Jan,164.21,40154.0
5,Jul,219.61,60299.0
6,Jun,129.63,36179.0
7,Mar,81.84,25557.0
8,May,104.13,29167.0
9,Nov,71.03,18400.0


In [154]:
monthly['AvgSpeed'] = monthly['Distance'] / (monthly['Moving Time'] / 3600)

In [155]:
monthly

Unnamed: 0,Month,Distance,Moving Time,AvgSpeed
0,Apr,56.5,16271.0,12.500768
1,Aug,205.82,54911.0,13.49369
2,Dec,186.79,53667.0,12.529935
3,Feb,24.78,8061.0,11.066617
4,Jan,164.21,40154.0,14.722219
5,Jul,219.61,60299.0,13.111262
6,Jun,129.63,36179.0,12.898864
7,Mar,81.84,25557.0,11.528114
8,May,104.13,29167.0,12.85247
9,Nov,71.03,18400.0,13.897174


In [156]:
order = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

monthly['Month'] = pd.Categorical(monthly['Month'], order)
monthly = monthly.sort_values('Month')

In [157]:
monthly['AvgSpeed'] = np.floor(monthly['AvgSpeed'] * 100000) / 10000

In [158]:
monthly

Unnamed: 0,Month,Distance,Moving Time,AvgSpeed
4,Jan,164.21,40154.0,147.2221
3,Feb,24.78,8061.0,110.6661
7,Mar,81.84,25557.0,115.2811
0,Apr,56.5,16271.0,125.0076
8,May,104.13,29167.0,128.5247
6,Jun,129.63,36179.0,128.9886
5,Jul,219.61,60299.0,131.1126
1,Aug,205.82,54911.0,134.9368
11,Sep,210.97,55409.0,137.0701
10,Oct,407.92,107529.0,136.5689


In [159]:
df_test

Unnamed: 0,Activity ID,Activity Date,Distance,Elapsed Time,Moving Time,Starting Latitude,Starting Longitude,Finish Latitude,Finish Longitude
0,14319106971,"Apr 29, 2025, 7:06:13 AM",3.72,1122,1031.0,44.386980,38.845179,44.386881,38.811096
1,12589508252,"Oct 6, 2024, 11:26:17 AM",8.44,3377,2274.0,44.421864,38.789683,44.386936,38.845286
2,11908121839,"Jul 17, 2024, 7:06:33 AM",3.30,899,745.0,44.386222,38.841742,44.385895,38.811379
3,16687631914,"Dec 8, 2025, 6:39:38 PM",9.16,5157,2451.0,44.386866,38.845191,44.386939,38.845205
4,12212251514,"Aug 22, 2024, 9:31:29 AM",10.68,4245,3153.0,44.387108,38.811026,44.386753,38.811187
...,...,...,...,...,...,...,...,...,...
95,14537059131,"May 20, 2025, 8:44:40 AM",4.99,1164,1118.0,44.386849,38.845483,44.386555,38.811482
96,13500405361,"Jan 31, 2025, 10:13:30 AM",6.62,2804,1603.0,44.387760,38.845266,44.386550,38.811232
97,15589048727,"Aug 26, 2025, 7:11:04 AM",4.12,1214,967.0,44.386712,38.845163,44.386444,38.811381
98,15174665209,"Jul 20, 2025, 8:40:53 AM",11.68,6610,3264.0,44.386852,38.845114,44.386864,38.845140


In [160]:
def geo_distance(lat1, lon1, lat2, lon2):
    R = 6371
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1)*cos(lat2)*sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return R * c

In [161]:
for df in [df_train, df_test]:
    df['GeoDist'] = df.apply(lambda x: geo_distance(
        x['Starting Latitude'], x['Starting Longitude'],
        x['Finish Latitude'], x['Finish Longitude']
    ), axis=1)

    df['Speed'] = df['Distance'] / (df['Moving Time'] / 3600)
    df['DetourRatio'] = df['Distance'] / (df['GeoDist'] + 1e-6)

In [162]:
features = ['Distance', 'Speed', 'GeoDist', 'DetourRatio']
X = df_train[features]
y = df_train['Label']

X_eval = df_test[features]

In [163]:
y

0           PURE_LEISURE
1           PURE_LEISURE
2           PURE_LEISURE
3      LEISURELY_COMMUTE
4                COMMUTE
             ...        
224         PURE_LEISURE
225              COMMUTE
226    LEISURELY_COMMUTE
227              COMMUTE
228    LEISURELY_COMMUTE
Name: Label, Length: 229, dtype: object

In [164]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_eval_scaled = scaler.transform(X_eval)


In [165]:
X_scaled

array([[ 4.14958201e-01,  1.14656950e-01, -1.61592846e+00,
         8.87162149e-01],
       [ 3.22552740e-01,  1.14048652e-01,  2.53764377e+00,
        -3.25350539e-01],
       [ 1.48985114e+00, -1.39543577e-01, -1.62584247e+00,
         3.49223634e+00],
       [ 9.74826585e-01, -3.80665633e-01,  5.27337056e-01,
        -3.14569878e-01],
       [-4.99584086e-01, -1.16743691e-01,  5.38272434e-01,
        -3.26457124e-01],
       [-5.29479971e-01,  2.47823575e-01,  5.33042391e-01,
        -3.26685337e-01],
       [-6.62652548e-01, -1.16987486e+00,  7.41171797e-02,
        -3.26800928e-01],
       [-5.49863529e-01,  7.62010370e-01,  5.13352829e-01,
        -3.26808082e-01],
       [ 3.71829323e-02,  5.49849522e-01,  5.25272073e-01,
        -3.22105146e-01],
       [-5.37633394e-01,  6.55638549e-01,  5.46820345e-01,
        -3.26779514e-01],
       [-7.21085413e-01,  5.92844218e-01,  4.02641941e-02,
        -3.27316586e-01],
       [-5.52581336e-01,  4.97209789e-03,  5.13580448e-01,
      

In [166]:
model = KNeighborsClassifier(n_neighbors=7)
model.fit(X_scaled, y)

In [167]:
y_pred = model.predict(X_eval_scaled)

In [168]:
y_pred
df_test['Label'] = y_pred

In [169]:
sub1 = pd.DataFrame({
    'subtaskID': 1,
    'Answer1': monthly['Month'],
    'Answer2': monthly['AvgSpeed']
})

In [170]:
sub2 = pd.DataFrame({
    'subtaskID': 2, 
    'Answer1': df_test['Activity ID'],
    'Answer2': df_test['Label']
})

In [171]:
sub2

Unnamed: 0,subtaskID,Answer1,Answer2
0,2,14319106971,COMMUTE
1,2,12589508252,PURE_LEISURE
2,2,11908121839,COMMUTE
3,2,16687631914,PURE_LEISURE
4,2,12212251514,PURE_LEISURE
...,...,...,...
95,2,14537059131,COMMUTE
96,2,13500405361,LEISURELY_COMMUTE
97,2,15589048727,COMMUTE
98,2,15174665209,PURE_LEISURE


In [172]:
final = pd.concat([sub1, sub2], ignore_index=True)
final.to_csv('submission.csv', index=False)