# Loading up the Data

The data will be loaded from the following two files

- data/train_events.csv
- data/train_series.parquet

into two pandas dataframe `events` and `series`

In [1]:
import pandas as pd

In [2]:
events = pd.read_csv('data/train_events.csv')
events.head()

Unnamed: 0,series_id,night,event,step,timestamp
0,038441c925bb,1,onset,4992.0,2018-08-14T22:26:00-0400
1,038441c925bb,1,wakeup,10932.0,2018-08-15T06:41:00-0400
2,038441c925bb,2,onset,20244.0,2018-08-15T19:37:00-0400
3,038441c925bb,2,wakeup,27492.0,2018-08-16T05:41:00-0400
4,038441c925bb,3,onset,39996.0,2018-08-16T23:03:00-0400


In [None]:
series = pd.read_parquet("data/train_series.parquet", engine='fastparquet')

In [6]:
series.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,038441c925bb,0.0,2018-08-14T15:30:00-0400,2.6367,0.0217
1,038441c925bb,1.0,2018-08-14T15:30:05-0400,2.6368,0.0215
2,038441c925bb,2.0,2018-08-14T15:30:10-0400,2.637,0.0216
3,038441c925bb,3.0,2018-08-14T15:30:15-0400,2.6368,0.0213
4,038441c925bb,4.0,2018-08-14T15:30:20-0400,2.6368,0.0215


In [4]:
len(series), len(events)

(127946340, 14508)

# Testing the Evaluation metric

Here I'm checking to see the evaluation metric works as I would expect it to work properly

In [7]:
import evaluations

In [14]:
# Lets load up sample submissions
sample = pd.read_csv("data/sample_submission.csv", index_col='row_id')

In [15]:
sample

Unnamed: 0_level_0,series_id,step,event,score
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,038441c925bb,100,onset,0.0
1,038441c925bb,105,wakeup,0.0
2,03d92c9f6f8a,80,onset,0.5
3,03d92c9f6f8a,110,wakeup,0.5
4,0402a003dae9,90,onset,1.0
5,0402a003dae9,120,wakeup,1.0


In [13]:
s_ids = sample['series_id'].unique()
s_ids

array(['038441c925bb', '03d92c9f6f8a', '0402a003dae9'], dtype=object)

In [16]:
series.columns

Index(['series_id', 'step', 'timestamp', 'anglez', 'enmo'], dtype='object')

In [72]:
ground_truths = events[['series_id', 'event', 'step']]
ground_truths.isna().sum()

series_id       0
event           0
step         4923
dtype: int64

In [74]:
ground_truths = ground_truths.dropna()
ground_truths.isna().sum()

series_id    0
event        0
step         0
dtype: int64

In [75]:
ground_truths.head()

Unnamed: 0,series_id,event,step
0,038441c925bb,onset,4992.0
1,038441c925bb,wakeup,10932.0
2,038441c925bb,onset,20244.0
3,038441c925bb,wakeup,27492.0
4,038441c925bb,onset,39996.0


In [76]:
def scoreIt(preds_df: pd.DataFrame, targs_df: pd.DataFrame):
    tol = [12, 36, 60, 90, 120, 150, 180, 240, 300, 360]
    tol = [float(i) for i in tol]
    tols = {
        'onset': tol,
        'wakeup': tol
    }
    return evaluations.score(
        solution=targs_df,
        submission=preds_df,
        tolerances=tols,
        series_id_column_name='series_id',
        time_column_name='step',
        event_column_name='event',
        score_column_name='score',
    )

        

In [77]:
scoreIt(sample, ground_truths)

0.0

In [78]:
s2 = ground_truths.iloc[:10]
s2.loc[:,'score']=[(i%10)/10 for i in range(len(s2))]
s2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s2.loc[:,'score']=[(i%10)/10 for i in range(len(s2))]


Unnamed: 0,series_id,event,step,score
0,038441c925bb,onset,4992.0,0.0
1,038441c925bb,wakeup,10932.0,0.1
2,038441c925bb,onset,20244.0,0.2
3,038441c925bb,wakeup,27492.0,0.3
4,038441c925bb,onset,39996.0,0.4
5,038441c925bb,wakeup,44400.0,0.5
6,038441c925bb,onset,57240.0,0.6
7,038441c925bb,wakeup,62856.0,0.7
10,038441c925bb,onset,91296.0,0.8
11,038441c925bb,wakeup,97860.0,0.9


In [79]:
scoreIt(s2, ground_truths.iloc[:10])

1.0

In [80]:
s2.describe()

Unnamed: 0,step,score
count,10.0,10.0
mean,45730.8,0.45
std,31809.97602,0.302765
min,4992.0,0.0
25%,22056.0,0.225
50%,42198.0,0.45
75%,61452.0,0.675
max,97860.0,0.9


In [97]:
import random
s3 = s2.copy()
r = 10
s3.loc[:,'step'] += [random.randint(-1*r,r) for _ in range(len(s3))]
s3

Unnamed: 0,series_id,event,step,score
0,038441c925bb,onset,4984.0,0.0
1,038441c925bb,wakeup,10934.0,0.1
2,038441c925bb,onset,20240.0,0.2
3,038441c925bb,wakeup,27482.0,0.3
4,038441c925bb,onset,39993.0,0.4
5,038441c925bb,wakeup,44396.0,0.5
6,038441c925bb,onset,57247.0,0.6
7,038441c925bb,wakeup,62865.0,0.7
10,038441c925bb,onset,91295.0,0.8
11,038441c925bb,wakeup,97855.0,0.9


In [98]:
scoreIt(s3, ground_truths.iloc[:10])

1.0

In [92]:
from math import sqrt
sqrt((s3['step'] - s2['step']).apply(lambda i: i**2).sum())

1634.7406522136775

In [60]:
s4 = s3
s4.loc[:,'step'] = 10
s4

Unnamed: 0,series_id,event,step,score
0,038441c925bb,onset,10.0,0.0
1,038441c925bb,wakeup,10.0,0.1
2,038441c925bb,onset,10.0,0.2
3,038441c925bb,wakeup,10.0,0.3
4,038441c925bb,onset,10.0,0.4
5,038441c925bb,wakeup,10.0,0.5
6,038441c925bb,onset,10.0,0.6
7,038441c925bb,wakeup,10.0,0.7
10,038441c925bb,onset,10.0,0.8
11,038441c925bb,wakeup,10.0,0.9


In [61]:
scoreIt(s4, ground_truths.iloc[:10])

0.2