In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time

events_csv = 'data/train_events.csv'
data_csv = "data/train_series.parquet"

def read_data(verbose=True, rd=True, re=True):
    events, data = None, None
    if re:
        if verbose: 
            print(f"Reading {events_csv}")
            t=time.time()
        events = pd.read_csv(events_csv)
        if verbose: 
            print(f"Read {len(events)} rows from {events_csv} in {time.time()-t:.3f} seconds")
    if rd:
        if verbose:
            print(f"Reading {data_csv}")
            t=time.time()
        data = pd.read_parquet(data_csv, engine='fastparquet')
        if verbose: 
            print(f"Read {len(data)} rows from {data_csv} in {time.time()-t:.3f} seconds")
    return data, events



In [2]:
data,events = read_data()
train_sids, test_sids = train_test_split(events['series_id'].unique(), train_size=0.8, random_state=42)
train_sids.shape, test_sids.shape
train_sids, test_sids = [set(i) for i in [train_sids, test_sids]]
len(train_sids), len(test_sids)

Reading data/train_events.csv
Read 14508 rows from data/train_events.csv in 0.056 seconds
Reading data/train_series.parquet
Read 127946340 rows from data/train_series.parquet in 73.314 seconds


(221, 56)

In [5]:
len(data), next(iter(train_sids))

(127946340, '5aad18e7ce64')

In [6]:
data.loc[data['series_id']=='5aad18e7ce64']

Unnamed: 0,series_id,step,timestamp,anglez,enmo
42542460,5aad18e7ce64,0.0,2018-02-05T17:00:00-0500,0.837300,0.0850
42542461,5aad18e7ce64,1.0,2018-02-05T17:00:05-0500,-27.239401,0.0558
42542462,5aad18e7ce64,2.0,2018-02-05T17:00:10-0500,-47.353199,0.0484
42542463,5aad18e7ce64,3.0,2018-02-05T17:00:15-0500,-26.554701,0.1502
42542464,5aad18e7ce64,4.0,2018-02-05T17:00:20-0500,-16.809799,0.1135
...,...,...,...,...,...
42952855,5aad18e7ce64,410395.0,2018-03-01T10:59:35-0500,-46.804100,0.0150
42952856,5aad18e7ce64,410396.0,2018-03-01T10:59:40-0500,-40.997799,0.0143
42952857,5aad18e7ce64,410397.0,2018-03-01T10:59:45-0500,-41.273701,0.0125
42952858,5aad18e7ce64,410398.0,2018-03-01T10:59:50-0500,-41.259899,0.0166


In [8]:
events.isna().sum()

series_id       0
night           0
event           0
step         4923
timestamp    4923
dtype: int64

In [33]:
nacheck = events.isna()
nacheck['series_id'] = events['series_id']
nacheck = nacheck.groupby('series_id').sum()
nonas = nacheck.loc[nacheck['step']==0]
nonas.__len__()

37

In [34]:
events['series_id'].unique().__len__()

277

In [42]:
nacheck.sum()['step'] , events.__len__() , events.dropna().__len__()

(4923, 14508, 9585)

In [51]:
df = data.head(1279463)


### Start Block

In [57]:
%%timeit
idx = [i in train_sids for i in df['series_id']]
train_events = df[idx]

268 ms ± 59.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [58]:
%%timeit
idx = df['series_id'].apply(lambda i: i in train_sids)
train_events = df[idx]

207 ms ± 4.34 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [55]:
import importlib
import parr
from itertools import repeat
importlib.reload(parr)

<module 'parr' from 'd:\\Projects\\ChildMindSleep\\parr.py'>

In [56]:
%%timeit
idx = parr.parallelize(parr.series_id_in_series_set, df['series_id'], repeat(train_sids, len(df['series_id'])))
train_events = df[idx]

2.72 s ± 643 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### End block

In [None]:
type('abc') is str

True

In [None]:
%%timeit


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
import parr
from itertools import repeat

def load_training_data(train_val_split=0.8, verbose=True, seed=42):
    events_csv = 'data/train_events.csv'
    if verbose: 
        print(f"Reading {events_csv}")
        t=time.time()
    events = pd.read_csv(events_csv)
    if verbose: 
        print(f"Read {len(events)} rows from {events_csv} in {time.time()-t} seconds")
    series_ids = events['series_id'].unique()
    train_ids, val_ids = train_test_split(series_ids, train_size=train_val_split, random_state=seed)
    train_ids, val_ids = set(train_ids), set(val_ids)
    train_events = events.loc[events['series_id'].apply(lambda i: i in train_ids)]
    val_events = events.loc[events['series_id'].apply(lambda i: i in val_ids)]


    data_csv = "data/train_series.parquet"
    if verbose: 
        print(f"Reading {data_csv}")
        t=time.time()
    data = pd.read_parquet(data_csv, engine='fastparquet')
    if verbose: 
        print(f"Read {len(data)} rows from {data_csv} in {time.time()-t} seconds")
        print("Getting indexs of training data from the data csv")
        t=time.time()
    train_idx = parr.parallelize(parr.series_id_in_series_set, data['series_id'], repeat(train_ids), total=len(data))
    if verbose: 
        print(f"Got indexes of training data from data csv in {time.time()-t} seconds")
        print(f"Spitting the training data from the data csv")
        t=time.time()
    train_data = data.loc[train_idx]
    if verbose:
        print(f"Splitted training data in {time.time()-t} seconds")
    if verbose: 
        print(f"Spitting the validation data from the data csv")
        t=time.time()
    val_data = data.loc[[i in val_ids for i in data['series_id']]]
    if verbose:
        print(f"Splitted validation data in {time.time()-t} seconds")
    if verbose: print(f"Done splitting the data")


    return train_data, train_events, val_data, val_events 

In [None]:
td, te, vd, ve = load_training_data()

Reading data/train_events.csv
Read 14508 rows from data/train_events.csv in 0.035845041275024414 seconds
Reading data/train_series.parquet
Read 127946340 rows from data/train_series.parquet in 69.02580213546753 seconds
Getting indexs of training data from the data csv
with tqdm


In [None]:
import parr
import pandas as pd
import itertools

s = pd.Series('a b c d e f g h i j k l m n o p'.split())
series_set = set('d e f g h'.split())
parr.parallelize(parr.series_id_in_series_set, s, itertools.repeat(series_set))
# with concurrent.futures.ProcessPoolExecutor() as executor:
#     res = list(executor.map(parr.series_id_in_series_set, s))
    

[False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False]

In [None]:
td, te, vd, ve = load_training_data()

In [None]:
data.head()

In [None]:
data[data['series_id']=='038441c925bb'].head()

# Loading up the Data

The data will be loaded from the following two files

- data/train_events.csv
- data/train_series.parquet

into two pandas dataframe `events` and `series`

In [None]:
import pandas as pd

In [None]:
events = pd.read_csv('data/train_events.csv')
events.head()

In [None]:
series = pd.read_parquet("data/train_series.parquet", engine='fastparquet')

In [None]:
series.head()

In [None]:
len(series), len(events)

# Testing the Evaluation metric

Here I'm checking to see the evaluation metric works as I would expect it to work properly

In [None]:
import evaluations

In [None]:
# Lets load up sample submissions
sample = pd.read_csv("data/sample_submission.csv", index_col='row_id')

In [None]:
sample

In [None]:
s_ids = sample['series_id'].unique()
s_ids

In [None]:
series.columns

In [None]:
ground_truths = events[['series_id', 'event', 'step']]
ground_truths.isna().sum()

In [None]:
ground_truths = ground_truths.dropna()
ground_truths.isna().sum()

In [None]:
ground_truths.head()

In evaluations module

```py
def scoreIt(preds_df: pd.DataFrame, targs_df: pd.DataFrame):
    tol = [12, 36, 60, 90, 120, 150, 180, 240, 300, 360]
    tol = [float(i) for i in tol]
    tols = {
        'onset': tol,
        'wakeup': tol
    }
    return evaluations.score(
        solution=targs_df,
        submission=preds_df,
        tolerances=tols,
        series_id_column_name='series_id',
        time_column_name='step',
        event_column_name='event',
        score_column_name='score',
    )
```

        

In [None]:
evaluations.scoreIt(sample, ground_truths)

In [None]:
s2 = ground_truths.iloc[:10]
s2.loc[:,'score']=[(i%10)/10 for i in range(len(s2))]
s2

In [None]:
evaluations.scoreIt(s2, ground_truths.iloc[:10])

In [None]:
s2.describe()

In [None]:
import random
s3 = s2.copy()
r = 10
s3.loc[:,'step'] += [random.randint(-1*r,r) for _ in range(len(s3))]
s3

In [None]:
evaluations.scoreIt(s3, ground_truths.iloc[:10])

In [None]:
from math import sqrt
sqrt((s3['step'] - s2['step']).apply(lambda i: i**2).sum())

In [None]:
s4 = s3
s4.loc[:,'step'] = 10
s4

In [None]:
evaluations.scoreIt(s4, ground_truths.iloc[:10])