# Loading up the Data

The data will be loaded from the following two files

- data/train_events.csv
- data/train_series.parquet

into two pandas dataframe `events` and `series`

In [1]:
import pandas as pd
import numpy as np
import os
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
events = pd.read_csv('data/train_events.csv')
events.head()

Unnamed: 0,series_id,night,event,step,timestamp
0,038441c925bb,1,onset,4992.0,2018-08-14T22:26:00-0400
1,038441c925bb,1,wakeup,10932.0,2018-08-15T06:41:00-0400
2,038441c925bb,2,onset,20244.0,2018-08-15T19:37:00-0400
3,038441c925bb,2,wakeup,27492.0,2018-08-16T05:41:00-0400
4,038441c925bb,3,onset,39996.0,2018-08-16T23:03:00-0400


In [3]:
series = pd.read_parquet("data/train_series.parquet", engine='fastparquet')

In [4]:
ground_truths = pd.read_csv('data/train_events.csv')

In [5]:
series.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,038441c925bb,0.0,2018-08-14T15:30:00-0400,2.6367,0.0217
1,038441c925bb,1.0,2018-08-14T15:30:05-0400,2.6368,0.0215
2,038441c925bb,2.0,2018-08-14T15:30:10-0400,2.637,0.0216
3,038441c925bb,3.0,2018-08-14T15:30:15-0400,2.6368,0.0213
4,038441c925bb,4.0,2018-08-14T15:30:20-0400,2.6368,0.0215


In [6]:
unique_ids = series['series_id'].unique()
curr_50_series_ids = unique_ids[:50]
first_50_series = series[series['series_id'].isin(curr_50_series_ids)]

In [7]:
def sliding_window(df, N=10):
    # Calculate rolling mean and std for 'anglez'
    df['anglez_mean'] = df['anglez'].rolling(window=N).mean()
    df['anglez_std'] = df['anglez'].rolling(window=N).std()

    # Calculate rolling mean and std for 'enmo'
    df['enmo_mean'] = df['enmo'].rolling(window=N).mean()
    df['enmo_std'] = df['enmo'].rolling(window=N).std()

    return df.iloc[N-1:50000].copy()

In [8]:
def label_event(row, ground_truths, proximity=10):
    relevant_truths = ground_truths[ground_truths['series_id'] == row['series_id']]
    for _, truth in relevant_truths.iterrows():
        if abs(row['step'] - truth['step']) <= proximity:
            if row['enmo_mean'] < 0.03 and truth['event'] == 'onset':
                return 'onset'
            elif row['anglez_std'] > 1.1 and truth['event'] == 'wakeup':
                return 'wakeup'
    return 'none'

In [9]:
output_dir = "OutputData/series_chunks"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [10]:
for series_id in curr_50_series_ids:
    # Extract data for this series_id
    series_data = first_50_series[first_50_series['series_id'] == series_id].copy()
    
    # Apply sliding window
    series_data_processed = sliding_window(series_data)

    series_data_processed['event'] = series_data_processed.apply(lambda row: label_event(row, ground_truths), axis=1)

    display(series_data_processed.iloc[10900: 10940])
    
    # Save the processed data as a chunk
    output_path = os.path.join(output_dir, f'series_{series_id}.parquet')
    series_data_processed.to_parquet(output_path)
    print(f"Saved series {series_id} to {output_path}")

Unnamed: 0,series_id,step,timestamp,anglez,enmo,anglez_mean,anglez_std,enmo_mean,enmo_std,event
10909,038441c925bb,10909.0,2018-08-15T06:39:05-0400,-63.177898,0.0203,-63.129109,0.059627,0.02041,0.000129,none
10910,038441c925bb,10910.0,2018-08-15T06:39:10-0400,-63.108101,0.0206,-63.124769,0.059395,0.02042,0.00014,none
10911,038441c925bb,10911.0,2018-08-15T06:39:15-0400,-63.192501,0.0201,-63.126229,0.061004,0.0204,0.00017,none
10912,038441c925bb,10912.0,2018-08-15T06:39:20-0400,-63.139,0.0206,-63.12902,0.060873,0.02043,0.000177,none
10913,038441c925bb,10913.0,2018-08-15T06:39:25-0400,-63.028702,0.0202,-63.12837,0.06201,0.02043,0.000177,none
10914,038441c925bb,10914.0,2018-08-15T06:39:30-0400,-63.122002,0.0203,-63.12308,0.059817,0.02042,0.000181,none
10915,038441c925bb,10915.0,2018-08-15T06:39:35-0400,-63.155701,0.0207,-63.12086,0.057939,0.02044,0.000201,none
10916,038441c925bb,10916.0,2018-08-15T06:39:40-0400,-63.128799,0.0204,-63.11625,0.054917,0.02043,0.0002,none
10917,038441c925bb,10917.0,2018-08-15T06:39:45-0400,-63.0961,0.0202,-63.12052,0.051008,0.0204,0.000211,none
10918,038441c925bb,10918.0,2018-08-15T06:39:50-0400,-63.155701,0.02,-63.13045,0.046614,0.02034,0.000232,none


Saved series 038441c925bb to OutputData/series_chunks\series_038441c925bb.parquet


KeyboardInterrupt: 

In [11]:
output_dir = "OutputData/series_chunks"

series_files = [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.endswith('.parquet')]
dfs_to_join = [pd.read_parquet(file) for file in series_files]
df = pd.concat(dfs_to_join, ignore_index=True)

In [14]:
X = df[['enmo_mean', 'enmo_std', 'anglez_mean', 'anglez_std']]
y = df['event']

# Split data into training and test sets
X_train, X_test, y_train, y_test, series_id_train, series_id_test, step_train, step_test = train_test_split(
    X, y, df['series_id'], df['step'], test_size=0.2, random_state=42)

# Initialize and train Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)

onset_prob = y_prob[:, 0]
wakeup_prob = y_prob[:, 1]

onset_num = y_prob[:, 0].sum()
wakeup_num = y_prob[:, 1].sum()
total = len(y_prob)
print(onset_num, wakeup_num, total)

# Create the output dataframe
output = pd.DataFrame({
    'row_id': range(len(y_test)),
    'series_id': series_id_test,
    'step': step_test,
    'event': y_pred,
    'score': np.where(y_pred == 'onset', onset_prob, wakeup_prob)
})

# # Drop duplicates, keeping only the top scored event for each series_id
# output_uniques = output.drop_duplicates(subset=['series_id', 'event'], keep='first')

# # Find series_ids that don't have 'onset' or 'wakeup' and add them
# unique_series_ids = output['series_id'].unique()

# # For 'onset'
# missing_onset_series = set(unique_series_ids) - set(output_uniques[output_uniques['event'] == 'onset']['series_id'])
# for series_id in missing_onset_series:
#     new_row = pd.DataFrame({'series_id': [series_id], 'event': ['onset'], 'step': [0], 'score': [0.0]})
#     output_uniques = pd.concat([output_uniques, new_row], ignore_index=True)

# # For 'wakeup'
# missing_wakeup_series = set(unique_series_ids) - set(output_uniques[output_uniques['event'] == 'wakeup']['series_id'])
# for series_id in missing_wakeup_series:
#     new_row = pd.DataFrame({'series_id': [series_id], 'event': ['wakeup'], 'step': [0], 'score': [0.0]})
#     output_uniques = pd.concat([output_uniques, new_row], ignore_index=True)
    
output = output[output['event'] != 'none']

# Finally, sort values for a cleaner look
output_final = output.sort_values(by=['series_id', 'step', 'event']).reset_index(drop=True)


# Save the output dataframe
output_path = "OutputData/rf_submission.csv"
output_final.to_csv(output_path, index=False)

9983.3 11.600000000000001 9999


# Testing the Evaluation metric

Here I'm checking to see the evaluation metric works as I would expect it to work properly

In [15]:
import evaluations

In [16]:
# Lets load up sample submissions
sample = pd.read_csv("OutputData/rf_submission.csv", index_col='row_id')

In [17]:
sample

Unnamed: 0_level_0,series_id,step,event,score
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1861,038441c925bb,4994.0,onset,0.1
5258,038441c925bb,4997.0,onset,0.09
8886,038441c925bb,10935.0,wakeup,0.0
7229,038441c925bb,10936.0,wakeup,0.0
9846,038441c925bb,39988.0,onset,0.28
9195,038441c925bb,39991.0,onset,0.38
7653,038441c925bb,39996.0,onset,0.41
4594,038441c925bb,39997.0,onset,0.41
9785,038441c925bb,39999.0,onset,0.29
9952,038441c925bb,40000.0,onset,0.32


In [18]:
ground_truths = events[['series_id', 'event', 'step']]
ground_truths = ground_truths.dropna()

# Get the first series_id
first_series_id = ground_truths['series_id'].iloc[0]

# Filter rows that have this series_id
selected_rows = ground_truths[ground_truths['series_id'] == first_series_id]

In [19]:
def scoreIt(preds_df: pd.DataFrame, targs_df: pd.DataFrame):
    tol = [12, 36, 60, 90, 120, 150, 180, 240, 300, 360]
    tol = [float(i) for i in tol]
    tols = {
        'onset': tol,
        'wakeup': tol
    }
    return evaluations.score(
        solution=targs_df,
        submission=preds_df,
        tolerances=tols,
        series_id_column_name='series_id',
        time_column_name='step',
        event_column_name='event',
        score_column_name='score',
    )

        

In [20]:
scoreIt(sample, selected_rows)

0.03289473684210526