In [1]:
import sys
sys.path.append('../')

from baselines.FixTimeRegressor import FixTimeRegressor
from baselines.TimeRangeRegressor import TimeRangeRegressor

import pandas as pd
import tqdm
import numpy as np

import pyarrow.parquet as pq

In [14]:
# Baseline built from two clock-time ranges: 05:00:00-08:00:00 and 21:00:00-23:59:59.
# NOTE(review): how TimeRangeRegressor interprets each range is not visible in this
# notebook - presumably a wakeup window and an onset window; confirm in
# baselines/TimeRangeRegressor.py.
# NOTE(review): `regressor` is rebound to a FixTimeRegressor further down, so the
# result of `regressor.predict(...)` depends on which cell ran last in the kernel.
regressor = TimeRangeRegressor(['05:00:00', '08:00:00'], ['21:00:00', '23:59:59'])


In [3]:
# Open the raw training series as a pyarrow ParquetFile handle.
# NOTE(review): within this notebook `train_file` is only referenced by the
# commented-out batch-iteration cell; the live loading path uses pd.read_parquet.
train_file = pq.ParquetFile('../data/raw/train_series.parquet')

In [4]:
# Series to load for the baseline experiments.
participant_series_ids = ['038441c925bb', '03d92c9f6f8a', '0402a003dae9', '04f547b8017d', '05e1944c3818', '08db4255286f']

# Read each series with a row filter on `series_id`, then concatenate ONCE.
# Growing a DataFrame with pd.concat inside a loop re-copies every previously
# loaded row on each iteration; collecting the pieces first avoids that and also
# avoids concatenating onto an empty DataFrame. Rows stay in the order of
# `participant_series_ids` and keep their original index values, as before.
train_series = pd.concat(
    [
        pd.read_parquet("../data/raw/train_series.parquet", filters=[('series_id', '=', series_id)])
        for series_id in participant_series_ids
    ],
    axis=0,
)

In [5]:
def reduce_memory(df):
    """Downcast columns of ``df`` in place to smaller dtypes and report the saving.

    Rules applied per column:
      * ``uint32`` -> ``uint8`` or ``uint16`` when the column maximum fits
        (bounds are inclusive; an unsigned minimum is always >= 0).
      * ``float32`` -> ``float16`` when min and max are inside the float16 range.
        Only the range is checked, so values are rounded to float16 precision.
      * ``series_id`` (if present) -> ``category``.
    All other columns are left untouched.

    Memory usage before and after, and the relative improvement, are printed.
    The sizes come from ``DataFrame.memory_usage()`` with its default arguments.

    Args:
        df: pandas DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object, after downcasting.
    """
    start_memory = df.memory_usage().sum() / 1024**2
    print(f'Memory usage before cleanup is {start_memory:.2f} MB')

    for col in tqdm.tqdm(df.columns):
        dtype_name = str(df[col].dtype)
        # min/max are only computed for the dtypes we can downcast. This skips
        # needless scans and avoids calling .max() on an unordered categorical
        # (which raises TypeError, e.g. when the function is run a second time).
        if dtype_name == 'uint32':
            max_value = df[col].max()
            # Try the narrowest unsigned type first.
            for candidate in (np.uint8, np.uint16):
                if max_value <= np.iinfo(candidate).max:
                    df[col] = df[col].astype(candidate)
                    break
        elif dtype_name == 'float32':
            limits = np.finfo(np.float16)
            if df[col].min() >= limits.min and df[col].max() <= limits.max:
                df[col] = df[col].astype(np.float16)

    # Convert once, after the loop, and only when the column exists.
    if 'series_id' in df.columns:
        df['series_id'] = df['series_id'].astype('category')

    end_memory = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after cleanup is {end_memory:.2f} MB')

    # Guard the ratio against a zero starting size.
    improvement = (start_memory - end_memory) / start_memory * 100 if start_memory else 0.0
    print(f'Memory usage improved by {improvement:.2f}%')

    return df

In [6]:
# dataframe = pd.DataFrame()
# for batch in tqdm.tqdm(train_file.iter_batches()):
#     df = batch.to_pandas()
#     dataframe = pd.concat([dataframe, df.loc[df['series_id'].isin(participant_series_ids)]], axis=0)

In [15]:
# Run the currently bound `regressor` over the loaded series and preview the output.
# The recorded run shows a progress bar of 726105 iterations taking about 41 s.
# NOTE(review): `regressor` and `results` are both reassigned by later cells, so this
# cell's outcome depends on execution order - re-run the TimeRangeRegressor cell first.
results = regressor.predict(train_series)

results.head()

100%|██████████| 726105/726105 [00:41<00:00, 17499.18it/s]


Unnamed: 0,row_id,series_id,step,event,score
0,0,038441c925bb,10680,wakeup,0.5
1,1,038441c925bb,10980,wakeup,0.5
2,2,038441c925bb,11760,wakeup,0.5
3,3,038441c925bb,28620,wakeup,0.5
4,4,038441c925bb,44940,wakeup,0.5


In [16]:
# Persist the TimeRangeRegressor predictions.
# index=False keeps the DataFrame index out of the file: without it to_csv writes an
# extra unnamed leading column. This matches the other export in this notebook
# (baseline_submission.csv), which is also written with index=False.
results.to_csv('../results/baseline_submission_1.csv', index=False)

In [9]:
# Second baseline, configured with two fixed clock times: 06:00:00 and 22:00:00.
# NOTE(review): presumably the fixed wakeup and onset times - confirm against
# baselines/FixTimeRegressor.py.
# This rebinds `regressor`, replacing the TimeRangeRegressor instance created earlier.
regressor = FixTimeRegressor('06:00:00', '22:00:00')

In [10]:
# Predict with the currently bound `regressor`; this overwrites the earlier `results`.
# The recorded run shows a progress bar of 346 iterations.
results = regressor.predict(train_series)

100%|██████████| 346/346 [00:00<00:00, 16227.72it/s]


In [11]:
# Write the current `results` frame as a comma-separated file without the index column.
results.to_csv('../results/baseline_submission.csv', sep=',', index=False)