In [10]:
import sys
sys.path.append('../')

from baselines.FixTimeRegressor import FixTimeRegressor

import pandas as pd
import tqdm
import numpy as np

import pyarrow.parquet as pq

In [11]:
# Open the raw training series as a ParquetFile handle; the loading cell
# iterates over it in record batches.
train_file = pq.ParquetFile(
    '../data/raw/train_series.parquet',
)

In [12]:
# Series IDs used to filter the training data down to a handful of participants.
participant_series_ids = [
    '038441c925bb',
    '03d92c9f6f8a',
    '0402a003dae9',
    '04f547b8017d',
    '05e1944c3818',
    '08db4255286f',
]

In [4]:
def reduce_memory(df):
    """Downcast numeric columns of ``df`` in place to shrink its memory footprint.

    ``uint32`` columns are narrowed to ``uint8`` or ``uint16`` and ``float32``
    columns to ``float16`` whenever their observed minimum and maximum fit the
    target type's range (bounds inclusive). A ``series_id`` column, if present,
    is converted to ``category``. Memory usage before and after the conversion
    is printed.

    Note: the ``float16`` check only compares the value range. ``float16``
    stores fewer significant digits than ``float32``, so values may be rounded.

    Args:
        df: pandas DataFrame to optimise. It is modified in place.

    Returns:
        The same DataFrame object, with the narrowed dtypes.
    """
    start_memory = df.memory_usage().sum() / 1024**2
    print(f'Memory usage before cleanup is {start_memory:.2f} MB')

    for col in tqdm.tqdm(df.columns):
        dtype_name = str(df[col].dtype)
        # Only these two dtypes are ever downcast; skipping everything else
        # avoids min/max on columns where it is wasted work or unsupported
        # (e.g. unordered categoricals raise TypeError on max()).
        if dtype_name not in ('uint32', 'float32'):
            continue

        max_value = df[col].max()
        min_value = df[col].min()

        if dtype_name == 'uint32':
            # Inclusive bounds: an unsigned column almost always has min == 0,
            # which a strict comparison against iinfo.min (0) would reject.
            for target in (np.uint8, np.uint16):
                limits = np.iinfo(target)
                if min_value >= limits.min and max_value <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            limits = np.finfo(np.float16)
            # Comparisons with NaN (empty / all-NaN column) are False, so such
            # columns are left untouched.
            if min_value >= limits.min and max_value <= limits.max:
                df[col] = df[col].astype(np.float16)

    # Convert once, after the loop, and only when the column actually exists.
    if 'series_id' in df.columns:
        df['series_id'] = df['series_id'].astype('category')

    end_memory = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after cleanup is {end_memory:.2f} MB')

    # Guard against a zero-sized starting frame to avoid dividing by zero.
    if start_memory:
        improvement = (start_memory - end_memory) / start_memory * 100
    else:
        improvement = 0.0
    print(f'Memory usage improved by {improvement:.2f}%')

    return df

In [20]:
# Stream the parquet file batch by batch and keep only the rows whose
# series_id is in participant_series_ids. The filtered pieces are collected in
# a list and concatenated once at the end: concatenating onto the growing
# frame inside the loop re-copies every previously kept row on each batch.
selected_batches = []
for batch in tqdm.tqdm(train_file.iter_batches()):
    df = batch.to_pandas()
    selected_batches.append(df.loc[df['series_id'].isin(participant_series_ids)])

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when the file yielded no batches.
dataframe = pd.concat(selected_batches, axis=0) if selected_batches else pd.DataFrame()

1953it [07:34,  4.29it/s]


In [6]:
# Baseline model configured with two fixed clock times.
# NOTE(review): presumably the first is the wake-up time and the second the
# sleep-onset time — confirm against FixTimeRegressor's signature.
regressor = FixTimeRegressor(
    '06:00:00',
    '22:00:00',
)

In [21]:
# Run the fixed-time baseline over the filtered participant data.
results = regressor.predict(
    dataframe,
)

100%|██████████| 2989980/2989980 [11:59<00:00, 4152.83it/s]


In [22]:
# Write the baseline predictions to a comma-separated file, omitting the index.
results.to_csv(
    '../results/baseline_submission.csv',
    sep=',',
    index=False,
)