In [1]:
import pandas as pd
import numpy as np
import os
import warnings
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
# Raise pandas' display limits so large frames are not truncated when rendered:
# up to 500 rows and 500 columns, with a 1000-character wide text layout.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [2]:
# Silence every warning category for the rest of the session.
# NOTE: this also hides deprecation and data-quality warnings from libraries.
warnings.simplefilter('ignore')

# Remove the column display cap entirely (this replaces the 500-column limit
# configured earlier in the notebook), so wide frames are shown in full.
pd.set_option('display.max_columns', None)

def process_file(filename, dirname):
    """Summarise one partition of the time-series dataset as a flat vector.

    Reads ``<dirname>/<filename>/part-0.parquet``, discards its ``step``
    column, and flattens the table returned by ``DataFrame.describe()`` for
    the remaining columns into one dimension (row by row).

    Parameters
    ----------
    filename : str
        Name of the partition directory, of the form ``<key>=<value>``.
    dirname : str
        Directory that contains the partition directories.

    Returns
    -------
    tuple
        ``(stats, value)`` where ``stats`` is a 1-D NumPy array holding the
        flattened ``describe()`` table and ``value`` is everything after the
        first ``=`` in ``filename``.

    Raises
    ------
    KeyError
        If the parquet file has no ``step`` column.
    IndexError
        If ``filename`` contains no ``=``.
    """
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    # Non-mutating drop: build a new frame instead of editing the loaded one.
    features = df.drop(columns='step')
    # Split on the FIRST '=' only, so a value that itself contains '=' is
    # returned whole rather than truncated at its first '='.
    partition_value = filename.split('=', 1)[1]
    return features.describe().to_numpy().reshape(-1), partition_value

def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per partition found in ``dirname``.

    Every ``<key>=<value>`` sub-directory of ``dirname`` is passed to
    ``process_file`` on a thread pool (with a tqdm progress bar). The returned
    stat vectors become columns ``stat_0 .. stat_{k-1}`` and the partition
    values become the ``id`` column.

    Parameters
    ----------
    dirname : str
        Directory containing the ``<key>=<value>`` partition directories.

    Returns
    -------
    pd.DataFrame
        One row per partition, ordered by partition directory name. When no
        partition directory is found, an empty frame with only an ``id``
        column is returned so that a later merge on ``id`` still works.
    """
    # os.listdir returns every entry in arbitrary order. Keep only partition
    # directories (stray files such as '.DS_Store' or '_SUCCESS' would make
    # process_file fail) and sort them so the row order is reproducible.
    ids = sorted(
        entry
        for entry in os.listdir(dirname)
        if '=' in entry and os.path.isdir(os.path.join(dirname, entry))
    )

    # Nothing to summarise: zip(*results) and stats[0] below need >= 1 result.
    if not ids:
        return pd.DataFrame(columns=['id'])

    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))

    # Transpose [(stats, id), ...] into (all stats, all ids).
    stats, indexes = zip(*results)
    # Column count is taken from the first vector; all are expected to match.
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

# Single place to point the notebook at the dataset directory; every path
# below is derived from it instead of repeating the absolute path.
DATA_DIR = 'C:/MY_Masters_Project/child-mind-institute-problematic-internet-use'

# Load the tabular datasets
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Load the per-id summary statistics of the time series data
train_ts = load_time_series(f'{DATA_DIR}/series_train.parquet')
test_ts = load_time_series(f'{DATA_DIR}/series_test.parquet')

# Names of the time-series feature columns (every column except the join key)
time_series_cols = train_ts.columns.tolist()
time_series_cols.remove('id')

# Left-join so rows without time-series data are kept (their stat columns
# are filled with NaN), then drop the join key from both frames.
train = pd.merge(train, train_ts, how="left", on='id')
test = pd.merge(test, test_ts, how="left", on='id')
train = train.drop(columns='id')
test = test.drop(columns='id')

100%|██████████| 996/996 [01:49<00:00,  9.07it/s]
100%|██████████| 2/2 [00:00<00:00,  4.90it/s]
