In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
# NOTE(review): force_remount=True is passed on every run — presumably so that
# re-running this cell re-mounts an already-mounted drive; confirm against the
# Colab documentation.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
import os

# Folder that holds every input and output artifact of this notebook.
# NOTE(review): the path is relative, so it resolves against the current working
# directory — presumably /content on Colab; confirm before running elsewhere.
DIR = 'drive/MyDrive/sber/'


def _in_dir(filename):
    """Return the path of `filename` inside DIR."""
    return os.path.join(DIR, filename)


# Raw input.
DATA_PATH = _in_dir('train.parquet')
CLUSTERS_PATH = _in_dir('clusters.parquet')

# Training histories for the two validation folds and for the final test run.
TRAIN_VAL1_PATH = _in_dir('train_val1.parquet')
TRAIN_VAL2_PATH = _in_dir('train_val2.parquet')
TRAIN_TEST_PATH = _in_dir('train_test.parquet')

# Hold-out targets of the two validation folds.
VAL1_PATH = _in_dir('val1.parquet')
VAL2_PATH = _in_dir('val2.parquet')

# Id lists and the user-id lookup table.
TEST_IDS_PATH = _in_dir('test_ids.csv')
VAL1_IDS_PATH = _in_dir('val1_ids.csv')
USER_DECODER_PATH = _in_dir('user_decoder.pkl')
VAL1_USER_IDS_PATH = _in_dir('val1_user_ids.parquet')
VAL2_USER_IDS_PATH = _in_dir('val2_user_ids.parquet')

In [None]:
import pandas as pd
from datetime import datetime
import numpy as np
import pickle

# Load the raw order log and compact its columns into small integer dtypes.
data = pd.read_parquet(DATA_PATH)

# Target dtype for each categorical id column that gets re-labelled below.
col_type_map = {
    'retailer_id': 'uint8',
    'city_id': 'uint8',
    'store_id': 'uint16',
}

# Re-label every categorical id as 0..k-1 in order of first appearance.
for col in col_type_map:
    uniques = data[col].unique()
    # unique() counts NaN while nunique() does not, so pairing the two would
    # silently drop one label from the mapping; refuse missing values instead.
    if data[col].isna().any():
        raise ValueError(f'{col} contains missing values and cannot be encoded')
    # An integer downcast wraps around silently, which would merge distinct ids.
    if len(uniques) > np.iinfo(col_type_map[col]).max + 1:
        raise ValueError(
            f'{col} has {len(uniques)} distinct values, '
            f'too many for {col_type_map[col]}'
        )
    mapping = dict(zip(uniques, range(len(uniques))))
    data[col] = data[col].map(mapping).astype(col_type_map[col])

# NOTE(review): these two casts also wrap silently on values above 65535 —
# confirm the value ranges of the raw columns.
data['cluster_id'] = data['cluster_id'].astype('uint16')
data['product_quantity'] = data['product_quantity'].astype('uint16')

# Time index in buckets of 10**12 ns (1000 s), counted from the earliest bucket.
# Timestamps are normalised to nanoseconds first: a parquet file may be read at
# a coarser resolution (e.g. microseconds), in which case the raw integers would
# not be nanoseconds and the bucket size would change silently.
data['dt'] = (
    data['completed_at'].values.astype('datetime64[ns]').astype(np.int64) // 10 ** 12
)
data['dt'] = data['dt'] - data['dt'].min()
# uint16 holds at most 65535 buckets (about 758 days); fail loudly beyond that
# instead of letting the cast wrap around.
if data['dt'].max() > np.iinfo(np.uint16).max:
    raise ValueError('time span of completed_at does not fit into uint16 buckets')
data['dt'] = data['dt'].astype(np.uint16)
data = data.sort_values('dt')

# Distinct raw ids in order of first appearance in the dt-sorted frame;
# position i of this array is the raw id behind integer user_id i.
user_decoder = data['id'].unique()

# Inverse lookup: raw id -> its position in user_decoder.
user_encoder = {
    raw_id: position
    for raw_id, position in zip(user_decoder, np.arange(user_decoder.size))
}

# Compact integer user key used by all exported tables.
data['user_id'] = data['id'].map(user_encoder).astype(np.int32)

# Cut-off timestamps (midnight) for the two validation folds.
val1_date_split = pd.to_datetime(datetime(2021, 9, 1))
val2_date_split = pd.to_datetime(datetime(2021, 8, 1))

# Fold 1: history is everything up to and including the 1 Sep cut-off; the
# hold-out is everything after it, limited to ids that occur in that history.
train_val1 = data.loc[data['completed_at'].le(val1_date_split)]
val1 = data.loc[
    data['completed_at'].gt(val1_date_split)
    & data['id'].isin(train_val1['id'])
]

# Fold 2: history ends at the 1 Aug cut-off; the hold-out is the window between
# the two cut-offs, again limited to ids that occur in the fold's history.
train_val2 = data.loc[data['completed_at'].le(val2_date_split)]
val2 = data.loc[
    data['completed_at'].gt(val2_date_split)
    & data['completed_at'].le(val1_date_split)
    & data['id'].isin(train_val2['id'])
]

# Columns kept in every exported training history.
columns = [
    'user_id', 'order_id', 'dt', 'cluster_id',
    'product_quantity', 'retailer_id', 'city_id', 'store_id',
    'product_price', 'product_discount',
]

# Columns kept in the validation targets: one row per distinct (user, cluster).
columns_val = ['user_id', 'cluster_id']

# Training histories: the two validation folds, then the full data set.
for history, target_path in (
    (train_val1, TRAIN_VAL1_PATH),
    (train_val2, TRAIN_VAL2_PATH),
    (data, TRAIN_TEST_PATH),
):
    history[columns].to_parquet(target_path, index=False)

# Validation targets, de-duplicated before writing.
for holdout, target_path in ((val1, VAL1_PATH), (val2, VAL2_PATH)):
    holdout[columns_val].drop_duplicates().to_parquet(target_path, index=False)

# Distinct raw ids that occur in the val1 hold-out.
val1['id'].drop_duplicates().to_csv(VAL1_IDS_PATH, index=False)

# Persist the lookup array in which position i is the raw id of user_id i.
# The context manager closes the file as soon as the dump finishes, instead of
# leaving an open handle to be flushed whenever it is garbage-collected.
with open(USER_DECODER_PATH, 'wb') as decoder_file:
    pickle.dump(user_decoder, decoder_file)

# Distinct encoded user ids that occur in each validation hold-out.
val1[['user_id']].drop_duplicates().to_parquet(VAL1_USER_IDS_PATH, index=False)
val2[['user_id']].drop_duplicates().to_parquet(VAL2_USER_IDS_PATH, index=False)

# Downcast the cluster table to narrower dtypes and write it back over the same
# file it was read from.
# NOTE(review): int16/uint16 casts wrap silently on out-of-range values —
# confirm cluster_size and cluster_id fit.
clusters = pd.read_parquet(CLUSTERS_PATH).astype({
    'd_max': np.float32,
    'd_min': np.float32,
    'd_mean': np.float32,
    'd_median': np.float32,
    'cluster_size': np.int16,
    'cluster_id': np.uint16,
})
clusters.to_parquet(CLUSTERS_PATH, index=False)