In [9]:
## data handlers:
import pandas as pd 

## support: 
import os 
from tqdm import tqdm

In [42]:
# Directory scanned by get_x1_features(); every entry in it is read with pd.read_parquet.
PATH2PARQUETS = '../../data/graph_features'
# CSV loaded with pd.read_csv and left-joined on 'ego_id' onto BOTH the train and the test split.
# NOTE(review): the name/file say "test" although it is merged into train as well - confirm it covers train ego_ids.
PATH2TEST_CLUSTERING = '../../data/clustering_result_test.csv'

def get_x1_features(path):
    """Read every parquet file in ``path`` and stack them into one DataFrame.

    Parameters
    ----------
    path : str
        Directory whose entries are each loaded with ``pd.read_parquet``.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of all files. The per-file indices are kept
        as-is (no ``ignore_index``), so index labels may repeat.

    Raises
    ------
    ValueError
        If ``path`` contains no readable (non-hidden) entries.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible between runs and machines.
    # Dot-entries (e.g. .ipynb_checkpoints, .DS_Store) are not data files.
    file_names = sorted(
        name for name in os.listdir(path) if not name.startswith('.')
    )
    if not file_names:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f'no feature files found in {path!r}')

    frames = [
        pd.read_parquet(os.path.join(path, file_name))
        for file_name in tqdm(file_names, desc='reading x1 based features')
    ]
    return pd.concat(frames)

def preproc_x1_features(x1_features: pd.DataFrame):
    """Split the feature frame on presence of ``x1`` and fill missing values.

    Rows where ``x1`` is present form the train split; rows where ``x1`` is
    missing form the test split. In both splits missing ``t`` becomes ``-1``
    and every other missing value becomes ``0``. The (entirely empty) ``x1``
    column is removed from the test split.

    The input frame is NOT modified: the previous implementation overwrote
    ``x1_features['t']`` in place, which silently changed the caller's data.

    Parameters
    ----------
    x1_features : pd.DataFrame
        Must contain the columns ``'x1'`` and ``'t'``.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(train_data, test_data)``; ``test_data`` has no ``'x1'`` column.

    Raises
    ------
    KeyError
        If ``'x1'`` or ``'t'`` is missing from ``x1_features``.
    """
    has_x1 = x1_features['x1'].notna()

    def _fill_missing(frame):
        # `assign` returns a new frame, so neither the slice nor the caller's
        # frame is written to. 't' gets its own sentinel before the blanket 0.
        return frame.assign(t=frame['t'].fillna(-1)).fillna(0)

    train_data = _fill_missing(x1_features[has_x1])
    # Drop 'x1' first: in this split it is all-NaN, no point in filling it.
    test_data = _fill_missing(x1_features[~has_x1].drop('x1', axis=1))

    return train_data, test_data

def add_clustering_to_train_test(train_data, test_data,
                                 clustering):
    """Attach the columns of ``clustering`` to both splits via ``ego_id``.

    Each split is left-joined with ``clustering`` on ``'ego_id'``, so every
    row of the split is kept; rows whose ``ego_id`` has no match in
    ``clustering`` get NaN in the joined columns.

    Returns
    -------
    tuple
        ``(train_data, test_data)`` as new, merged DataFrames.
    """
    join_keys = ['ego_id']
    # Same join for both splits - run it once per frame instead of twice by hand.
    merged_train, merged_test = (
        split.merge(clustering,
                    left_on=join_keys,
                    right_on=join_keys,
                    how='left')
        for split in (train_data, test_data)
    )
    return merged_train, merged_test

def write_data_to_dir_with_clustering(data, type_='train',
                                      folder_name='clean_data') -> None:
    """Write ``data`` as one parquet file per distinct ``cluster`` value.

    Files are written to ``{folder_name}/{type_}/cluster_{cluster}.gz``.
    Despite the ``.gz`` suffix the files are parquet, written with
    ``DataFrame.to_parquet`` and its default compression.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a ``'cluster'`` column.
    type_ : str
        Sub-directory name (the notebook uses ``'train'`` and ``'test'``).
    folder_name : str
        Output root, relative to the current working directory unless
        absolute. May be a nested path; missing parents are created.

    Raises
    ------
    AssertionError
        If ``data`` has no ``'cluster'`` column.
    """
    assert 'cluster' in data, "data must contain a 'cluster' column"

    # makedirs(exist_ok=True) replaces the listdir()/mkdir pair: it also works
    # when folder_name is nested or absolute and is safe to re-run.
    out_dir = f'{folder_name}/{type_}'
    os.makedirs(out_dir, exist_ok=True)

    cluster_col = data['cluster']
    for cluster in tqdm(cluster_col.unique(), desc=f'writing {type_} files'):
        # NaN never compares equal to itself, so `== cluster` would select
        # nothing for rows without a cluster and silently drop them
        # (leaving an empty cluster_nan.gz). Select those rows explicitly.
        if pd.isna(cluster):
            row_mask = cluster_col.isna()
        else:
            row_mask = cluster_col == cluster
        data_sliced = data[row_mask].reset_index(drop=True)
        data_sliced.to_parquet(f'{out_dir}/cluster_{cluster}.gz',
                               index=False)
        


In [44]:
# Load all feature files and the ego_id -> cluster table.
x1_features = get_x1_features(PATH2PARQUETS) 
clustering = pd.read_csv(PATH2TEST_CLUSTERING)
# Split on presence of x1 and fill missing values.
train_data, test_data = preproc_x1_features(x1_features)
# Left-join the clustering table onto both splits by 'ego_id'.
train_data, test_data = add_clustering_to_train_test(train_data, test_data,
                                                     clustering)
# One parquet file per cluster, under clean_data/train and clean_data/test
# (default folder_name), relative to the current working directory.
write_data_to_dir_with_clustering(train_data, type_='train')
write_data_to_dir_with_clustering(test_data, type_='test')

reading x1 based features: 100%|██████████| 9/9 [00:04<00:00,  2.12it/s]
writing train files: 100%|██████████| 9/9 [00:36<00:00,  4.07s/it]
writing test files: 100%|██████████| 9/9 [00:12<00:00,  1.34s/it]
