In [None]:
import pandas as pd
import numpy as np
import time
import random
import lightgbm as lgb
import time
from multiprocessing import Process, Pool
import pickle
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc

In [None]:
from process import map_feature

In [None]:
# Directory layout: every other location is derived from the root data folder.
FOLDER = '../../data_kaggle/champs/'
OUTPUT = '{}out/'.format(FOLDER)   # outputs written under the data folder
TEMP = '{}temp/'.format(OUTPUT)    # per-molecule merged-feature pickles
MAP = '{}map/'.format(OUTPUT)      # map pickles

In [None]:
# original input
df_train = pd.read_csv(FOLDER+'train.csv')
mols_train = df_train['molecule_name'].unique()

# map_test_data() iterates over `mols_test`, so it has to exist before the
# test pool is started. Only the molecule names are needed, so read just that
# column rather than loading the whole test file.
# NOTE(review): assumes test.csv has a 'molecule_name' column like train.csv.
mols_test = pd.read_csv(FOLDER+'test.csv', usecols=['molecule_name'])['molecule_name'].unique()

In [None]:
# Date stamps embedded in the file names used by this notebook.
DATA_DATE = '20190721'  # prefix of the features_{train,test} pickles
MAP_DATE = '20190728'   # prefix of the map_{train,test} pickles
GEN_DATE = '20190817'   # prefix of the merged per-molecule pickles written here

In [None]:
# input files: date-stamped pickles that this notebook reads
map_train = '{}{}_map_train.pickle'.format(MAP, MAP_DATE)
map_test = '{}{}_map_test.pickle'.format(MAP, MAP_DATE)

first_train = '{}{}_features_train.pickle'.format(OUTPUT, DATA_DATE)
first_test = '{}{}_features_test.pickle'.format(OUTPUT, DATA_DATE)

In [None]:
# Output path templates. The '{}' placeholder is filled in later with
# str.format(<molecule name>), producing one pickle file per molecule.
merged_train_temp = TEMP + GEN_DATE + '_' + 'merged_features_train_{}.pickle'
merged_test_temp = TEMP + GEN_DATE + '_' + 'merged_features_test_{}.pickle'

In [None]:
# Load the training map frame from its pickle.
# NOTE: unpickling can execute code stored in the file; only load trusted files.
df_map_train = pd.read_pickle(map_train)
# Separate frame indexed by molecule name; this is the object handed to
# map_feature() by the worker function.
# NOTE(review): it is created before df_map_train's columns are renamed and
# pruned further down, so it keeps the original labels and every column.
# Confirm that this is what map_feature expects.
df_map_train_idx = df_map_train.set_index('molecule_name')

In [None]:
# Preview the map frame as loaded (columns still carry their original labels).
df_map_train.head()

In [None]:
# Type codes used as prefixes when building the map frame's column labels.
bond_types = ['1JHC', '1JHN', '2JHH', '2JHC', '2JHN', '3JHH', '3JHC', '3JHN']

In [None]:
# Replacement column labels for the map frame, in positional order:
#   'target', every '<type>_<slot>_x', every '<type>_<slot>_y', 'id', 'molecule_name'
# with three slots (0..2) per type code.
cols_0 = ['target'] + ['{}_{}_x'.format(b, i) for b in bond_types for i in range(3)]
cols_1 = ['{}_{}_y'.format(b, i) for b in bond_types for i in range(3)]
cols = cols_0 + cols_1 + ['id', 'molecule_name']

# Positional rename: the frame must have exactly len(cols) columns.
df_map_train.columns = cols
df_map_train.head()

In [None]:
# Columns to discard from the map frames. Each entry pairs a type code with
# the slot indices to drop; for every type the '_x' labels are emitted first,
# followed by the matching '_y' labels.
del_cols = [
    '{}_{}_{}'.format(prefix, slot, side)
    for prefix, slots in (
        ('1JHC', (1, 2)),
        ('1JHN', (1, 2)),
        ('2JHH', (2,)),
        ('2JHC', (2,)),
        ('2JHN', (1, 2)),
        ('3JHH', (2,)),
        ('3JHC', (2,)),
        ('3JHN', (1, 2)),
    )
    for side in ('x', 'y')
    for slot in slots
]

In [None]:
# Remove the unwanted columns from df_map_train in place, one at a time.
# Not idempotent: running this cell a second time raises KeyError because the
# columns are already gone.
for col in del_cols:
    del df_map_train[col]

In [None]:
# Force a garbage-collection pass after dropping the columns above.
gc.collect()

In [None]:
# Load the training feature object that is passed to map_feature().
# NOTE: pickle.load can execute code stored in the file; only load trusted files.
with open(first_train, 'rb') as f:
    features_train = pickle.load(f)

In [None]:
# Number of chunks the molecule list is split into; also used as the size of
# the worker pool (one chunk per worker process).
num_div = 8

In [None]:
def map_train_data(temp_no):
    """Merge features for one chunk of training molecules and pickle each result.

    The module-level ``mols_train`` array is cut into ``num_div`` consecutive
    chunks of equal size; the last chunk (``temp_no == num_div - 1``) also
    takes the leftover molecules. For every molecule in the selected chunk,
    ``map_feature`` is called and its result is written to the path
    ``merged_train_temp.format(<molecule name>)``.

    Args:
        temp_no: Zero-based chunk index.
    """
    chunk_size, remainder = divmod(len(mols_train), num_div)
    begin = chunk_size * temp_no
    end = begin + chunk_size
    if temp_no == num_div - 1:
        # The final chunk absorbs whatever did not divide evenly.
        end += remainder

    for mol in tqdm(mols_train[begin:end]):
        merged = map_feature(df_map_train_idx, features_train, mol)
        with open(merged_train_temp.format(mol), 'wb') as out_file:
            pickle.dump(merged, out_file)

In [None]:
# Process all training chunks in parallel and report the wall-clock time.
start = time.time()
with Pool(processes=num_div) as p:
    # One task per chunk index 0 .. num_div - 1.
    p.map(map_train_data, range(num_div))

elapsed_time = time.time() - start
print("total elapsed_time:{0}".format(elapsed_time) + "[sec]")

In [None]:
# Load the test map frame.
# NOTE: unpickling can execute code stored in the file; only load trusted files.
df_map_test = pd.read_pickle(map_test)
# Separate frame indexed by molecule name (taken before the rename below, as in
# the training flow); this is the object handed to map_feature().
df_map_test_idx = df_map_test.set_index('molecule_name')

# Apply the same positional column labels and drop the same columns as for train.
df_map_test.columns = cols
for col in del_cols:
    del df_map_test[col]

# Preview goes last: a bare expression in the middle of a cell is evaluated
# and discarded, so only the final expression is rendered.
df_map_test.head()

In [None]:
# Load the test feature object that is passed to map_feature().
# NOTE: pickle.load can execute code stored in the file; only load trusted files.
with open(first_test, 'rb') as f:
    features_test = pickle.load(f)

In [None]:
def map_test_data(temp_no):
    """Merge features for one chunk of test molecules and pickle each result.

    The module-level ``mols_test`` array is cut into ``num_div`` consecutive
    chunks of equal size; the last chunk (``temp_no == num_div - 1``) also
    takes the leftover molecules. For every molecule in the selected chunk,
    ``map_feature`` is called and its result is written to the path
    ``merged_test_temp.format(<molecule name>)``.

    NOTE(review): ``mols_test`` is read from module scope and must be defined
    before this function runs.

    Args:
        temp_no: Zero-based chunk index.
    """
    chunk_size, remainder = divmod(len(mols_test), num_div)
    begin = chunk_size * temp_no
    end = begin + chunk_size
    if temp_no == num_div - 1:
        # The final chunk absorbs whatever did not divide evenly.
        end += remainder

    for mol in tqdm(mols_test[begin:end]):
        merged = map_feature(df_map_test_idx, features_test, mol)
        with open(merged_test_temp.format(mol), 'wb') as out_file:
            pickle.dump(merged, out_file)

In [None]:
# Process all test chunks in parallel and report the wall-clock time.
start = time.time()
with Pool(processes=num_div) as p:
    # One task per chunk index 0 .. num_div - 1.
    p.map(map_test_data, range(num_div))

elapsed_time = time.time() - start
print("total elapsed_time:{0}".format(elapsed_time) + "[sec]")