In [1]:
import os
import pandas as pd
import numpy as np
import time
import gc
import pickle
from joblib import Parallel, delayed

In [24]:
# Notebook-wide settings.
FOLDER = '../../data_kaggle/champs/'  # directory the input CSV files are read from
NCORES = 1                            # n_jobs handed to joblib.Parallel
OUTPUT = '{}out/'.format(FOLDER)      # directory the feature pickle is written to

In [3]:
# Load the input tables. Only train.csv and structures.csv are read; the
# other reads are left commented out because none of the visible cells
# below use those tables.
# df_mulliken_charges = pd.read_csv(FOLDER + 'mulliken_charges.csv')
# df_sample =  pd.read_csv(FOLDER + 'sample_submission.csv')
# df_magnetic_shielding_tensors = pd.read_csv(FOLDER + 'magnetic_shielding_tensors.csv')
df_train = pd.read_csv(FOLDER + 'train.csv')
# df_test = pd.read_csv(FOLDER + 'test.csv')
# df_dipole_moments = pd.read_csv(FOLDER + 'dipole_moments.csv')
# df_potential_energy = pd.read_csv(FOLDER + 'potential_energy.csv')
df_structures = pd.read_csv(FOLDER + 'structures.csv')
# df_scalar_coupling_contributions = pd.read_csv(FOLDER + 'scalar_coupling_contributions.csv')
# df_scalar_coupling_contributions = pd.read_csv(FOLDER + 'scalar_coupling_contributions.csv')

In [4]:
# Index both tables by molecule name so the rows of one molecule can be
# selected with `.loc[<molecule name>]`.
df_structures_idx, df_train_idx = (
    frame.set_index('molecule_name') for frame in (df_structures, df_train)
)

In [5]:
# Distinct molecule names found in the training table.
mols = pd.unique(df_train['molecule_name'])

In [6]:
def assign_atoms_index(df_idx, molecule):
    """Return the sorted, distinct atom indices referenced by a molecule's rows.

    Parameters
    ----------
    df_idx : pandas.DataFrame
        Table indexed by molecule name that has the columns
        ``atom_index_0`` and ``atom_index_1``.
    molecule : hashable
        Index label of the molecule to look up.

    Returns
    -------
    numpy.ndarray
        1-D array with every value that occurs in either column for that
        molecule, without duplicates, in ascending order.
    """
    # A list label makes `.loc` return a DataFrame even when the molecule has
    # only one row, so no scalar special case is needed. The previous scalar
    # check compared against `np.int64` only and failed for other integer
    # dtypes such as int32.
    pair_values = df_idx.loc[[molecule], ['atom_index_0', 'atom_index_1']].values
    # np.unique flattens its input, removes duplicates and sorts.
    return np.unique(pair_values)

In [7]:
def get_dist_matrix(df_structures_idx, molecule):
    """Return the pairwise *squared* Euclidean distances between a molecule's atoms.

    Note that, despite the name, the returned values are squared distances
    (the norm is squared before returning).

    Parameters
    ----------
    df_structures_idx : pandas.DataFrame
        Table indexed by molecule name with coordinate columns ``x``, ``y``
        and ``z``.
    molecule : hashable
        Index label of the molecule to look up.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_atoms, num_atoms)``; entry ``[i, j]`` is the
        squared distance between the i-th and j-th row of the molecule.
    """
    # A list label keeps the selection two-dimensional even for a molecule
    # that has a single row.
    locs = df_structures_idx.loc[[molecule], ['x', 'y', 'z']].values
    # Broadcasting yields deltas[i, j] = locs[j] - locs[i] without
    # materialising tiled copies of the coordinates.
    deltas = locs[np.newaxis, :, :] - locs[:, np.newaxis, :]
    dist_mat = np.linalg.norm(deltas, axis=2) ** 2
    return dist_mat

In [8]:
def n_bond_cnt(df_structures_idx, mol, gap_threshold=0.2):
    """Estimate how many atoms are bonded to each nitrogen atom of a molecule.

    For every ``N`` atom the other atoms are ranked by squared distance (the
    values returned by ``get_dist_matrix``). A jump larger than
    ``gap_threshold`` between two consecutive neighbours is taken as the end
    of the bonded neighbours; at most three neighbours are counted.

    Parameters
    ----------
    df_structures_idx : pandas.DataFrame
        Structures table indexed by molecule name with columns ``atom``,
        ``atom_index``, ``x``, ``y`` and ``z``. ``atom_index`` is used as a
        row position into the distance matrix.
    mol : hashable
        Index label of the molecule.
    gap_threshold : float, optional
        Minimum jump between consecutive neighbour values that ends the
        bonded neighbours. Defaults to the previously hard-coded 0.2.

    Returns
    -------
    pandas.DataFrame
        One row per atom with columns ``atom``, ``atom_index``, ``n_bond``
        (float; 0 for atoms that are not nitrogen) and ``molecule_name``.
    """
    dist_mat = get_dist_matrix(df_structures_idx, mol)
    # List label: always a DataFrame, even for a one-row molecule.
    df_temp = df_structures_idx.loc[[mol]]
    n_others = dist_mat.shape[0] - 1  # atoms other than the one inspected

    n_idx = df_temp[df_temp['atom'] == 'N']['atom_index'].values

    bond_counts = []
    for i in n_idx:
        row = dist_mat[i]
        order = np.argsort(row)  # order[0] is the atom itself (value 0)

        if n_others < 2:
            # No second neighbour to compare against; the count cannot
            # exceed the number of other atoms.
            bond_counts.append(n_others)
        elif row[order[2]] - row[order[1]] > gap_threshold:
            bond_counts.append(1)
        elif n_others < 3 or row[order[3]] - row[order[2]] > gap_threshold:
            # Either there is no third neighbour or it is clearly farther.
            bond_counts.append(2)
        else:
            bond_counts.append(3)

    # Explicit float dtype keeps the column numeric when there is no N atom.
    df_bond = pd.DataFrame(
        {'atom_index': n_idx, 'n_bond': np.asarray(bond_counts, dtype=float)},
        columns=['atom_index', 'n_bond'],
    )

    df_temp2 = pd.merge(df_temp[['atom', 'atom_index']], df_bond, on='atom_index', how='outer').fillna(0)
    df_temp2['molecule_name'] = mol
    return df_temp2

In [20]:
def c_bond_cnt(df_structures_idx, mol, cos_threshold=-0.95):
    """Estimate how many atoms are bonded to each carbon atom of a molecule.

    The decision is geometric and uses the three nearest atoms of each ``C``:

    * if the two nearest atoms lie in (almost) opposite directions the
      carbon is counted as having 2 bonds;
    * otherwise, if the third nearest atom lies (almost) opposite to the sum
      of the first two direction vectors it is counted as having 3 bonds;
    * otherwise it is counted as having 4 bonds.

    Parameters
    ----------
    df_structures_idx : pandas.DataFrame
        Structures table indexed by molecule name with columns ``atom``,
        ``atom_index``, ``x``, ``y`` and ``z``. ``atom_index`` is used as a
        row position into the distance matrix and the coordinate array.
    mol : hashable
        Index label of the molecule.
    cos_threshold : float, optional
        Cosine below which two directions are treated as opposite. Defaults
        to the previously hard-coded -0.95.

    Returns
    -------
    pandas.DataFrame
        One row per atom with columns ``atom``, ``atom_index``, ``c_bond``
        (float; 0 for atoms that are not carbon) and ``molecule_name``.
    """
    dist_mat = get_dist_matrix(df_structures_idx, mol)
    # List label: always a DataFrame, even for a one-row molecule.
    df_temp = df_structures_idx.loc[[mol]]
    # Coordinates in the same row order as the distance matrix, so neighbour
    # positions from argsort can be used directly.
    locs = df_temp[['x', 'y', 'z']].values
    n_others = dist_mat.shape[0] - 1  # atoms other than the one inspected

    c_idx = df_temp[df_temp['atom'] == 'C']['atom_index'].values

    bond_counts = []
    for i in c_idx:
        order = np.argsort(dist_mat[i])  # order[0] is the atom itself

        if n_others < 2:
            # Not enough neighbours to measure an angle.
            bond_counts.append(n_others)
            continue

        vec_01 = locs[order[1]] - locs[i]
        vec_02 = locs[order[2]] - locs[i]
        cos_12 = np.dot(vec_01, vec_02) / np.linalg.norm(vec_01) / np.linalg.norm(vec_02)

        if cos_12 < cos_threshold or n_others < 3:
            # Two opposite neighbours, or no third atom to look at.
            bond_counts.append(2)
            continue

        vec_012 = vec_01 + vec_02
        vec_03 = locs[order[3]] - locs[i]
        # Normalise by the length of vec_03. The distance matrix stores
        # squared distances, so dividing by its entry (as was done before)
        # does not produce a cosine.
        cos_123 = np.dot(vec_012, vec_03) / np.linalg.norm(vec_012) / np.linalg.norm(vec_03)

        if cos_123 < cos_threshold:
            bond_counts.append(3)
        else:
            bond_counts.append(4)

    # Explicit float dtype keeps the column numeric when there is no C atom.
    df_bond = pd.DataFrame(
        {'atom_index': c_idx, 'c_bond': np.asarray(bond_counts, dtype=float)},
        columns=['atom_index', 'c_bond'],
    )

    df_temp2 = pd.merge(df_temp[['atom', 'atom_index']], df_bond, on='atom_index', how='outer').fillna(0)
    df_temp2['molecule_name'] = mol
    return df_temp2

In [21]:
def get_pickup_dist_matrix(df_idx, df_structures_idx, molecule, num_pickup, atoms=['H', 'C', 'N', 'O', 'F']):
    """Build neighbour features for every atom referenced by a molecule's pairs.

    For each atom returned by ``assign_atoms_index`` and for each element in
    ``atoms``, the ``num_pickup`` nearest other atoms of that element
    contribute three values each:

    * the reciprocal of the ``get_dist_matrix`` entry (a squared distance),
    * the bond count taken from ``c_bond_cnt`` / ``n_bond_cnt``,
    * the cosine between the direction to that atom and the direction to the
      overall nearest atom.

    Slots without a matching atom stay 0.

    Parameters
    ----------
    df_idx : pandas.DataFrame
        Pair table indexed by molecule name (passed to ``assign_atoms_index``).
    df_structures_idx : pandas.DataFrame
        Structures table indexed by molecule name with columns ``atom``,
        ``atom_index``, ``x``, ``y`` and ``z``. ``atom_index`` is used as a
        row position.
    molecule : hashable
        Index label of the molecule.
    num_pickup : int
        Number of neighbours kept per element.
    atoms : sequence of str, optional
        Elements to collect neighbours for. The default list is only read,
        never modified.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(assigned atoms), len(atoms) * num_pickup * 3)``.
        Per element the layout is ``num_pickup`` reciprocal values, then
        ``num_pickup`` bond counts, then ``num_pickup`` cosines.
    """
    width = len(atoms) * num_pickup * 3
    assigned_idxs = assign_atoms_index(df_idx, molecule)
    dist_mat = get_dist_matrix(df_structures_idx, molecule)
    df_cbond = c_bond_cnt(df_structures_idx, molecule)
    df_nbond = n_bond_cnt(df_structures_idx, molecule)

    # Each frame is 0 outside its own element, so the sum is the bond count
    # of C and N atoms and 0 for everything else.
    num_bonds = df_nbond['n_bond'].values + df_cbond['c_bond'].values

    # Per-molecule data does not depend on the atom being processed, so it is
    # looked up once instead of once per loop iteration.
    df_temp = df_structures_idx.loc[molecule]
    locs = df_temp[['x', 'y', 'z']].values
    atoms_mole = df_temp['atom'].values
    atoms_mole_idx = df_temp['atom_index'].values

    # Start with an empty (0, width) block so the result keeps its 2-D shape
    # even when there is no atom to process; rows are stacked once at the end.
    feature_rows = [np.zeros([0, width])]
    for idx in assigned_idxs:
        others = atoms_mole_idx != idx          # every atom except `idx`
        other_idx = atoms_mole_idx[others]
        other_dist = dist_mat[idx][others]

        order = np.argsort(other_dist)          # nearest first
        sorted_atoms_idx = other_idx[order]
        sorted_atoms = atoms_mole[others][order]
        sorted_inv_dist = 1 / other_dist[order]
        sorted_num_bonds = num_bonds[sorted_atoms_idx]
        sorted_locs = locs[sorted_atoms_idx]

        # Unit vector towards the nearest atom; reference for the cosines.
        base_vec = sorted_locs[0] - locs[idx]
        base_vec = base_vec / np.linalg.norm(base_vec)

        target_matrix = np.zeros([len(atoms), num_pickup * 3])
        for a, atom in enumerate(atoms):
            is_atom = sorted_atoms == atom
            rel_locs = sorted_locs[is_atom] - locs[idx]
            in_prods = np.dot(base_vec, rel_locs.T) / np.linalg.norm(rel_locs, axis=1)

            # Keep at most num_pickup neighbours; shorter lists leave zeros.
            k = min(int(is_atom.sum()), num_pickup)
            target_matrix[a, :k] = sorted_inv_dist[is_atom][:k]
            target_matrix[a, num_pickup:num_pickup + k] = sorted_num_bonds[is_atom][:k]
            target_matrix[a, num_pickup * 2:num_pickup * 2 + k] = in_prods[:k]

        feature_rows.append(target_matrix.reshape(-1))

    return np.vstack(feature_rows)  # (len(assigned_idxs), len(atoms) * num_pickup * 3)

In [22]:
def get_dist_mat(df, df_structures_idx, mol, num_pickup):
    """Return one molecule's feature matrix together with its row labels.

    The result is the tuple ``(names, atom_indices, features)`` where
    ``atom_indices`` comes from ``assign_atoms_index``, ``features`` from
    ``get_pickup_dist_matrix`` and ``names`` repeats ``mol`` once per entry
    of ``atom_indices``.
    """
    atom_indices = assign_atoms_index(df, mol)
    features = get_pickup_dist_matrix(df, df_structures_idx, mol, num_pickup=num_pickup)
    names = [mol for _ in atom_indices]
    return names, atom_indices, features

In [25]:
num = 10  # neighbours kept per element (passed as num_pickup)

start = time.time()

# NOTE(review): only the first 100 molecules are processed here.
dist_mats = Parallel(n_jobs=NCORES)(
    delayed(get_dist_mat)(df_train_idx, df_structures_idx, mol, num_pickup=num)
    for mol in mols[:100]
)
# Each result is (molecule names, atom indices, feature rows); stitch the
# per-molecule pieces together.
molecule_names = np.hstack([res[0] for res in dist_mats])
atoms_idx = np.hstack([res[1] for res in dist_mats])
dist_mat = np.vstack([res[2] for res in dist_mats])

# Column names follow the layout of the feature rows: per element, all
# dist_* columns, then all num_bond_* columns, then all cos_* columns.
atoms = ['H', 'C', 'N', 'O', 'F']
col_name_list = [
    '{}_{}_{}'.format(prefix, a, n)
    for a in atoms
    for prefix in ('dist', 'num_bond', 'cos')
    for n in range(num)
]

se_mole = pd.Series(molecule_names, name='molecule_name')
se_atom_idx = pd.Series(atoms_idx, name='atom_index').astype('int32')
df_dist = pd.DataFrame(dist_mat, columns=col_name_list).astype('float32')
df_distance = pd.concat([se_mole, se_atom_idx, df_dist], axis=1)

elapsed_time = time.time() - start
print("elapsed_time:{0}".format(elapsed_time) + "[sec]")

elapsed_time:4.728086471557617[sec]


In [None]:
# Bond counts are only computed for C and N atoms, so the num_bond_* columns
# of H, O and F carry no information and are removed.
# `drop(..., errors='ignore')` keeps this cell re-runnable: the previous
# `del` loop raised KeyError when the cell was executed a second time.
bond_cols_to_drop = [
    'num_bond_{}_{}'.format(atom, i)
    for i in range(num)
    for atom in ('H', 'O', 'F')
]
df_distance = df_distance.drop(columns=bond_cols_to_drop, errors='ignore')

In [None]:
# Number of neighbours to keep per element, aligned with `atoms`.
nums = [10, 10, 5, 4, 3]
# Remove the dist_* / cos_* columns beyond each element's limit.
# NOTE(review): the matching num_bond_* columns (e.g. num_bond_N_5..9) are
# not removed here - confirm that is intended.
surplus_cols = [
    '{}_{}_{}'.format(kind, a, i)
    for a, n in zip(atoms, nums)
    for i in range(n, num)
    for kind in ('dist', 'cos')
]
# `errors='ignore'` keeps the cell re-runnable; the previous `del` loop
# raised KeyError on a second execution.
df_distance = df_distance.drop(columns=surplus_cols, errors='ignore')

In [None]:
# Create the output directory if it is missing so the write below does not
# fail on a fresh checkout.
os.makedirs(OUTPUT, exist_ok=True)
df_distance.to_pickle(OUTPUT + '20190628_dist_bond_dir_train_temp0.pickle')

In [None]:
# Show the first rows of the feature table as a quick visual check.
df_distance.head()