In [1]:
import os
import pandas as pd
import numpy as np
import time

In [2]:
# Data folder and the sub-folder that receives the files written by this notebook.
FOLDER = '../../data_kaggle/champs/'
OUTPUT = FOLDER + 'out/'

# Create the export folder up front so the CSV export further down cannot fail
# just because 'out/' is missing.
os.makedirs(OUTPUT, exist_ok=True)

# Last expression of the cell: shows which files are available.
os.listdir(FOLDER)

['mulliken_charges.csv',
 'sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'train.csv',
 'test.csv',
 'dipole_moments.csv',
 'potential_energy.csv',
 'structures.csv',
 'out',
 'scalar_coupling_contributions.csv',
 'dsgdb9nsd_000001.xyz',
 'structures.zip']

In [3]:
# Only the coupling table (train.csv) and the atom coordinates (structures.csv)
# are read here. The other files in FOLDER (mulliken_charges, sample_submission,
# magnetic_shielding_tensors, test, dipole_moments, potential_energy,
# scalar_coupling_contributions) are left unloaded.
df_train, df_structures = (
    pd.read_csv(FOLDER + file_name)
    for file_name in ('train.csv', 'structures.csv')
)

In [4]:
# Coupling types handled by this notebook.
types_3J = ['3JHH', '3JHC', '3JHN']

# Distinct molecule names, in order of first appearance in the training table.
mols = df_train['molecule_name'].unique()

# Copies of both tables indexed by molecule name, so that all rows of one
# molecule can be fetched with a single .loc lookup.
df_train_idx, df_structures_idx = (
    frame.set_index('molecule_name') for frame in (df_train, df_structures)
)

In [5]:
def get_dist_matrix(df_structures_idx, molecule):
    """Return the matrix of SQUARED pairwise distances between a molecule's atoms.

    Parameters
    ----------
    df_structures_idx : pandas.DataFrame
        Structures table indexed by molecule name, with columns 'x', 'y', 'z'.
    molecule : str
        Index label of the molecule to look up.

    Returns
    -------
    numpy.ndarray
        Array of shape (num_atoms, num_atoms); entry [i, j] is the squared
        Euclidean distance between the i-th and j-th row of the molecule.
        No square root is taken, so the values are only suitable for
        comparing/ranking distances unless the caller applies np.sqrt.

    Raises
    ------
    KeyError
        If `molecule` is not present in the index.
    """
    # A list label keeps the result two-dimensional even when the molecule
    # has a single row (a scalar label would collapse it to a Series and the
    # coordinate axis would be mistaken for the atom axis).
    coords = df_structures_idx.loc[[molecule], ['x', 'y', 'z']].values

    # Broadcast to (num_atoms, num_atoms, 3) coordinate differences.
    diff = coords[:, np.newaxis, :] - coords[np.newaxis, :, :]
    return (diff ** 2).sum(axis=-1)

In [6]:
def _nearest_other_atom(dist_row, atom_indices):
    """Return the atom_index of the closest atom lying at a non-zero distance."""
    # Zero-distance entries (the atom itself) are excluded before ranking.
    not_self = dist_row != 0
    # argmin resolves exact ties to the earliest candidate.
    return atom_indices[not_self][np.argmin(dist_row[not_self])]


def gen_pairs_list(df_idx, df_structures_idx, molecule_name, type_3J):
    """Build four-atom chains for every coupling of one type in one molecule.

    For each coupling row of type `type_3J`, the two coupled atoms become the
    ends of the chain (atoms 0 and 3). Atom 1 is the nearest neighbour of
    atom 0 and atom 2 is the nearest neighbour of atom 3.

    Parameters
    ----------
    df_idx : pandas.DataFrame
        Coupling table indexed by molecule name, with columns 'type',
        'atom_index_0', 'atom_index_1' and 'id'.
    df_structures_idx : pandas.DataFrame
        Structures table indexed by molecule name, with columns 'atom_index',
        'x', 'y', 'z'.
    molecule_name : str
        Molecule to process.
    type_3J : str
        Value of the 'type' column to select (e.g. '3JHH').

    Returns
    -------
    list of list
        One `[atom_idx_0, atom_idx_1, atom_idx_2, atom_idx_3, con_id]` entry
        per matching coupling; an empty list when there is none.
    """
    # List labels keep both selections as DataFrames even for a single row,
    # so a molecule with exactly one coupling is no longer silently skipped.
    df_tr = df_idx.loc[[molecule_name]]
    df_st = df_structures_idx.loc[[molecule_name]]

    couplings = df_tr.loc[
        df_tr['type'] == type_3J, ['atom_index_0', 'atom_index_1', 'id']
    ].values
    if len(couplings) == 0:
        # Nothing to do: skip the distance-matrix computation entirely.
        return []

    atom_indices = df_st['atom_index'].values
    # Rows of the matrix are addressed with atom_index values below, i.e. the
    # structure rows are relied upon to be ordered so that position == atom_index.
    dist_matrix = get_dist_matrix(df_structures_idx, molecule_name)

    # NOTE(review): the nearest neighbour is used as a stand-in for the bonded
    # atom; nothing here checks that atoms 1 and 2 are distinct or bonded to
    # each other - confirm this holds for the heavy-atom end of 3JHC/3JHN.
    pairs_list = []
    for atom_idx_0, atom_idx_3, con_id in couplings:
        atom_idx_1 = _nearest_other_atom(dist_matrix[atom_idx_0], atom_indices)
        atom_idx_2 = _nearest_other_atom(dist_matrix[atom_idx_3], atom_indices)
        pairs_list.append([atom_idx_0, atom_idx_1, atom_idx_2, atom_idx_3, con_id])

    return pairs_list

In [7]:
def get_cos_3J(df_structures_idx, molecule_name, atom_idx_list):
    """Compute the torsion-angle features of a four-atom chain.

    The bond vectors 0->1 and 2->3 are projected onto the plane perpendicular
    to the central 1->2 bond, and the cosine of the angle between the two
    projections is returned together with `cos**2 - 1`.

    Sign convention: the first vector points from atom 0 towards atom 1, so
    an anti-periplanar (trans) chain gives +1 and an eclipsed (cis) chain
    gives -1, i.e. the opposite sign of the usual dihedral-angle cosine.

    Parameters
    ----------
    df_structures_idx : pandas.DataFrame
        Structures table indexed by molecule name, with columns 'atom_index',
        'x', 'y', 'z'.
    molecule_name : str
        Molecule that contains the atoms.
    atom_idx_list : sequence
        At least four atom_index values in chain order. Only the first four
        are used; trailing entries (such as a coupling id) are ignored.

    Returns
    -------
    numpy.ndarray
        Float array of shape (2,): `[cos, cos**2 - 1]`. Both entries are NaN
        when the central bond has zero length or either end bond is parallel
        to it, because the angle is then undefined.

    Raises
    ------
    KeyError
        If the molecule or one of the four atom indices is not found.
    """
    df_st = df_structures_idx.loc[[molecule_name]]
    xyz = df_st[['x', 'y', 'z']].values.astype(float)
    # One pass over the molecule instead of one DataFrame.query per atom.
    row_of = {int(atom): row for row, atom in enumerate(df_st['atom_index'].values)}

    p0, p1, p2, p3 = (xyz[row_of[int(idx)]] for idx in atom_idx_list[:4])

    v01 = p1 - p0
    v12 = p2 - p1
    v23 = p3 - p2

    undefined = np.array([np.nan, np.nan])

    axis_sq = np.dot(v12, v12)
    if axis_sq == 0:
        # Atoms 1 and 2 coincide: there is no axis to rotate about.
        return undefined

    # Remove the component along the central bond from both end vectors.
    v01_perp = v01 - (np.dot(v01, v12) / axis_sq) * v12
    v23_perp = v23 - (np.dot(v23, v12) / axis_sq) * v12

    denom = np.linalg.norm(v01_perp) * np.linalg.norm(v23_perp)
    if denom == 0:
        # An end bond is collinear with the central bond.
        return undefined

    cos = np.dot(v01_perp, v23_perp) / denom
    # NOTE(review): the second feature is cos**2 - 1 (= -sin**2), not cos**2;
    # check that downstream column names reflect this.
    return np.array([cos, cos ** 2 - 1])

In [8]:
# Number of leading molecules to process; raise it to cover more of `mols`.
N_MOLECULES = 50

start = time.time()

# Collect results in plain lists and convert once at the end; stacking numpy
# arrays inside the loop re-copies everything on every iteration.
cos_rows = []
coupling_ids = []
for m in mols[:N_MOLECULES]:
    for t in types_3J:
        for pair in gen_pairs_list(df_train_idx, df_structures_idx, m, t):
            cos_rows.append(get_cos_3J(df_structures_idx, m, pair))
            # The last element of each chain is the coupling id.
            coupling_ids.append(pair[4])

# reshape keeps the (n, 2) layout even when no coupling was found.
col_cos = np.array(cos_rows, dtype=float).reshape(-1, 2)
id_arr = np.array(coupling_ids)

se_id = pd.Series(id_arr, name='id', dtype='int32')
# NOTE(review): the second column holds cos**2 - 1 as returned by get_cos_3J,
# although it is labelled 'cos_3j^2'; the label is kept so existing readers
# of the exported file keep working.
df_cos = pd.DataFrame(col_cos, columns=['cos_3j', 'cos_3j^2']).astype('float32')
df_cos_3j = pd.concat([se_id, df_cos], axis=1)

elapsed_time = time.time() - start
print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")

  app.launch_new_instance()


elapsed_time:5.0849432945251465[sec]


In [9]:
# Write the feature table to the output folder, without the row index.
cos_3j_csv_path = OUTPUT + '20190625_cos_3j_2.csv'
df_cos_3j.to_csv(cos_3j_csv_path, index=False)

In [10]:
# Show a bounded preview instead of dumping the whole feature table.
df_cos_3j.head(10)

Unnamed: 0,id,cos_3j,cos_3j^2
0,23,-0.499958,-7.500424e-01
1,24,1.000000,-3.110625e-10
2,25,-0.500033,-7.499672e-01
3,29,1.000000,-1.340448e-10
4,30,-0.499932,-7.500677e-01
5,31,-0.500020,-7.499803e-01
6,34,-0.499995,-7.500049e-01
7,35,-0.500063,-7.499375e-01
8,36,1.000000,-2.809728e-10
9,49,-0.476404,-7.730390e-01
