<a href="https://colab.research.google.com/github/TanushGoel/Machine-Learning-Playground/blob/master/Predicting_Molecular_Properties_Catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(42)

In [0]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Integer columns (signed and unsigned) are cast to the narrowest integer
    dtype of the same signedness whose inclusive [min, max] range contains the
    column's values. Float columns are cast to the narrowest candidate float
    dtype whose finite range contains the values. Object columns become
    ``category``. Every other dtype (bool, datetime, category, pandas
    extension dtypes, ...) is left untouched, so calling the function again on
    its own output is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        When True, float columns may be stored as float16. float16 keeps only
        about three significant decimal digits, so pass False to make float32
        the narrowest float dtype when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Anything that is not a plain NumPy int/uint/float column is skipped:
        # min()/max() are undefined for unordered categoricals, and bools or
        # datetimes must not be pushed through the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in int_candidates:
            for candidate in int_candidates[col_type.kind]:
                limits = np.iinfo(candidate)
                # Bounds are inclusive: a column whose max is exactly 127 fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No break (e.g. empty column, min/max are NaN): keep the dtype.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Values outside float32 range, infinities, or an all-NaN column.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [0]:
# Load every competition table from CSV files in the current working directory.
# train / test: one row per atom pair (test has no scalar_coupling_constant column).
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# structures: one row per atom with its element symbol and x/y/z coordinates.
structures = pd.read_csv('structures.csv')
# Auxiliary tables. NOTE(review): presumably available for training molecules
# only (each reports 85,003 unique molecules below, the same as train) — confirm.
scalar_coupling = pd.read_csv('scalar_coupling_contributions.csv')
magnetic_shielding = pd.read_csv('magnetic_shielding_tensors.csv')
potential_energy = pd.read_csv('potential_energy.csv')
dipole_moments = pd.read_csv('dipole_moments.csv')
mulliken_charges = pd.read_csv('mulliken_charges.csv')
# Submission template: one row per test id.
sample_sub = pd.read_csv('sample_submission.csv')

In [0]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [0]:
# Row count of the training table and cardinality of its two key columns.
print('Total rows:', train.shape[0])
print('Unique Molecule Names:', train['molecule_name'].nunique(dropna=False))
print('Unique Coupling Types:', train['type'].nunique(dropna=False))

Total rows: 4658147
Unique Molecule Names: 85003
Unique Coupling Types: 8


In [0]:
# Separate the regression target from the feature table.
labels = train['scalar_coupling_constant']
train = train.drop(columns=['scalar_coupling_constant'])

In [0]:
test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type
0,4658147,dsgdb9nsd_000004,2,0,2JHC
1,4658148,dsgdb9nsd_000004,2,1,1JHC
2,4658149,dsgdb9nsd_000004,2,3,3JHH
3,4658150,dsgdb9nsd_000004,3,0,1JHC
4,4658151,dsgdb9nsd_000004,3,1,2JHC


In [0]:
# Row count of the test table and cardinality of its two key columns.
print('Total rows:', test.shape[0])
print('Unique Molecule Names:', test['molecule_name'].nunique(dropna=False))
print('Unique Coupling Types:', test['type'].nunique(dropna=False))

Total rows: 2505542
Unique Molecule Names: 45772
Unique Coupling Types: 8


In [0]:
sample_sub.head()

Unnamed: 0,id,scalar_coupling_constant
0,4658147,0
1,4658148,0
2,4658149,0
3,4658150,0
4,4658151,0


In [0]:
scalar_coupling.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,type,fc,sd,pso,dso
0,dsgdb9nsd_000001,1,0,1JHC,83.0224,0.254579,1.25862,0.27201
1,dsgdb9nsd_000001,1,2,2JHH,-11.0347,0.352978,2.85839,-3.4336
2,dsgdb9nsd_000001,1,3,2JHH,-11.0325,0.352944,2.85852,-3.43387
3,dsgdb9nsd_000001,1,4,2JHH,-11.0319,0.352934,2.85855,-3.43393
4,dsgdb9nsd_000001,2,0,1JHC,83.0222,0.254585,1.25861,0.272013


In [0]:
# Row count of the coupling-contribution table and cardinality of its key columns.
print('Total rows:', scalar_coupling.shape[0])
print('Unique Molecule Names:', scalar_coupling['molecule_name'].nunique(dropna=False))
print('Unique Coupling Types:', scalar_coupling['type'].nunique(dropna=False))

Total rows: 4658147
Unique Molecule Names: 85003
Unique Coupling Types: 8


In [0]:
structures.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [0]:
# Number of atoms and number of distinct molecules in the structures table.
print('Total rows:', structures.shape[0])
print('Unique Molecule Names:', structures['molecule_name'].nunique(dropna=False))

Total rows: 2358657
Unique Molecule Names: 130775


In [0]:
magnetic_shielding.head()

Unnamed: 0,molecule_name,atom_index,XX,YX,ZX,XY,YY,ZY,XZ,YZ,ZZ
0,dsgdb9nsd_000001,0,195.315,0.0,-0.0001,0.0,195.317,0.0007,-0.0001,0.0007,195.317
1,dsgdb9nsd_000001,1,31.341,-1.2317,4.0544,-1.2317,28.9546,-1.7173,4.0546,-1.7173,34.0861
2,dsgdb9nsd_000001,2,31.5814,1.2173,-4.1474,1.2173,28.9036,-1.6036,-4.1476,-1.6036,33.8967
3,dsgdb9nsd_000001,3,31.5172,4.1086,1.2723,4.1088,33.9068,1.695,1.2724,1.6951,28.9579
4,dsgdb9nsd_000001,4,31.4029,-4.0942,-1.1793,-4.0944,34.0776,1.6259,-1.1795,1.626,28.9013


In [0]:
# Row count and number of distinct molecules in the magnetic shielding table.
print('Total rows:', magnetic_shielding.shape[0])
print('Unique Molecule Names:', magnetic_shielding['molecule_name'].nunique(dropna=False))

Total rows: 1533537
Unique Molecule Names: 85003


In [0]:
potential_energy.head()

Unnamed: 0,molecule_name,potential_energy
0,dsgdb9nsd_000001,-40.52368
1,dsgdb9nsd_000002,-56.56025
2,dsgdb9nsd_000003,-76.42608
3,dsgdb9nsd_000005,-93.42849
4,dsgdb9nsd_000007,-79.83869


In [0]:
# Row count and number of distinct molecules in the potential energy table.
print('Total rows:', potential_energy.shape[0])
print('Unique Molecule Names:', potential_energy['molecule_name'].nunique(dropna=False))

Total rows: 85003
Unique Molecule Names: 85003


In [0]:
dipole_moments.head()

Unnamed: 0,molecule_name,X,Y,Z
0,dsgdb9nsd_000001,0.0,0.0,0.0
1,dsgdb9nsd_000002,-0.0002,0.0,1.6256
2,dsgdb9nsd_000003,0.0,0.0,-1.8511
3,dsgdb9nsd_000005,0.0,0.0,-2.8937
4,dsgdb9nsd_000007,0.0,0.0,0.0


In [0]:
# Row count and number of distinct molecules in the dipole moments table.
print('Total rows:', dipole_moments.shape[0])
print('Unique Molecule Names:', dipole_moments['molecule_name'].nunique(dropna=False))

Total rows: 85003
Unique Molecule Names: 85003


In [0]:
mulliken_charges.head()

Unnamed: 0,molecule_name,atom_index,mulliken_charge
0,dsgdb9nsd_000001,0,-0.535689
1,dsgdb9nsd_000001,1,0.133921
2,dsgdb9nsd_000001,2,0.133922
3,dsgdb9nsd_000001,3,0.133923
4,dsgdb9nsd_000001,4,0.133923


In [0]:
# Row count and number of distinct molecules in the Mulliken charges table.
print('Total rows:', mulliken_charges.shape[0])
print('Unique Molecule Names:', mulliken_charges['molecule_name'].nunique(dropna=False))

Total rows: 1533537
Unique Molecule Names: 85003


In [0]:
# The structures table is the source because it contains every molecule name
# (130,775 unique names, versus 85,003 in train and 45,772 in test).
molecule_names = list(structures.molecule_name.unique())

In [0]:
# One-column frame holding each distinct molecule name once.
molecules = pd.DataFrame(molecule_names, columns=["molecule_names"])
molecules.head()

Unnamed: 0,molecule_names
0,dsgdb9nsd_000001
1,dsgdb9nsd_000002
2,dsgdb9nsd_000003
3,dsgdb9nsd_000004
4,dsgdb9nsd_000005


In [0]:
molecule_names_factorized = molecules.molecule_names.factorize()[0]

In [0]:
# Lookup table from molecule name to its integer code.
molecule_index = {
    name: code
    for name, code in zip(molecule_names, molecule_names_factorized)
}
len(molecule_index)

130775

In [0]:
train.type.unique()

array(['1JHC', '2JHH', '1JHN', '2JHN', '2JHC', '3JHH', '3JHC', '3JHN'],
      dtype=object)

In [0]:
type_index = {'1JHC':1, '2JHH':2, '1JHN':3, '2JHN':4, '2JHC':5, '3JHH':6, '3JHC':7, '3JHN':8}

In [0]:
structures.atom.unique()

array(['C', 'H', 'N', 'O', 'F'], dtype=object)

In [0]:
atom_indices = {'C':1, 'H':2, 'N':3, 'O':4, 'F':5}

In [0]:
# Replace string keys with integer codes in every table that carries them.
# Running this cell twice would map the already-encoded integers through the
# string-keyed dictionaries again, so it is meant to run once per kernel.
for frame in (train, test, scalar_coupling, structures,
              magnetic_shielding, potential_energy, dipole_moments, mulliken_charges):
    frame['molecule_name'] = frame['molecule_name'].map(molecule_index)

for frame in (train, test, scalar_coupling):
    frame['type'] = frame['type'].map(type_index)

structures['atom'] = structures['atom'].map(atom_indices)

# Drop the loop variable so it does not keep a table alive after later `del`s.
del frame

In [0]:
del molecule_index, molecule_names_factorized, molecules, type_index, atom_indices

In [0]:
def map_structure_info(df, atom_idx):
    """Attach element and coordinates of one atom of each pair.

    Left-joins the notebook-level ``structures`` table onto ``df`` using
    ``molecule_name`` and ``atom_index_{atom_idx}``, then returns a new frame
    in which the joined ``atom``, ``x``, ``y`` and ``z`` columns carry the
    suffix ``_{atom_idx}`` and the helper ``atom_index`` column is removed.
    """
    pair_key = ['molecule_name', f'atom_index_{atom_idx}']
    suffixed = {name: f'{name}_{atom_idx}' for name in ('atom', 'x', 'y', 'z')}

    joined = df.merge(structures, how='left',
                      left_on=pair_key,
                      right_on=['molecule_name', 'atom_index'])
    return joined.drop(columns='atom_index').rename(columns=suffixed)

In [0]:
# Attach element and coordinates for atom 0, then atom 1, of every pair.
train = map_structure_info(map_structure_info(train, 0), 1)
test = map_structure_info(map_structure_info(test, 0), 1)

# The structures table is not needed after the joins.
del structures

In [0]:
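#STOP — NOTE(review): presumably marks cells that were skipped in the recorded run; the `train.head()` output further down has none of the columns added between here and `#START`. Confirm before running the notebook top to bottom.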
def map_mulliken_charge(df, atom_idx) :
    """Attach the Mulliken charge of one atom of each pair.

    Left-joins the notebook-level ``mulliken_charges`` table onto ``df`` using
    ``molecule_name`` and ``atom_index_{atom_idx}``. Returns a new frame where
    the joined column is named ``mulliken_charge_{atom_idx}`` and the helper
    ``atom_index`` column is removed.
    """
    joined = df.merge(mulliken_charges, how='left',
                      left_on=['molecule_name', f'atom_index_{atom_idx}'],
                      right_on=['molecule_name', 'atom_index'])
    charge_name = {'mulliken_charge': f'mulliken_charge_{atom_idx}'}
    return joined.drop('atom_index', axis=1).rename(columns=charge_name)

# Attach the Mulliken charge of atom 0 and atom 1 to every training pair.
# NOTE(review): `test` is never passed through this helper in this notebook, so
# these columns would exist in `train` only — confirm this cell is meant to run.
train = map_mulliken_charge(train, 0)
train = map_mulliken_charge(train, 1)

# The charges table is not needed after the joins.
del mulliken_charges

In [0]:
# Inner-join the per-molecule potential energy onto the training pairs.
# NOTE(review): `labels` was split off `train` before this join; an inner join
# that dropped or reordered rows would leave the two misaligned — TODO confirm
# every training molecule has a potential_energy row.
train = train.merge(potential_energy, on="molecule_name", how='inner')
del potential_energy

In [0]:
# Reduce each molecule's dipole vector (X, Y, Z) to its Euclidean magnitude,
# keep only that magnitude, and inner-join it onto the training pairs.
# NOTE(review): same caveat as any inner join after `labels` was split off —
# rows dropped here would no longer line up with `labels`; TODO confirm.
dipole_moment = np.sqrt(dipole_moments['X'] ** 2 + dipole_moments['Y'] ** 2 + dipole_moments['Z'] ** 2)
dipole_moments['dipole_moment'] = dipole_moment
dipole_moments = dipole_moments.drop(['X','Y','Z'], axis=1)
train = train.merge(dipole_moments, on='molecule_name', how='inner')
del dipole_moment, dipole_moments

In [0]:
# Copy the four coupling-contribution columns from scalar_coupling into train.
# The assignment aligns on the row index, not on the molecule/atom keys, so it
# is only correct while both frames share the same row order and index.
# NOTE(review): in the sample rows shown above, fc + sd + pso + dso reproduces
# scalar_coupling_constant (83.0224 + 0.254579 + 1.25862 + 0.27201 ≈ 84.8076),
# so these columns look like target leakage, and `test` has no such columns —
# confirm this cell is meant to run before training.
train['fc'] = scalar_coupling.fc
train['sd'] = scalar_coupling.sd
train['pso'] = scalar_coupling.pso
train['dso'] = scalar_coupling.dso

In [0]:
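#START — NOTE(review): presumably the point where execution resumes after the cells following `#STOP`; every feature from here on is computed for both `train` and `test`.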
def add_pair_distance(frame):
    """Add a ``dist`` column: Euclidean distance between atom 0 and atom 1.

    Reads the ``x_0, y_0, z_0`` and ``x_1, y_1, z_1`` columns of ``frame``,
    writes the row-wise norm of their difference to ``frame['dist']`` and
    returns the same frame. The intermediate coordinate arrays are local, so
    they are released on return instead of lingering as notebook globals.
    """
    start = frame[['x_0', 'y_0', 'z_0']].values
    end = frame[['x_1', 'y_1', 'z_1']].values
    frame['dist'] = np.linalg.norm(start - end, axis=1)
    return frame


train = add_pair_distance(train)
test = add_pair_distance(test)

In [0]:
# Per-axis squared coordinate difference between the two atoms of each pair.
# Despite the `dist_` prefix these are squared deltas, not distances.
for frame in (train, test):
    for axis in ('x', 'y', 'z'):
        frame[f'dist_{axis}'] = (frame[f'{axis}_0'] - frame[f'{axis}_1']) ** 2

# Drop the loop variables so they do not keep a table alive after later `del`s.
del frame, axis

In [0]:
# Shortest pair distance seen for each (molecule, atom) in the atom-0 role and
# in the atom-1 role, broadcast back to every row of that group.
for frame in (train, test):
    for end in (0, 1):
        group_key = ['molecule_name', f'atom_index_{end}']
        frame[f"min_distance_{end}"] = frame.groupby(group_key)['dist'].transform('min')

# Drop the loop variables so they do not keep a table alive after later `del`s.
del frame, end, group_key

In [0]:
train = reduce_mem_usage(train)

In [0]:
test = reduce_mem_usage(test)

In [0]:
def create_features(df):
    """Add offset-length and cosine features; return a frame without scratch columns.

    For each atom end (0 and 1) the offset of its coordinates from the
    ``dist_x / dist_y / dist_z`` columns is taken; ``distance_{end}`` is the
    length of that offset. Unit vectors of both offsets and of the atom-0 to
    atom-1 direction are formed (with 1e-10 added to each denominator), and
    their pairwise dot products are stored as ``cos_0_1``, ``cos_0`` and
    ``cos_1``.

    The new columns, including the temporary ``vec_*`` ones, are written into
    the ``df`` that was passed in; the returned frame is a copy with the nine
    ``vec_*`` columns dropped.

    NOTE(review): elsewhere in this notebook ``dist_x/y/z`` are assigned
    squared coordinate differences, so using them as a reference point mixes
    units — confirm that this is the intended reference.
    """
    eps = 1e-10
    axes = ('x', 'y', 'z')

    def dot(left, right):
        # Dot product of two stored 3-vectors, summed in x, y, z order.
        return (df[f"{left}_x"] * df[f"{right}_x"]
                + df[f"{left}_y"] * df[f"{right}_y"]
                + df[f"{left}_z"] * df[f"{right}_z"])

    for end in (0, 1):
        df[f"distance_{end}"] = (
            (df[f'x_{end}'] - df['dist_x']) ** 2
            + (df[f'y_{end}'] - df['dist_y']) ** 2
            + (df[f'z_{end}'] - df['dist_z']) ** 2
        ) ** (1 / 2)

    for end in (0, 1):
        length = df[f"distance_{end}"] + eps
        for axis in axes:
            df[f"vec_{end}_{axis}"] = (df[f'{axis}_{end}'] - df[f'dist_{axis}']) / length

    pair_length = df["dist"] + eps
    for axis in axes:
        df[f"vec_{axis}"] = (df[f'{axis}_1'] - df[f'{axis}_0']) / pair_length

    df["cos_0_1"] = dot("vec_0", "vec_1")
    df["cos_0"] = dot("vec_0", "vec")
    df["cos_1"] = dot("vec_1", "vec")

    scratch = [f"vec_{end}_{axis}" for end in (0, 1) for axis in axes]
    scratch += [f"vec_{axis}" for axis in axes]
    return df.drop(scratch, axis=1)

train = create_features(train)
test = create_features(test)

In [0]:
train = reduce_mem_usage(train)

In [0]:
test = reduce_mem_usage(test)

In [0]:
def create_more_features(df):
    """Add group-wise statistics of the pair distance to ``df`` and return it.

    Every new column is a per-group aggregate broadcast back to each row via
    ``groupby(...).transform(...)``, or a ratio / difference between such an
    aggregate and the row's own ``dist``. Groups used: coupling ``type``;
    ``molecule_name``; ``molecule_name`` with ``type``; ``molecule_name`` with
    ``atom_index_0`` or ``atom_index_1``; and ``molecule_name`` with ``atom_1``.

    Columns are added to ``df`` in place, in a fixed order, and the same
    object is returned. ``std`` aggregates are NaN for single-row groups.
    """
    molecule = 'molecule_name'
    molecule_and_type = [molecule, 'type']

    def spread(keys, column, how):
        # Aggregate `column` within each `keys` group and broadcast to every row.
        return df.groupby(keys)[column].transform(how)

    # Whole-dataset statistic per coupling type.
    df['dist_to_type_mean'] = df['dist'] / spread('type', 'dist', 'mean')

    # Per-molecule statistics.
    df['molecule_couples'] = spread(molecule, 'id', 'count')
    df['molecule_dist_mean'] = spread(molecule, 'dist', 'mean')
    df['molecule_dist_mean_div'] = df['molecule_dist_mean'] / df['dist']
    for how in ('min', 'max'):
        df[f'molecule_dist_{how}'] = spread(molecule, 'dist', how)

    # Per-molecule, per-coupling-type statistics.
    df['molecule_type_dist_std'] = spread(molecule_and_type, 'dist', 'std')
    df['molecule_type_dist_std_diff'] = df['molecule_type_dist_std'] - df['dist']
    df['molecule_type_dist_mean'] = spread(molecule_and_type, 'dist', 'mean')
    df['molecule_type_dist_mean_div'] = df['molecule_type_dist_mean'] / df['dist']
    for how in ('max', 'min'):
        df[f'molecule_type_dist_{how}'] = spread(molecule_and_type, 'dist', how)

    # Number of pairs each atom takes part in, per role.
    for end in (0, 1):
        df[f'atom_{end}_couples_count'] = spread([molecule, f'atom_index_{end}'], 'id', 'count')

    # Distance statistics per atom (role 0 first, then role 1), each with its ratio to dist.
    for end in (0, 1):
        atom_key = [molecule, f'atom_index_{end}']
        for how in ('mean', 'max', 'min'):
            stat_col = f'molecule_atom_index_{end}_dist_{how}'
            df[stat_col] = spread(atom_key, 'dist', how)
            df[f'{stat_col}_div'] = df[stat_col] / df['dist']

    for end in (0, 1):
        std_col = f'molecule_atom_index_{end}_dist_std'
        df[std_col] = spread([molecule, f'atom_index_{end}'], 'dist', 'std')
        df[f'{std_col}_diff'] = df[std_col] - df['dist']
        df[f'{std_col}_div'] = df[std_col] / df['dist']

    # Spread of the partner atom's coordinates around each atom 0.
    for axis in ('x', 'y', 'z'):
        df[f'molecule_atom_index_0_{axis}_1_std'] = spread([molecule, 'atom_index_0'], f'{axis}_1', 'std')

    # Distance spread per partner element within the molecule.
    df['molecule_atom_1_dist_std'] = spread([molecule, 'atom_1'], 'dist', 'std')
    df['molecule_atom_1_dist_std_diff'] = df['molecule_atom_1_dist_std'] - df['dist']

    return df
  
train = create_more_features(train)
test = create_more_features(test)

In [0]:
train = reduce_mem_usage(train)

In [0]:
test = reduce_mem_usage(test)

In [0]:
# Remove the identifier columns so they are not fed to the model as features.
train = train.drop(columns=['molecule_name', 'id'])
test = test.drop(columns=['molecule_name', 'id'])

In [0]:
train.head()

Unnamed: 0,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,dist,dist_x,dist_y,dist_z,min_distance_0,min_distance_1,distance_0,distance_1,cos_0_1,cos_0,cos_1,dist_to_type_mean,molecule_couples,molecule_dist_mean,molecule_dist_mean_div,molecule_dist_min,molecule_dist_max,molecule_type_dist_std,molecule_type_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff
0,1,0,1,2,0.00215,-0.006031,0.001976,1,-0.012695,1.085938,0.008003,1.091797,0.000221,1.192383,3.6e-05,1.091797,1.091797,1.198242,0.107483,0.990234,-1.0,-0.988281,0.999023,10,1.506836,1.379883,1.091797,1.783203,0.0,-1.091797,1.091797,1.0,1.091797,1.091797,4,4,1.610352,1.474609,1.783203,1.632812,1.091797,1.0,1.091797,1.0,1.091797,1.0,1.091797,1.0,0.345703,-0.746094,0.31665,0.0,-1.091797,0.0,0.728027,0.182129,0.728027,0.0,-1.091797
1,1,2,2,2,0.00215,-0.006031,0.001976,2,1.011719,1.463867,0.000277,1.783203,1.019531,2.160156,3e-06,1.091797,1.783203,2.392578,0.696289,0.910156,-0.986816,-0.830566,1.004883,10,1.506836,0.845215,1.091797,1.783203,0.0,-1.783203,1.783203,1.0,1.783203,1.783203,4,1,1.610352,0.902832,1.783203,1.0,1.091797,0.612305,1.783203,1.0,1.783203,1.0,1.783203,1.0,0.345703,-1.4375,0.193848,,,,0.728027,0.182129,0.728027,0.0,-1.783203
2,1,3,2,2,0.00215,-0.006031,0.001976,2,-0.541016,1.447266,-0.876465,1.783203,0.294922,2.113281,0.771973,1.091797,1.783203,2.273438,1.964844,0.654785,-0.553711,0.266602,1.004883,10,1.506836,0.845215,1.091797,1.783203,0.0,-1.783203,1.783203,1.0,1.783203,1.783203,4,2,1.610352,0.902832,1.783203,1.0,1.091797,0.612305,1.783203,1.0,1.783203,1.0,1.783203,1.0,0.345703,-1.4375,0.193848,0.0,-1.783203,0.0,0.728027,0.182129,0.728027,0.0,-1.783203
3,1,4,2,2,0.00215,-0.006031,0.001976,2,-0.523926,1.4375,0.90625,1.783203,0.276611,2.085938,0.817871,1.091797,1.783203,2.261719,1.03418,0.643066,-0.895996,-0.235474,1.004883,10,1.506836,0.845215,1.091797,1.783203,0.0,-1.783203,1.783203,1.0,1.783203,1.783203,4,3,1.610352,0.902832,1.783203,1.0,1.091797,0.612305,1.783203,1.0,1.783203,1.0,1.783203,1.0,0.345703,-1.4375,0.193848,0.0,-1.783203,0.0,0.728027,0.182129,0.728027,0.0,-1.783203
4,2,0,1,2,1.011719,1.463867,0.000277,1,-0.012695,1.085938,0.008003,1.091797,1.049805,0.142822,6e-05,1.091797,1.091797,1.321289,1.420898,0.685547,-0.319092,0.47168,0.999023,10,1.506836,1.379883,1.091797,1.783203,0.0,-1.091797,1.091797,1.0,1.091797,1.091797,3,4,1.552734,1.421875,1.783203,1.632812,1.091797,1.0,1.091797,1.0,1.091797,1.0,1.091797,1.0,0.39917,-0.692383,0.365723,0.0,-1.091797,0.0,0.300293,0.205933,0.891113,0.0,-1.091797


In [0]:
train.isnull().sum()

atom_index_0                                0
atom_index_1                                0
type                                        0
atom_0                                      0
x_0                                         0
y_0                                         0
z_0                                         0
atom_1                                      0
x_1                                         0
y_1                                         0
z_1                                         0
dist                                        0
dist_x                                      0
dist_y                                      0
dist_z                                      0
min_distance_0                              0
min_distance_1                              0
distance_0                                  0
distance_1                                  0
cos_0_1                                     0
cos_0                                       0
cos_1                             

In [0]:
# Hold out a validation split, then release the unsplit frames.
# test_size is a fraction of the rows: 0.0000575 yields the 268 validation rows
# printed in the next cell.
# NOTE(review): this small split is what early stopping and best-model
# selection are evaluated on — confirm the size is intentional.
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(train, labels, test_size=0.0000575, random_state=1)
del train
del labels

In [0]:
# Number of rows in each side of the split.
print('Train:', y_train.shape[0])
print('Valid:', y_valid.shape[0])

Train: 4657879
Valid: 268


In [0]:
# Install CatBoost into the running environment, then import the two classes
# used below.
# NOTE(review): the install is unpinned; the recorded run resolved to
# catboost 0.15.1, so a re-run may pick up a different version.
!pip install catboost
from catboost import Pool, CatBoostRegressor

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/5a/8a/a867c35770291646b085e9248814eb32dbe2aa824715b08e40cd92d0a83e/catboost-0.15.1-cp36-none-manylinux1_x86_64.whl (61.0MB)
[K     |████████████████████████████████| 61.1MB 1.4MB/s 
Installing collected packages: catboost
Successfully installed catboost-0.15.1


In [0]:
# Configure the regressor: RMSE loss, R2 as evaluation metric, Bernoulli
# bootstrap with subsample=0.725, at most 3250 iterations, fixed random_state.
# use_best_model=True together with an eval_set asks CatBoost to keep the
# iteration that scored best on the validation data.
cat = CatBoostRegressor(loss_function='RMSE', boosting_type='Plain', bootstrap_type='Bernoulli', eval_metric='R2', 
                        subsample=0.725, od_type="Iter", iterations=3250, use_best_model=True, random_state=42)
# Wrap the training data in a Pool, then drop the notebook's own references to
# the pandas objects (presumably to lower peak memory during fitting).
train_pool = Pool(X_train, y_train)
del X_train
del y_train
# Fit with early stopping after 100 rounds, logging every iteration (verbose=1).
cat.fit(train_pool, eval_set=(X_valid, y_valid), early_stopping_rounds=100, verbose=1)
# Display the best learn / validation scores recorded during fitting.
cat.best_score_

0:	learn: 0.0484580	test: 0.0466074	best: 0.0466074 (0)	total: 2.44s	remaining: 2h 12m 13s
1:	learn: 0.0949458	test: 0.0914801	best: 0.0914801 (1)	total: 4.73s	remaining: 2h 7m 57s
2:	learn: 0.1395327	test: 0.1345679	best: 0.1345679 (2)	total: 7s	remaining: 2h 6m 15s
3:	learn: 0.1822591	test: 0.1759311	best: 0.1759311 (3)	total: 9.17s	remaining: 2h 4m 2s
4:	learn: 0.2231494	test: 0.2156781	best: 0.2156781 (4)	total: 11.4s	remaining: 2h 3m 16s
5:	learn: 0.2622448	test: 0.2537818	best: 0.2537818 (5)	total: 13.5s	remaining: 2h 1m 51s
6:	learn: 0.2995814	test: 0.2899945	best: 0.2899945 (6)	total: 15.7s	remaining: 2h 1m 26s
7:	learn: 0.3353106	test: 0.3246061	best: 0.3246061 (7)	total: 18s	remaining: 2h 1m 38s
8:	learn: 0.3694260	test: 0.3580760	best: 0.3580760 (8)	total: 20.2s	remaining: 2h 56s
9:	learn: 0.4018702	test: 0.3897080	best: 0.3897080 (9)	total: 22.4s	remaining: 2h 48s
10:	learn: 0.4328212	test: 0.4201204	best: 0.4201204 (10)	total: 24.5s	remaining: 2h 13s
11:	learn: 0.4622492	t

{'learn': {'R2': 0.9956465239813657, 'RMSE': 2.305507542767881},
 'validation': {'R2': 0.9950125588682586, 'RMSE': 2.3623564377512096}}

In [0]:
del X_valid
del y_valid

In [0]:
# Map every feature name to the importance the fitted model assigned to it.
feature_stuff = {
    name: importance
    for name, importance in zip(cat.feature_names_, cat.feature_importances_)
}
feature_stuff

{'atom_0': 0.0,
 'atom_0_couples_count': 0.34276195894776856,
 'atom_1': 0.19108765043782172,
 'atom_1_couples_count': 0.7850655084760128,
 'atom_index_0': 0.022533846612625197,
 'atom_index_1': 0.11998903151140974,
 'cos_0': 0.006817662278895138,
 'cos_0_1': 0.006185575310408398,
 'cos_1': 0.006070545360329384,
 'dist': 15.88942012081377,
 'dist_to_type_mean': 2.325944849632149,
 'dist_x': 0.003788426262700007,
 'dist_y': 0.009712265692647137,
 'dist_z': 0.010267680202549166,
 'distance_0': 0.002989762520851155,
 'distance_1': 0.02583775207518611,
 'min_distance_0': 0.8477816589379646,
 'min_distance_1': 0.5874561070746659,
 'molecule_atom_1_dist_std': 0.014877679716117694,
 'molecule_atom_1_dist_std_diff': 0.11308465883041618,
 'molecule_atom_index_0_dist_max': 0.18697083924421973,
 'molecule_atom_index_0_dist_max_div': 0.24036601895448081,
 'molecule_atom_index_0_dist_mean': 0.12835459685508763,
 'molecule_atom_index_0_dist_mean_div': 0.08362599718213078,
 'molecule_atom_index_0_dis

In [0]:
# Persist the fitted model to the working directory.
# NOTE(review): no `format` argument is passed, so CatBoost's default
# serialization is used; the `.hdf5` suffix presumably does not make this an
# HDF5 file — confirm, and load it back with CatBoost rather than an HDF5 reader.
cat.save_model('Predicting_Molecular_Properties_Catboost_Final.hdf5')

In [0]:
# Predict the coupling constant for every test row, release the test frame,
# and display the number of predictions.
Y_pred = cat.predict(test)
del test
len(Y_pred)

2505542

In [0]:
sample_sub.head()

Unnamed: 0,id,scalar_coupling_constant
0,4658147,0
1,4658148,0
2,4658149,0
3,4658150,0
4,4658151,0


In [0]:
# Write the predictions into the submission template by position.
# NOTE(review): the `id` column was dropped from `test` before predicting, so
# this relies on `test` having kept the same row order as sample_sub — confirm.
sample_sub.scalar_coupling_constant = Y_pred
del Y_pred

In [0]:
sample_sub.head()

Unnamed: 0,id,scalar_coupling_constant
0,4658147,12.103838
1,4658148,179.321172
2,4658149,6.202224
3,4658150,181.443416
4,4658151,9.499162


In [0]:
len(sample_sub)

2505542

In [0]:
sample_sub.isnull().sum()

id                          0
scalar_coupling_constant    0
dtype: int64

In [0]:
sample_sub.to_csv('Predicting_Molecular_Properties_1.gz', index=False, compression='gzip')

In [0]:
# Download the gzipped submission through the google.colab helper; this import
# is only available when the notebook runs inside Google Colab.
from google.colab import files
files.download('Predicting_Molecular_Properties_1.gz')

In [0]:
# TODO — ideas for improving the model:
# - Train a predictor for the values in the scalar_coupling dataframe, predict them on the test set, and use those predictions as extra data for the ensemble (the scalar_coupling and train datasets match).
# - Or use MICE to fill in the data missing from the test set, using scalar_coupling together with the training set.
# - Predict potential energy, Mulliken charges and the other features for the test set as well.
# - Use a separate model for each coupling type.
# - Ensemble with random forest, extra trees and more CatBoost models, each possibly trained on part of the full data.

# THINGS TO PREDICT FOR THE TEST SET:
# dipole_moments, potential_energy, mulliken_charges, fc, sd, pso, dso