In [1]:
import numpy as np 
import pandas as pd

In [2]:
default_path = '../input/'

df_train = pd.read_csv(default_path+'train.csv')
df_test  = pd.read_csv(default_path+'test.csv')
df_struct = pd.read_csv(default_path+'structures.csv')
unique_type = list(set(df_test['type']))

elements = []
for i in unique_type: 
    elements.append(i[2])
    elements.append(i[3])
unique_elements = list(set(elements))

In [3]:
# Gives a normalized list of values for each of the TYPE of coupling
def give_uniqueId(lis): 
    l = np.size(lis)
    fin = np.zeros((l,1))
    for i in range(0,l):
        fin[i] = (unique_type.index(lis[i])+1)
    return fin

def give_uniqueElement(lis):
    element = np.frompyfunc(lambda x:x[3:4],1,1)(lis)
    uniqueID = np.frompyfunc(lambda x:unique_elements.index(x)-1,1,1)(element)
    return uniqueID
def giveFirst(lis):
    return np.frompyfunc(lambda x:int(x[0:1])-1,1,1)(lis)
    
# Returns dataframe merging xyz values
def map_atom_info(df, atom_idx):
    df = pd.merge(df, df_struct, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

In [4]:
# Merging the x,y,z values into the train and test dataframes
train_df = map_atom_info(map_atom_info(df_train,0),1)
test_df = map_atom_info(map_atom_info(df_test,0),1)

In [5]:
#First parameter of the type
train_df['typeval']= giveFirst(train_df['type'].values)
test_df['typeval']= giveFirst(test_df['type'].values)
# Giving last 2nd element as a parameter as first element is always hydrogen
train_df['elem'] = give_uniqueElement(train_df['type'].values)
test_df['elem']  = give_uniqueElement(test_df['type'].values)
# Assigning unique numerical values to the type 
train_df['type'] = give_uniqueId(train_df['type'].values)
test_df['type'] = give_uniqueId(test_df['type'].values)

In [6]:
train_df.head(4)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,typeval,elem
0,0,dsgdb9nsd_000001,1,0,3.0,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001,0,0
1,1,dsgdb9nsd_000001,1,2,5.0,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277,1,1
2,2,dsgdb9nsd_000001,1,3,5.0,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644,1,1
3,3,dsgdb9nsd_000001,1,4,5.0,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397,1,1


In [7]:
test_df.head(3)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,typeval,elem
0,4658147,dsgdb9nsd_000004,2,0,4.0,H,-1.661639,0.0,1.0,C,0.599539,0.0,1.0,1,0
1,4658148,dsgdb9nsd_000004,2,1,3.0,H,-1.661639,0.0,1.0,C,-0.599539,0.0,1.0,0,0
2,4658149,dsgdb9nsd_000004,2,3,8.0,H,-1.661639,0.0,1.0,H,1.661639,0.0,1.0,2,1


In [8]:
train_df['dx'] = train_df['x_0']-train_df['x_1']
train_df['dy'] = train_df['y_0']-train_df['y_1']
train_df['dz'] = train_df['z_0']-train_df['z_1']
test_df['dx'] = test_df['x_0']-test_df['x_1']
test_df['dy'] = test_df['y_0']-test_df['y_1']
test_df['dz'] = test_df['z_0']-test_df['z_1']

In [9]:
train_df.head(4)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,typeval,elem,dx,dy,dz
0,0,dsgdb9nsd_000001,1,0,3.0,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001,0,0,0.014849,-1.091835,-0.006025
1,1,dsgdb9nsd_000001,1,2,5.0,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277,1,1,-1.00958,-1.469782,0.0017
2,2,dsgdb9nsd_000001,1,3,5.0,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644,1,1,0.542965,-1.453558,0.87862
3,3,dsgdb9nsd_000001,1,4,5.0,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397,1,1,0.525964,-1.443964,-0.904421


In [10]:
# Assigning features and lables
X_train = train_df[['x_0','y_0','z_0','x_1','y_1','z_1','dx','dy','dz','elem','typeval','type']].values
y_train = train_df['scalar_coupling_constant'].values
X_test = test_df[['x_0','y_0','z_0','x_1','y_1','z_1','dx','dy','dz','elem','typeval','type']].values

In [11]:
X_train

array([[0.002150416, -0.0060313176, 0.0019761204, ..., 0, 0, 3.0],
       [0.002150416, -0.0060313176, 0.0019761204, ..., 1, 1, 5.0],
       [0.002150416, -0.0060313176, 0.0019761204, ..., 1, 1, 5.0],
       ...,
       [1.12654988, -1.3487328980000002, -1.933838381, ..., 0, 2, 1.0],
       [1.12654988, -1.3487328980000002, -1.933838381, ..., 0, 1, 4.0],
       [1.12654988, -1.3487328980000002, -1.933838381, ..., 0, 0, 3.0]],
      dtype=object)

In [12]:
# Preprocessing : Making the mean of features to 0
from sklearn import preprocessing
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)

In [13]:
# Importing requirements for dnn
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [14]:
def DNN(num_layers,train_X,train_y):
    #create model
    model = Sequential()
    #get number of columns in training data
    n_cols = train_X.shape[1]
    #num_layers = [200,30,15,10]
    # Input Layer #200
    model.add(Dense(num_layers[0], activation='relu', input_shape=(n_cols,)))
    
    # Hidden Layers and Output Layer #index 1,2  1->len(num_layers)-2 
    for i in range(1,len(num_layers)-1):
        model.add(Dense(num_layers[i], activation='relu'))
    model.add(Dense(num_layers[-1],activation='linear'))
    model.compile(optimizer='adamax', loss='mean_squared_error')
    
    #set early stopping monitor so the model stops training when it won't improve anymore
    early_stopping_monitor = EarlyStopping(patience=5)
    #train model
    model.fit(train_X, train_y, 
              validation_split=0.1, epochs=10, callbacks=[early_stopping_monitor])
    return model

In [15]:
model = DNN([10000,1200,500,300,200,100,1],X_train,y_train)
pred = model.predict(X_test)

Train on 4192332 samples, validate on 465815 samples
Epoch 1/10

In [16]:
def submit(predictions):
    submit = pd.read_csv(default_path+'sample_submission.csv')
    print(len(submit), len(predictions))   
    submit["scalar_coupling_constant"] = predictions
    submit.to_csv("submission.csv", index=False)

In [17]:
submit(pred)

2505542 2505542
