In [158]:
## Libraries
# General purpose libraries
import os
import pickle
import random
import numpy as np
import pandas as pd
import zipfile
from matplotlib import pyplot

# Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, LinearRegression
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Input, Lambda
from tensorflow.keras.utils import plot_model
from sklearn.metrics import mean_squared_error
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [159]:
## Load the raw tables
# The feature/label tables are read from zipped CSVs; the submission template
# (`sample`) is a plain CSV.
# Original author's notes: train labels = "E gap", pretrain labels = "only Elumo".
# NOTE(review): the original note on test_features said "1000 id each one has
# 1000 features", but the recorded sample.shape output shows 10000 rows -- confirm.
(train_features,
 train_labels,
 pretrain_features,
 pretrain_labels,
 test_features) = (
    pd.read_csv(archive, compression='zip')
    for archive in (
        'train_features.csv.zip',
        'train_labels.csv.zip',
        'pretrain_features.csv.zip',
        'pretrain_labels.csv.zip',
        'test_features.csv.zip',
    )
)
sample = pd.read_csv('sample.csv')

In [160]:
# Strip the 'Id' (and, for feature tables, 'smiles') columns.
# NOTE: every name is rebound to its trimmed frame, so re-running this cell on
# already-trimmed frames raises KeyError because the columns are gone.
train_features = train_features.drop(columns=['Id', 'smiles'])
pretrain_features = pretrain_features.drop(columns=['Id', 'smiles'])
test_features = test_features.drop(columns=['Id', 'smiles'])
train_labels = train_labels.drop(columns='Id')
pretrain_labels = pretrain_labels.drop(columns='Id')

In [161]:
# Hold out 33% of the pretraining rows for evaluating the base model.
# A fixed random_state makes the split (and therefore the reported hold-out
# score) reproducible across kernel restarts.
X_pretrain, X_pretest, y_pretrain, y_pretest = train_test_split(
    pretrain_features,
    pretrain_labels,
    test_size=0.33,
    random_state=42,
)

In [162]:
# Input widths (number of feature columns) used to size the network inputs
pretrain_input_shape = len(pretrain_features.columns)
train_input_shape = len(train_features.columns)

In [220]:
# Base network that is pre-trained on the pretrain_features table:
# three ReLU hidden layers (500 -> 200 -> 100, He-normal init) and one output unit.
base_model = Sequential([
    Dense(500, activation='relu', kernel_initializer='he_normal',
          input_shape=(pretrain_input_shape,)),
    Dropout(0.3),
    Dense(200, activation='relu', kernel_initializer='he_normal'),
    # (a second Dropout(0.3) after this layer was tried and is currently disabled)
    Dense(100, activation='relu', kernel_initializer='he_normal'),
    Dense(1),  # single output unit, no activation
])

In [221]:
# Print the layer-by-layer summary (output shapes, parameter counts) of the base model
base_model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_100 (Dense)           (None, 500)               500500    
                                                                 
 dropout_18 (Dropout)        (None, 500)               0         
                                                                 
 dense_101 (Dense)           (None, 200)               100200    
                                                                 
 dense_102 (Dense)           (None, 100)               20100     
                                                                 
 dense_103 (Dense)           (None, 1)                 101       
                                                                 
Total params: 620,901
Trainable params: 620,901
Non-trainable params: 0
_________________________________________________________________


In [222]:
# Configure base-model training: Adam optimizer, mean-squared-error loss
base_model.compile(
    loss='mse',
    optimizer='adam',
)

In [None]:
###### Fit the base model
# Train only on the training part of the split (X_pretrain / y_pretrain) so that
# X_pretest / y_pretest stay unseen and the score below is a genuine hold-out
# estimate. Fitting on the full pretrain tables would leak the evaluation rows.
print("Training model...")
# Scale the learning rate by 0.2 whenever val_loss fails to improve for 1 epoch,
# never going below 1e-6.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=1, min_lr=0.000001)
history = base_model.fit(
    x=X_pretrain,
    y=y_pretrain,
    epochs=10,
    batch_size=16,
    validation_split=0.15,  # 15% of the training rows are set aside to compute val_loss
    callbacks=[reduce_lr],
)
print("Training completed!")
# make predictions on the held-out pretraining rows
yhat = base_model.predict(X_pretest)
# evaluate predictions: mean_squared_error returns the MSE, so take the square
# root to actually report the RMSE that the message announces
rmse = np.sqrt(mean_squared_error(y_pretest, yhat))
print('RMSE: %.3f' % rmse)

Training model...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [None]:
# Plot the base model's learning curves.
# The tracked 'loss' is the compiled loss ('mse'), so the y-axis is MSE, not RMSE.
pyplot.title('Learning Curves')
pyplot.xlabel('Epoch')
pyplot.ylabel('MSE')
pyplot.plot(history.history['loss'], label='train')
# A validation curve exists whenever fit() was given validation data
if 'val_loss' in history.history:
    pyplot.plot(history.history['val_loss'], label='validation')
pyplot.legend()
pyplot.show()

In [None]:
# Freeze the base model (pre-trained on the pretrain dataset) so that its weights
# are not updated while the new head is trained on the train dataset.
base_model.trainable = False

In [None]:
# Transfer model for the train dataset: the base model followed by a small dense head.
# NOTE(review): base_model ends in Dense(1), so the head only receives the base
# model's single output value rather than its hidden features -- confirm this is
# the intended transfer setup.
# The input shape is passed as a tuple, matching how the base model's own input
# shape is declared; a bare int is not a valid shape tuple.
x_in = Input(shape=(train_input_shape,))
# include the base model; training=False runs it in inference mode (Dropout inactive)
x = base_model(x_in, training=False)
# trainable head: 100 -> 50 -> 10 ReLU units, then one output unit
x = layers.Dense(100, activation="relu", kernel_initializer='he_normal')(x)
x = layers.Dense(50, activation="relu", kernel_initializer='he_normal')(x)
x = layers.Dense(10, activation="relu", kernel_initializer='he_normal')(x)
x_out = Dense(1)(x)
# define the model
model = Model(inputs=x_in, outputs=x_out)

In [None]:
# Print the summary of the transfer model (the base model appears as one nested layer)
model.summary()

In [None]:
# Compile the transfer model: Adam with its default settings, MSE loss
adam = Adam()
model.compile(loss='mse', optimizer=adam)

In [None]:
# First training phase on the train dataset
initial_epochs = 10
print("Training model...")
history = model.fit(
    x=train_features,
    y=train_labels,
    batch_size=2,
    epochs=initial_epochs,
)
print("Training completed!")

In [None]:
# Plot the training loss of the first training phase.
# The tracked 'loss' is the compiled loss ('mse'), so the y-axis is MSE, not RMSE.
pyplot.title('Learning Curves')
pyplot.xlabel('Epoch')
pyplot.ylabel('MSE')
pyplot.plot(history.history['loss'], label='train')
pyplot.legend()
pyplot.show()

In [None]:
# Fine-tuning setup: make the base model trainable again, then re-freeze its
# first layers so only the later ones are updated.
base_model.trainable = True

# Index of the first base-model layer that remains trainable
fine_tune_at = 2

# Report the depth of the base model for reference
print("Number of layers in the base model: ", len(base_model.layers))

# Every layer located before index `fine_tune_at` stays frozen
for frozen_layer in base_model.layers[:fine_tune_at]:
    frozen_layer.trainable = False

In [None]:
# Re-compile the model for fine-tuning, using a small learning rate
tuning_learning_rate = 1e-5
adam = Adam(learning_rate=tuning_learning_rate)
model.compile(loss='mse', optimizer=adam)

In [None]:
# Print the model summary again to inspect the trainable / non-trainable parameter counts
model.summary()

In [None]:
# Fine-tune for `fine_tune_epochs` additional epochs.
# Keras treats `epochs` as the index of the final epoch, not as a count of epochs
# to run, so `initial_epoch` has to be given for training to resume at epoch
# `initial_epochs`; without it all `total_epochs` epochs would be run again.
fine_tune_epochs = 5
total_epochs = initial_epochs + fine_tune_epochs
history_fine = model.fit(
    x=train_features,
    y=train_labels,
    epochs=total_epochs,
    initial_epoch=initial_epochs,
    batch_size=1,
)

In [None]:
# Plot the training loss of the fine-tuning run.
# Uses `history_fine` (the fine-tuning fit) rather than `history`, which still
# holds the earlier training phase. The loss is MSE, so the axis is labelled MSE.
pyplot.title('Learning Curves')
pyplot.xlabel('Epoch')
pyplot.ylabel('MSE')
# history_fine.epoch carries the actual epoch indices of the run
pyplot.plot(history_fine.epoch, history_fine.history['loss'], label='train')
pyplot.legend()
pyplot.show()

In [None]:
# Second transfer model for the train dataset (the middle head layer has 30 units).
# NOTE(review): this rebinds `model`, so the head trained and fine-tuned in the
# earlier cells is discarded and a freshly initialised head is trained instead;
# base_model keeps whatever trainable flags the fine-tuning cell left on it.
# Confirm that this restart is intended.
# The input shape is passed as a tuple, matching how the base model's own input
# shape is declared; a bare int is not a valid shape tuple.
x_in = Input(shape=(train_input_shape,))
# include the base model; training=False runs it in inference mode (Dropout inactive)
x = base_model(x_in, training=False)
# head: 100 -> 30 -> 10 ReLU units, then one output unit
x = layers.Dense(100, activation="relu", kernel_initializer='he_normal')(x)
x = layers.Dense(30, activation="relu", kernel_initializer='he_normal')(x)
x = layers.Dense(10, activation="relu", kernel_initializer='he_normal')(x)
x_out = Dense(1)(x)
# define the model
model = Model(inputs=x_in, outputs=x_out)
# Compile with a default Adam optimizer and MSE loss
adam = tf.keras.optimizers.Adam()
model.compile(optimizer=adam, loss='mse')
# Train the new model for initial_epochs + fine_tune_epochs epochs in one run
fine_tune_epochs = 5
total_epochs = initial_epochs + fine_tune_epochs
history_fine = model.fit(x=train_features, y=train_labels, epochs=total_epochs, batch_size=1)

In [None]:
# Predict the target for every row of the test features with the current `model`
y_test = model.predict(test_features)

In [None]:
# Quick visual check of the raw prediction array
print(y_test)

In [None]:
# Wrap the prediction array in a DataFrame (default integer row/column labels)
y_test_df = pd.DataFrame(y_test)

In [None]:
# Show the first rows of the predictions
y_test_df.head()

In [None]:
# Write the predictions into the submission template's 'y' column.
# A flat array is assigned instead of a DataFrame: the values are then matched
# by position, and a length mismatch raises instead of being silently filled
# with NaN through index alignment.
sample['y'] = y_test_df.to_numpy().ravel()

In [None]:
# Inspect the first 50 rows of the filled-in submission table
sample.head(50)

In [191]:
# Check the (rows, columns) of the submission table before writing it out
sample.shape

(10000, 2)

In [192]:
# Save the submission as CSV; index=False keeps the row index out of the file
sample.to_csv('prediction_plg.csv', index=False)