In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Initialization: importing the packages that we will use
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu' # Google colab offers time limited use of GPU for free

# Training parameters 
BATCH_SIZE = 64

In [3]:
# For loading the data
from PIL import Image
import os
import os.path

import torchvision
import torch.utils.data
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

import numpy as np
import pandas as pd
import random

In [4]:
# For constructing the network
import torch.optim as optim
import torch.utils.data
import torch.nn as nn
import torchvision.models as models
import torch.utils.data
import torch.backends.cudnn as cudnn

In [5]:
from tqdm import tqdm

In [6]:
# Solve the imshow dead kernel problem
import os    
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [7]:
'''
Start loading the data
'''
print('================== START LOADING DATA ==================')



In [8]:
path_drive = '/content/drive/My Drive/'

In [9]:
!cp /content/drive/MyDrive/Task4/pretrain_features.csv /content
!cp /content/drive/MyDrive/Task4/pretrain_labels.csv /content
!cp /content/drive/MyDrive/Task4/test_features.csv /content
!cp /content/drive/MyDrive/Task4/train_features.csv /content
!cp /content/drive/MyDrive/Task4/train_labels.csv /content

In [10]:
r1 = pd.read_csv('pretrain_features.csv')  # features
r2 = pd.read_csv('pretrain_labels.csv')  # labels


print(r1.shape)
print(r2.shape)

# Merge two dataframes
all_data_st = pd.merge(r2, r1, on='Id')
print(all_data_st.shape)
print(all_data_st.head(10))

# # Output to csv
# all_data_st.to_csv("training_set.csv", index=None)

(50000, 1002)
(50000, 2)
(50000, 1003)
   Id  lumo_energy                                             smiles  \
0   0    -3.111521  c1occ2c1c1ccc3cscc3c1c1ncc3cc(ccc3c21)-c1cccc2...   
1   1    -3.219118  C1C=c2c(cc3ncc4c5[SiH2]C=Cc5oc4c3c2=C1)-c1scc2...   
2   2    -3.114145  C1C=c2c3cccnc3c3c4c[nH]cc4c4cc(cnc4c3c2=C1)-c1...   
3   3    -3.161867  [SiH2]1C=Cc2c1csc2-c1cnc2c(c1)c1ccccc1c1cc3ccc...   
4   4    -3.687744        c1occ2c1c(cc1[se]c3ccncc3c21)-c1cccc2nsnc12   
5   5    -2.791261  [SiH2]1C=Cc2[nH]c3c(oc4cc(sc34)-c3scc4cc[se]c3...   
6   6    -3.688235          c1ccc(nc1)-c1cc2ncc3c4cnccc4sc3c2c2nsnc12   
7   7    -3.243368  C1C=c2c3cccnc3c3c4cocc4c4C=C(Cc4c3c2=C1)c1scc2...   
8   8    -3.508069  c1cc2csc(-c3cc4c5cscc5c5c6occc6c6cscc6c5c4c4ns...   
9   9    -3.440629  [SiH2]1C=c2c3cc(oc3c3c4cocc4c4ccc5cscc5c4c3c2=...   

   feature_0000  feature_0001  feature_0002  feature_0003  feature_0004  \
0           0.0           0.0           0.0           0.0           0.0   
1      

In [11]:
# all_data_st = all_data_st.sample(frac=1)
# print(all_data_st.head(10))
all_data_st_noidsmiles = all_data_st.drop(['Id', 'smiles'], axis=1)
print(all_data_st_noidsmiles.head(10))
all_data_array = np.array(all_data_st_noidsmiles)
all_data_tensor = torch.tensor(all_data_array)
print(all_data_tensor)

   lumo_energy  feature_0000  feature_0001  feature_0002  feature_0003  \
0    -3.111521           0.0           0.0           0.0           0.0   
1    -3.219118           0.0           0.0           0.0           0.0   
2    -3.114145           0.0           0.0           0.0           0.0   
3    -3.161867           0.0           0.0           0.0           0.0   
4    -3.687744           0.0           0.0           0.0           0.0   
5    -2.791261           0.0           0.0           0.0           1.0   
6    -3.688235           0.0           0.0           0.0           1.0   
7    -3.243368           0.0           0.0           0.0           0.0   
8    -3.508069           0.0           0.0           0.0           0.0   
9    -3.440629           0.0           0.0           0.0           0.0   

   feature_0004  feature_0005  feature_0006  feature_0007  feature_0008  ...  \
0           0.0           0.0           0.0           0.0           0.0  ...   
1           1.0          

In [12]:
torch.tensor(all_data_st_noidsmiles.iloc[0][1:], dtype=torch.double)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 1., 

In [13]:
# For reproducbility
cudnn.benchmark = False
cudnn.deterministic = True
np.random.seed(1998)
torch.manual_seed(1998)
torch.cuda.manual_seed(1998)
torch.backends.cudnn.deterministic = True
random.seed(1998)

In [14]:
'''
Start constructing the network
'''
print('================== START CONSTRUCTING NETWORK ==================')



In [15]:
class FeatureExtract(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.encoder_hidden_layer_1 = nn.Linear(
            in_features=kwargs["input_shape"], out_features=512
        )
        self.encoder_hidden_layer_2 = nn.Linear(
            in_features=512, out_features=256
        )
        self.encoder_hidden_layer_3 = nn.Linear(
            in_features=256, out_features=128
        )
        self.encoder_hidden_layer_4 = nn.Linear(
            in_features=128, out_features=64
        )
        self.prediction_layer = nn.Linear(
            in_features=64, out_features=1
        )
 
    def forward(self, features):
        activation_1 = self.encoder_hidden_layer_1(features)
        activation_1 = torch.relu(activation_1)
        activation_2 = self.encoder_hidden_layer_2(activation_1)
        activation_2 = torch.relu(activation_2)
        activation_3 = self.encoder_hidden_layer_3(activation_2)
        activation_3 = torch.relu(activation_3)
        activation_4 = self.encoder_hidden_layer_4(activation_3)
        activation_4 = torch.relu(activation_4)
        output = self.prediction_layer(activation_4)
        return output

In [16]:
model = FeatureExtract(input_shape=1000).to(device)

In [17]:
# Construct the loss and optimizer
criterion = nn.MSELoss()

optimizer = optim.SGD(model.parameters(),
                            lr=0.0005,
                            momentum=0.9,
                            weight_decay=2e-3,#The value used in the paper is 1e-3
                            nesterov=True)

In [18]:
len(all_data_st_noidsmiles)

50000

In [19]:
training = all_data_st_noidsmiles.sample(frac=0.95, random_state=1998)

validation = all_data_st_noidsmiles.drop(training.index)

print(len(training))
print(len(validation))

47500
2500


In [20]:
input = torch.tensor(training.iloc[0][1:], dtype=torch.float).to(device)
result = model(input)
loss = criterion(result, torch.tensor(training.iloc[1][0:1], dtype=torch.float).to(device))
print(loss)

tensor(8.0797, device='cuda:0', grad_fn=<MseLossBackward0>)


In [21]:
def val(model, criterion, valset, datapoint):

  torch.cuda.empty_cache()
  
  validation_loss_sum = 0

  for i in range(len(valset)):

    # Get one input from the validation set
    input = torch.tensor(valset.iloc[i][1:], dtype=torch.float).to(device)
    
    # Calculate the output
    output = model(input)

    # Calculate the MSE loss
    validation_loss_point = criterion(output, torch.tensor(valset.iloc[i][0:1], dtype=torch.float).to(device))

    # update validation_loss
    validation_loss_sum += validation_loss_point.item()

  print(f'{datapoint} Validation Loss: {validation_loss_sum / len(valset):.3f}')

  torch.cuda.empty_cache()


In [22]:
def train(model, criterion, optimizer, epochs, training_set, validation_set):
  
  # Empty the cache of CUDA  
  torch.cuda.empty_cache()
  
  print('================== START TRAINING ==================')

  # Change to train mode
  model.train()

  for epoch in range(epochs):
    running_loss = 0
    for data_point in range(len(training_set)):

      # Get one input from the training set
      input = torch.tensor(training_set.iloc[data_point][1:], dtype=torch.float).to(device)

      # Calculate its corresponding output
      result = model(input)
      # print(result)

      # Calculate the MSE loss
      loss = criterion(result, torch.tensor(training_set.iloc[data_point][0:1], dtype=torch.float).to(device))
      if ((data_point+1) % 100) == 0:
        print(f'training {data_point+1} loss: {loss}')

      # Zero the gradient
      optimizer.zero_grad()
                
      # Back prop and update
      loss.backward()
      optimizer.step()

      running_loss += loss.item()

      # Validation
      if ((data_point+1) % 5000) == 0:
        # Change to evaluation mode
        model.eval()

        val(model, criterion, validation_set, data_point+1)

        # Change back to training mode
        model.train()

    print(f'[{epoch + 1}] average loss per epoch: {running_loss / len(training_set):.3f}')

    save_path = f'/content/drive/My Drive/Task4/test5/model_epoch{epoch+1}.pt'
    torch.save({'epoch': epoch+1, 'model_state_dict': model.state_dict()}, save_path)
    print(f'Saved model checkpoint to {save_path}')

In [23]:
train(model, criterion, optimizer, 5, training, validation)

training 100 loss: 0.006629479117691517
training 200 loss: 0.1727408468723297
training 300 loss: 0.17348702251911163
training 400 loss: 0.05562751740217209
training 500 loss: 0.3524382710456848
training 600 loss: 0.05999352037906647
training 700 loss: 0.0006808076286688447
training 800 loss: 0.0036711804568767548
training 900 loss: 0.010571641847491264
training 1000 loss: 0.06037874147295952
training 1100 loss: 0.07002365589141846
training 1200 loss: 0.0031184267718344927
training 1300 loss: 0.004192217253148556
training 1400 loss: 0.00508978171274066
training 1500 loss: 0.015186920762062073
training 1600 loss: 0.006078209728002548
training 1700 loss: 0.010147289372980595
training 1800 loss: 0.06858671456575394
training 1900 loss: 0.0006138374446891248
training 2000 loss: 0.001844224170781672
training 2100 loss: 0.0010619936510920525
training 2200 loss: 0.015589496120810509
training 2300 loss: 0.00087946024723351
training 2400 loss: 0.10230690240859985
training 2500 loss: 0.00788085907

In [24]:
feature_gap = pd.read_csv('train_features.csv')  # features
label_gap = pd.read_csv('train_labels.csv')  # labels


print(feature_gap.shape)
print(label_gap.shape)

# Merge two dataframes
train_gap_data = pd.merge(label_gap, feature_gap, on='Id')
print(train_gap_data.shape)
print(train_gap_data.head(10))

train_gap_data_noidsmiles = train_gap_data.drop(['Id', 'smiles'], axis=1)
print(train_gap_data_noidsmiles.head(10))

(100, 1002)
(100, 2)
(100, 1003)
      Id  homo_lumo_gap                                             smiles  \
0  50000       2.052872    C1C=c2c3ccoc3c3c4ccccc4c(cc3c2=C1)-c1scc2ccsc12   
1  50001       1.325530  c1cc([se]c1-c1sc(-c2cccc3nsnc23)c2nccnc12)-c1c...   
2  50002       1.837294  [SiH2]1C=CC=C1c1cc2cnc3c(sc4ccc5c[nH]cc5c34)c2...   
3  50003       1.388601  C1C=c2ccc3c4cocc4c4c([se]c5cc(-c6cccs6)c6nsnc6...   
4  50004       0.991851  C1c(ccc1-c1sc(-c2nccc3nsnc23)c2ccoc12)-c1scc2c...   
5  50005       1.181848  c1c[nH]c(c1)-c1sc(-c2ccc(-c3scc4[se]ccc34)c3ns...   
6  50006       1.469864  C1C(=Cc2c1c1cnc3ccc4=C[SiH2]C=c4c3c1c1c[nH]cc2...   
7  50007       1.780535  c1c[nH]c(c1)-c1ccc([nH]1)-c1sc(-c2scc3ccoc23)c...   
8  50008       2.959695        c1cc2oc3c(ccc4cc(cnc34)-c3cccc4ccccc34)c2s1   
9  50009       2.224978  c1cc2csc(-c3cc4cc5c6c[nH]cc6c6cc[se]c6c5cc4[nH...   

   feature_0000  feature_0001  feature_0002  feature_0003  feature_0004  \
0           0.0           0.0    

In [25]:
# Calculate gradient except for the first layer
for name, param in model.named_parameters():
  print(name)
  if ('encoder_hidden_layer_1' in name):
    param.requires_grad = False
  else:
    param.requires_grad = True

encoder_hidden_layer_1.weight
encoder_hidden_layer_1.bias
encoder_hidden_layer_2.weight
encoder_hidden_layer_2.bias
encoder_hidden_layer_3.weight
encoder_hidden_layer_3.bias
encoder_hidden_layer_4.weight
encoder_hidden_layer_4.bias
prediction_layer.weight
prediction_layer.bias


In [26]:
model

FeatureExtract(
  (encoder_hidden_layer_1): Linear(in_features=1000, out_features=512, bias=True)
  (encoder_hidden_layer_2): Linear(in_features=512, out_features=256, bias=True)
  (encoder_hidden_layer_3): Linear(in_features=256, out_features=128, bias=True)
  (encoder_hidden_layer_4): Linear(in_features=128, out_features=64, bias=True)
  (prediction_layer): Linear(in_features=64, out_features=1, bias=True)
)

In [27]:
criterion_gap = nn.MSELoss()

# Update the parameters for the layers with gradient calculated
optimizer_gap = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                            lr=0.00005,
                            momentum=0.9,
                            weight_decay=2e-3,#The value used in the paper is 1e-3
                            nesterov=True)

def train_gap(model, criterion, optimizer, training_set_gap):
  # Empty the cache of CUDA  
  torch.cuda.empty_cache()
   
  print('================== START TRANSFER LEARNING ==================')

  # Change to train mode
  model.train()

  running_loss = 0
  for i in range(len(training_set_gap)):
    # Get one input from the training set
    input = torch.tensor(training_set_gap.iloc[i][1:], dtype=torch.float).to(device)

    # Calculate its corresponding output
    result = model(input)
    # print(result)

    # Calculate the MSE loss
    loss = criterion(result, torch.tensor(training_set_gap.iloc[i][0:1], dtype=torch.float).to(device))

    # print(f'training {i+1} loss: {loss}')

    # Zero the gradient
    optimizer.zero_grad()
              
    # Back prop and update
    loss.backward()
    optimizer.step()

    running_loss += loss.item()

  print(f'average loss per epoch: {running_loss / len(training_set_gap):.3f}')

  return model

In [28]:
# Print the weight before training
print("model.layer1.weight", model.encoder_hidden_layer_1.weight)
print("model.layer2.weight", model.encoder_hidden_layer_2.weight)
print("model.layer3.weight", model.encoder_hidden_layer_3.weight)
print("model.layer4.weight", model.encoder_hidden_layer_4.weight)
print("model.output.weight", model.prediction_layer.weight)

for i in range(4000):
  train_gap(model, criterion_gap, optimizer_gap, train_gap_data_noidsmiles)

# Print the weight after training
print("model.layer1.weight", model.encoder_hidden_layer_1.weight)
print("model.layer2.weight", model.encoder_hidden_layer_2.weight)
print("model.layer3.weight", model.encoder_hidden_layer_3.weight)
print("model.layer4.weight", model.encoder_hidden_layer_4.weight)
print("model.output.weight", model.prediction_layer.weight)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.017
average loss per epoch: 0.016
average loss per epoch: 0.016
average loss per epoch: 0.016
average loss per epoch: 0.016
average loss per epoch: 0.016
average loss per epoch: 0.016
avera

In [29]:
model(torch.tensor(train_gap_data_noidsmiles.iloc[95][1:], dtype=torch.float).to(device))

tensor([1.9445], device='cuda:0', grad_fn=<AddBackward0>)

In [30]:
# Start Prediction
test_feature = pd.read_csv('test_features.csv')
test_id = test_feature['Id']
print(test_id.head(10))
test_feature_noidsmiles = test_feature.drop(['Id', 'smiles'], axis=1)
print(test_feature_noidsmiles.head(10))

0    50100
1    50101
2    50102
3    50103
4    50104
5    50105
6    50106
7    50107
8    50108
9    50109
Name: Id, dtype: int64
   feature_0000  feature_0001  feature_0002  feature_0003  feature_0004  \
0           0.0           0.0           0.0           1.0           1.0   
1           0.0           0.0           0.0           0.0           0.0   
2           0.0           0.0           0.0           0.0           0.0   
3           0.0           0.0           0.0           0.0           1.0   
4           0.0           0.0           0.0           1.0           0.0   
5           0.0           0.0           0.0           1.0           0.0   
6           0.0           0.0           0.0           0.0           1.0   
7           0.0           0.0           0.0           1.0           0.0   
8           0.0           0.0           0.0           0.0           0.0   
9           0.0           0.0           0.0           0.0           0.0   

   feature_0005  feature_0006  feature_00

In [31]:
def predict(model, test_features, test_id, epoch_run):
  
  print('================== START PREDICTION ==================')
  model.eval()

  y = []
  for i in range(len(test_feature)):
    input = torch.tensor(test_features.iloc[i], dtype=torch.float).to(device)

    # Calculate its corresponding output
    with torch.no_grad():
      result = model(input)
      result = result.item()
      y.append(result)

  output_df = pd.DataFrame(y, columns=['y'])

  prediction = pd.concat([test_id, output_df], join='outer', axis=1)

  prediction.to_csv('/content/drive/MyDrive/Task4/test5/submission_epoch{0}.csv'.format(epoch_run), index=False, header=True)




In [32]:
predict(model, test_feature_noidsmiles, test_id, 5)

