# Kaggle Molecular energy estimation. RUCODE 5.0
----
https://github.com/Sankek/MolecularEnergyEstimation

https://www.kaggle.com/competitions/molecular-energy-estimation-rucode

In [None]:
# Logs into github account to make changes in repository

from getpass import getpass
username = getpass('User name: ')
email = getpass('Email: ')
token = getpass('Password: ')
!git init .
!git remote add -f origin "https://"$username":"$token"@github.com/Sankek/MolecularEnergyEstimation.git"
!git config user.name $username
!git config user.email $email
!git pull origin master

del username
del token
del email

In [None]:
# Upload kaggle.json with token to download and send files from competition.
# Read the docs: https://www.kaggle.com/docs/api

from google.colab import files
files.upload()

!mkdir /root/.kaggle
!mv kaggle.json /root/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

from kaggle.api.kaggle_api_extended import KaggleApi

In [None]:
# Google Colab removed the ability to mount a Google Drive that belongs to an
# account other than the current one; this workaround still makes it possible.
#
# Follow the link that appears in the output and log in to the account that
# owns the desired Google Drive.
#
# After that, run the next cell.
#
# https://stackoverflow.com/questions/69819761/i-cant-mount-my-another-drive-in-google-colab/70797774#70797774
#
# Note: with `2>&1 > /dev/null` only stdout is discarded; stderr is still shown.

!sudo add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!sudo apt-get update -qq 2>&1 > /dev/null
!sudo apt -y install -qq google-drive-ocamlfuse 2>&1 > /dev/null
!google-drive-ocamlfuse

In [None]:
!sudo apt-get install -qq w3m # to act as web browser 
!xdg-settings set default-web-browser w3m.desktop # to set default browser
%cd /content
!mkdir drive
%cd drive
!mkdir MyDrive
%cd ..
%cd ..
!google-drive-ocamlfuse /content/drive/MyDrive
%cd /content

In [None]:
!pip install ase
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric pyg-nightly -f https://data.pyg.org/whl/torch-1.10.0+cu113.html

# pyg-nightly required to fix error in DimeNet.from_qm9_pretrained
# https://githubhot.com/repo/rusty1s/pytorch_geometric/issues/4425

In [None]:
import os
import os.path as osp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

from sklearn.model_selection import train_test_split

import ase.db
from ase.db import connect
from ase.visualize import view
import ase

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch_geometric as tg
import torch_geometric.data as tgd
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool, BatchNorm

from tqdm.notebook import tqdm

from dataset import MOSESDataset
from training import long_train
from utils import plot_loss, predict_test, make_submission, load_config, load_model
from compare_models import write_tensorboard_losses

In [None]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
# Directory on the mounted Drive that holds the saved models.
SAVED_MODELS_PATH = '/content/drive/MyDrive/Projects/MolecularEnergyEstimation/saved'

# Maximum length (in angstroms) for creating an edge in the molecular graph.
link_cutoff = 7

In [None]:
!cp -r /content/drive/MyDrive/Projects/MolecularEnergyEstimation/cutoff7/raw .
!cp -r /content/drive/MyDrive/Projects/MolecularEnergyEstimation/cutoff7/processed .

In [None]:
competition_name = 'molecular-energy-estimation-rucode'
val_size = 0.2

# Build the three splits in the order train, val, test with identical settings.
train_dataset, val_dataset, test_dataset = (
    MOSESDataset(competition_name, '.', split_name, val_size=val_size)
    for split_name in ('train', 'val', 'test')
)

In [None]:
batch_size = 32

# Split name -> dataset.
dataset = {'train': train_dataset, 'val': val_dataset, 'test': test_dataset}

# Split name -> loader; only the training split is shuffled.
dataloader = {
    split_name: DataLoader(split_data, batch_size=batch_size, shuffle=(split_name == 'train'))
    for split_name, split_data in dataset.items()
}

# Keep the individual loader names available as well.
train_loader, val_loader, test_loader = (
    dataloader[split_name] for split_name in ('train', 'val', 'test')
)

In [None]:
from mxmnet import MXMNet

# Mapping from atomic numbers to contiguous indices used for assigning embeddings.
el_to_idx_dict = {
    8: 0,   # O
    17: 1,  # Cl
    9: 2,   # F
    35: 3,  # Br
    1: 4,   # H
    7: 5,   # N
    16: 6,  # S
    6: 7,   # C
}
unique_elements_num = len(el_to_idx_dict)

def element_to_idx(el):
    """Translate atomic numbers into embedding indices.

    Args:
        el: array-like of atomic numbers (anything `np.asarray` accepts,
            e.g. a list, a NumPy array or a CPU tensor).

    Returns:
        np.ndarray of dtype int64 with the same shape as `el`, holding
        `el_to_idx_dict[z]` for every atomic number `z`.

    Raises:
        KeyError: if `el` contains an atomic number missing from `el_to_idx_dict`.
    """
    el = np.asarray(el)
    # Look every distinct element up once, then scatter back through the inverse index.
    u, inv = np.unique(el, return_inverse=True)
    unknown = [int(x) for x in u if x not in el_to_idx_dict]
    if unknown:
        raise KeyError('unsupported atomic number(s): {}'.format(unknown))
    lookup = np.array([el_to_idx_dict[x] for x in u], dtype=np.int64)
    return lookup[inv].reshape(el.shape)

# Keyword arguments for the MXMNet constructor.
model_parameters = {
    'unique_elements_num': unique_elements_num,
    'dim': 128,
    'n_layer': 3,
    'cutoff': 15,
    'num_spherical': 7,
    'num_radial': 6,
    'envelope_exponent': 5,
}

# Identification of this run.
model_group = 'MXMNet'
model_name = 'MXMNet_v2_bs32'

# Loss function and its name.
criterion_name = 'L1Loss'
criterion = torch.nn.L1Loss()

# Model on the selected device and its Adam optimizer.
start_lr = 1e-4
model = MXMNet(**model_parameters).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=start_lr)

def output_call(input):
    """Run the global `model` on a batch and return its output as a column.

    Sets `input.x` to the embedding indices computed from the atomic numbers
    in `input.z` (moved to DEVICE), then returns `model(input)` viewed as
    shape (-1, 1). Note that `input` is modified in place.
    """
    atom_indices = element_to_idx(input.z.cpu())
    input.x = torch.LongTensor(atom_indices).to(DEVICE)

    prediction = model(input)
    return prediction.view(-1, 1)

In [None]:
# Run settings, followed by the model constructor arguments.
model_config = {
    'model_name': model_name,
    'start_lr': start_lr,
    'criterion_name': criterion_name,
    'batch_size': batch_size,
    'save': True,
    'save_path': osp.join(SAVED_MODELS_PATH, model_group),
    'model_group': model_group,
    'link_cutoff': link_cutoff,
    'device': str(DEVICE),
    **model_parameters,
}

# Mutable training state: live objects plus the loss history so far.
model_state = dict(
    model=model,
    optimizer=optimizer,
    train_losses=[],
    val_losses=[],
    output_call=output_call,
    criterion=criterion,
    lr=start_lr,
    trained_epochs=0,
)

In [None]:
# Calls utils.load_model on the current state and config.
# NOTE(review): 110 is presumably the epoch number of the checkpoint to
# restore — confirm against utils.load_model.
load_model(model_state, model_config, 110)

In [None]:
# Run training.long_train with epochs=10 on the train/val/test loaders.
# NOTE(review): new_lr=None presumably keeps the current learning rate — confirm.
long_train(model_state, model_config, dataloader, epochs=10, new_lr=None)

In [None]:
# Plot the losses stored in model_state.
# NOTE(review): start_from=5 presumably skips the earliest epochs so the
# curve is easier to read — confirm against utils.plot_loss.
plot_loss(model_state, start_from=5)

In [None]:
# output_call must already be defined (in an earlier cell) for this to run successfully.
# NOTE(review): presumably writes the losses of every saved model under
# SAVED_MODELS_PATH for TensorBoard — confirm against compare_models.
write_tensorboard_losses(SAVED_MODELS_PATH, device=DEVICE)

loaded SchNet
loaded SchNet_default
loaded SchNet_default_full_train
loaded SchNet_15_04_22
loaded SchNet_v3
loaded SchNet_v4
loaded SchNet_v4_run2
loaded SchNet_v4_run3_bs32
loaded SchNet_v4_run4_bs16
loaded SchNet_v4_run3_bs32_continued
loaded SchNet_v4_run3_bs32_continued_again
loaded MXMNet_v1_bs64
loaded MXMNet_v2_bs64
loaded MXMNet_v3_bs64
loaded MXMNet_v2_bs32
loaded DimeNet
loaded DimeNet_v2


In [None]:
# Load the TensorBoard notebook extension and open it on the `runs` log directory.
%load_ext tensorboard
%tensorboard --logdir=runs

In [None]:
# !ls {osp.join(SAVED_MODELS_PATH, 'SchNet')} | grep run3
# osp.join(SAVED_MODELS_PATH, 'SchNet', 'SchNet_v4_run2_120ep.pth')

In [None]:
# Get predictions from utils.predict_test and pass them to utils.make_submission
# together with the competition name.
# NOTE(review): presumably predicts on the test split and writes/sends the
# Kaggle submission — confirm against utils.
energies = predict_test(model_state, model_config, dataloader)
make_submission(energies, competition_name)