In [23]:
import csv
import time
import numpy as np
import scipy.sparse as sp
from sklearn.metrics import accuracy_score, log_loss

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from tqdm import tqdm_notebook as tqdm
from data import load_data

In [48]:
adj, features, edge_features = load_data('D:\OneDrive\OneDrive - enpc.fr\Roman Cloud\\altegrad_challenge_2022\\')

In [7]:
# Split data into training and test sets
adj_train = list()
features_train = list()
edge_features_train = list()
y_train = list()
adj_test = list()
features_test = list()
edge_features_test = list()
proteins_test = list()
with open('data/graph_labels.txt', 'r') as f:
    for i,line in enumerate(f):
        t = line.split(',')
        if len(t[1][:-1]) == 0:
            proteins_test.append(t[0])
            adj_test.append(adj[i])
            features_test.append(features[i])
            edge_features_test.append(edge_features[i])

        else:
            adj_train.append(adj[i])
            features_train.append(features[i])
            y_train.append(int(t[1][:-1]))
            edge_features_train.append(edge_features[i])

In [8]:
import torch
from torch.utils.data import Dataset

class DGLGraphDataset(Dataset):
    def __init__(self, adj, features, edge_features, labels):
        self.adj = adj
        self.features = features
        self.edge_features = edge_features
        self.labels = labels

    def __len__(self):
        return len(self.adj)

    def __getitem__(self, idx):
        g = dgl.from_scipy(self.adj[idx])
        g.ndata['feat'] = torch.FloatTensor(self.features[idx])
#         g.edata['feat'] = torch.tensor(self.edge_features[idx])
        
        
        return g, self.labels[idx]

In [27]:
import torch
from torch.utils.data import Dataset

class DGLGraphDatasetTEST(Dataset):
    def __init__(self, adj, features, edge_features):
        self.adj = adj
        self.features = features
        self.edge_features = edge_features

    def __len__(self):
        return len(self.adj)

    def __getitem__(self, idx):
        g = dgl.from_scipy(self.adj[idx])
        g.ndata['feat'] = torch.FloatTensor(self.features[idx])
#         g.edata['feat'] = torch.tensor(self.edge_features[idx])
        
        
        return g

In [9]:
import pickle

with open('data/bert_embeddings/train/embeddings.pkl', 'rb') as f:
    train_embeddings = pickle.load(f)

with open('data/bert_embeddings/test/embeddings.pkl', 'rb') as f:
    test_embeddings = pickle.load(f)

In [10]:
train_embeddings['Amino Acids'][0].shape

for i in range(len(train_embeddings['Amino Acids'])):
    train_embeddings['Amino Acids'][i] = train_embeddings['Amino Acids'][i][1:-1]

print(train_embeddings['Amino Acids'][0].shape)
features_train[0].shape

(185, 1024)


(185, 86)

In [28]:
for i in range(len(test_embeddings['Amino Acids'])):
    test_embeddings['Amino Acids'][i] = test_embeddings['Amino Acids'][i][1:-1]

print(test_embeddings['Amino Acids'][0].shape)
features_test[0].shape

(327, 1024)


(327, 86)

In [12]:
dataset = DGLGraphDataset(adj_train, train_embeddings['Amino Acids'], edge_features_train, y_train)

In [30]:
dataset_test = DGLGraphDatasetTEST(adj_test, test_embeddings['Amino Acids'], edge_features_test)

In [13]:
dataset[0][0].ndata['feat']

tensor([[ 0.0020,  0.0599,  0.0363,  ..., -0.0423, -0.0722, -0.0852],
        [ 0.0264,  0.0454, -0.0312,  ...,  0.0018, -0.0215,  0.0100],
        [-0.0072, -0.0445, -0.1033,  ..., -0.0503,  0.0643, -0.1209],
        ...,
        [ 0.1269,  0.2370,  0.0141,  ...,  0.0843, -0.0104, -0.0799],
        [ 0.1819, -0.0697,  0.1489,  ..., -0.0372, -0.1432, -0.0091],
        [ 0.1557,  0.0675,  0.0276,  ...,  0.0615, -0.0514, -0.0923]])

In [33]:
from dgl.dataloading import GraphDataLoader

train_loader = GraphDataLoader(
        dataset, batch_size=16, shuffle=True
    )

test_loader = GraphDataLoader(dataset_test, batch_size=1, shuffle=False)

In [15]:
data = next(iter(train_loader))

In [16]:
model = HGPSLModel(1024, 18, 128)

In [17]:
batch_graphs, batch_labels = data

In [18]:
model(batch_graphs, batch_graphs.ndata["feat"]).shape

torch.Size([16, 18])

In [22]:
import torch.nn.functional as F

def train(model: torch.nn.Module, optimizer, trainloader, device):
    model.train()
    total_loss = 0.0
    num_batches = len(trainloader)
    correct = 0.0 
    num_graphs = 0
    for batch in tqdm(trainloader):
        optimizer.zero_grad()
        batch_graphs, batch_labels = batch
        batch_graphs = batch_graphs.to(device)
        batch_labels = batch_labels.long().to(device)
        num_graphs += batch_labels.size(0)
        out = model(batch_graphs, batch_graphs.ndata["feat"])
        pred = out.argmax(dim=1)
        correct += pred.eq(batch_labels).sum().item()
        loss = F.nll_loss(out, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / num_batches, correct / num_graphs

In [23]:
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')

model = HGPSLModel(1024, 18, 128).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-3)

trainloader = GraphDataLoader(
        dataset, batch_size=64, shuffle=True
    )

In [25]:
epochs = 15

for epoch in tqdm(range(epochs)):
    train_loss, train_acc = train(model, optimizer, trainloader, device)
    print(train_loss)
    print(train_acc)
    
    
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm(range(epochs)):


  0%|          | 0/15 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm(trainloader):


  0%|          | 0/77 [00:00<?, ?it/s]

1.755640612020121
0.46522094926350244


  0%|          | 0/77 [00:00<?, ?it/s]

1.473575674094163
0.5589198036006546


  0%|          | 0/77 [00:00<?, ?it/s]

1.2946888425133445
0.6045417348608838


  0%|          | 0/77 [00:00<?, ?it/s]

1.2140280883033554
0.6231587561374795


  0%|          | 0/77 [00:00<?, ?it/s]

1.1190574912281779
0.6565057283142389


  0%|          | 0/77 [00:00<?, ?it/s]

1.057567997412248
0.6773731587561375


  0%|          | 0/77 [00:00<?, ?it/s]

1.001125486640187
0.698240589198036


  0%|          | 0/77 [00:00<?, ?it/s]

0.931471756526402
0.726063829787234


  0%|          | 0/77 [00:00<?, ?it/s]

0.8757626437521601
0.7407937806873978


  0%|          | 0/77 [00:00<?, ?it/s]

0.8087593998227801
0.7592062193126022


  0%|          | 0/77 [00:00<?, ?it/s]

0.7516447777871962
0.7741407528641571


  0%|          | 0/77 [00:00<?, ?it/s]

0.7021773360766373
0.7900981996726678


  0%|          | 0/77 [00:00<?, ?it/s]

0.6623924989978989
0.8019639934533551


  0%|          | 0/77 [00:00<?, ?it/s]

0.5994297714976521
0.818126022913257


  0%|          | 0/77 [00:00<?, ?it/s]

0.5344832466020213
0.8389934533551555


In [26]:
import os
os.makedirs('models/', exist_ok=True)
torch.save(model.state_dict(), 'models/modele_roman_Bert.pt')

In [37]:
model.eval()
preds = list()
with torch.no_grad():
    for batch in test_loader:
        x = batch.to(device)
        out = model(x, x.ndata["feat"])
        pred = out.detach().cpu().numpy()
        pred = np.exp(pred)
        preds.append(pred)

print(preds)

[array([[2.8259547e-07, 5.5858637e-03, 8.7067628e-01, 3.6042093e-06,
        1.9556261e-04, 3.1760159e-05, 3.2611957e-03, 2.1864630e-03,
        8.5365667e-07, 4.7994615e-04, 3.5478166e-04, 8.7093416e-04,
        2.9274472e-04, 5.5370876e-04, 3.5397559e-02, 6.7844398e-02,
        1.2252126e-02, 1.2020781e-05]], dtype=float32), array([[2.39677115e-06, 2.04293374e-02, 7.08234191e-01, 1.84199944e-05,
        2.89046526e-04, 2.49283010e-04, 6.29724702e-03, 8.81046709e-03,
        4.45219621e-06, 1.44341227e-03, 8.53549282e-04, 2.99217226e-03,
        1.72306958e-03, 1.12412311e-03, 1.00418895e-01, 1.25684187e-01,
        2.13382151e-02, 8.74926263e-05]], dtype=float32), array([[3.2818811e-05, 9.2584278e-06, 3.4972795e-06, 5.6728441e-01,
        4.5032662e-05, 6.9534685e-09, 3.2817751e-01, 9.1451021e-08,
        5.4241107e-03, 3.0771103e-03, 1.4342956e-04, 4.8284001e-06,
        2.2822052e-05, 2.2576507e-02, 7.2950110e-02, 2.9604598e-05,
        4.7751637e-05, 1.7109394e-04]], dtype=float32

In [44]:
y_pred_proba = np.array([list(pred[0]) for pred in preds])

y_pred_proba.shape

(1223, 18)

In [45]:
os.makedirs('submissions/', exist_ok=True)
with open('submissions/modele_roman_Bert.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = list()
    for i in range(18):
        lst.append('class'+str(i))
    lst.insert(0, "name")
    writer.writerow(lst)
    for i, protein in enumerate(proteins_test):
        lst = y_pred_proba[i,:].tolist()
        lst.insert(0, protein)
        writer.writerow(lst)