In [1]:
# memory footprint support libraries/code
# Symlink nvidia-smi from /opt/bin into /usr/bin
# (presumably so GPUtil can locate the binary on PATH -- TODO confirm).
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
# Install the third-party packages imported below:
# GPUtil (GPU statistics), psutil (host/process memory), humanize (readable byte sizes).
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
# Snapshot of the GPUs visible to GPUtil at the moment this cell runs.
GPUs = GPU.getGPUs()
# Colab exposes at most one GPU and a GPU runtime is not guaranteed, so fall
# back to None instead of raising IndexError when no GPU is attached.
gpu = GPUs[0] if GPUs else None


def printm():
    """Print the current memory footprint of the host, this process and the GPU.

    The first line reports the available system RAM and the resident set size
    of the running Python process. The second line reports free/used/total
    memory and memory utilisation of the first GPU, or a short notice when no
    GPU is detected.

    GPUtil's GPU objects hold the values read when ``getGPUs()`` was called,
    so the GPU list is queried again on every call; reusing the module-level
    ``gpu`` would keep printing the numbers captured when the cell first ran.
    """
    process = psutil.Process(os.getpid())
    ram_part = "Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available)
    proc_part = " | Proc size: " + humanize.naturalsize(process.memory_info().rss)
    print(ram_part, proc_part)

    current_gpus = GPU.getGPUs()
    if not current_gpus:
        print("GPU RAM Free: n/a (no GPU detected)")
        return
    current = current_gpus[0]
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(
        current.memoryFree, current.memoryUsed, current.memoryUtil*100, current.memoryTotal))


printm()

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-cp36-none-any.whl size=7413 sha256=7925da01c6d2a5c3a4a4bbf0958d67a1720ab6cebb290a401e60985aff31fe45
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Gen RAM Free: 12.7 GB  | Proc size: 159.7 MB
GPU RAM Free: 15079MB | Used: 0MB | Util   0% | Total 15079MB


# Clone the project from GitHub

In [2]:
# Clone the `edited` branch of the project repository into ./DL_project_skoltech.
!git clone --branch edited https://github.com/Vitaly-Protasov/DL_project_skoltech

Cloning into 'DL_project_skoltech'...
remote: Enumerating objects: 135, done.[K
remote: Counting objects: 100% (135/135), done.[K
remote: Compressing objects: 100% (113/113), done.[K
remote: Total 332 (delta 77), reused 52 (delta 22), pack-reused 197[K
Receiving objects: 100% (332/332), 77.23 KiB | 192.00 KiB/s, done.
Resolving deltas: 100% (189/189), done.


In [3]:
cd DL_project_skoltech/

/content/DL_project_skoltech


# Download data

In [4]:
# Run the repository's download script. Per the logged output it fetches
# java-small_data.tar.gz and extracts the java-small .c2v dict/train/val/test files.
!bash download_data.sh

--2020-06-03 21:03:59--  https://s3.amazonaws.com/code2vec/data/java-small_data.tar.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.94.181
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.94.181|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 382731969 (365M) [application/x-tar]
Saving to: ‘java-small_data.tar.gz’


2020-06-03 21:04:28 (13.1 MB/s) - ‘java-small_data.tar.gz’ saved [382731969/382731969]

java-small/
java-small/java-small.dict.c2v
java-small/java-small.train.c2v
java-small/java-small.val.c2v
java-small/java-small.test.c2v


# Import our modules and torch


In [0]:
import create_vocab
import data_to_tensors
import model_implementation
from train import *

import torch
import torch.nn as nn
import torch.nn.functional as F
import random 

from torch.utils.data import DataLoader

# The dataset is organised around three main dictionaries:
* a dictionary of all words (the tokens found in the AST leaves)
* a dictionary of all paths (already hashed and converted to integers)
* a dictionary of targets (method names)


In [0]:
# Dictionary file of the java-small dataset.
dict_path = 'data/java-small/java-small.dict.c2v'

# Build the lookup tables from the dictionary file: word, path and target
# vocabularies, plus the fourth mapping returned as idx2target.
(
    word2idx,
    path2idx,
    target2idx,
    idx2target,
) = create_vocab.create_vocab(dict_path)

* 1) `data_iterator` returns batches of 100 examples each (the batch size can be changed when calling the function).
* 2) The code below shows how the data is accessed; three vectors are then fed to the model (sorry for the names — the model's arguments should make it clear what each of them is).
* 3) The model produces two outputs: `code_vector`, and a second vector that is a probability distribution over all labels in `target_vocab` (the dictionary of method names).

In [0]:
# Locations of the three dataset splits.
path_for_train = 'data/java-small/java-small.train.c2v'
path_for_val = 'data/java-small/java-small.val.c2v'
path_for_test = 'data/java-small/java-small.test.c2v'

# Build one TextDataset per split (train, val, test -- in that order),
# all sharing the same vocabularies.
train_dataset, val_dataset, test_dataset = (
    data_to_tensors.TextDataset(split_path, word2idx, path2idx, target2idx)
    for split_path in (path_for_train, path_for_val, path_for_test)
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=1024)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=512)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=512)

# Train

In [0]:
# Seed every random number generator used in this notebook so that runs are repeatable.
SEED = 1337
random.seed(SEED)                 # Python's built-in RNG
torch.manual_seed(SEED)           # PyTorch RNG
torch.cuda.manual_seed_all(SEED)  # every CUDA device, not only the current one; ignored without CUDA

# cuDNN: request deterministic kernels and switch off the autotuner, which may
# otherwise select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [0]:
# Instantiate the code2vec model, sizing it from the three vocabularies.
model = model_implementation.code2vec_model(
    values_vocab_size=len(word2idx),
    paths_vocab_size=len(path2idx),
    labels_num=len(target2idx),
)

In [0]:
# Training hyper-parameters.
N_EPOCHS = 40
LR = 3e-3

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=LR,
    weight_decay=1.2e-5,
)
criterion = nn.CrossEntropyLoss()

# Set to True to run training on a single batch only.
early_stop = False

# Run training; train() returns eight per-metric lists in the order unpacked below.
(
    list_train_loss,
    list_val_loss,
    list_train_precision,
    list_val_precision,
    list_train_recall,
    list_val_recall,
    list_train_f1,
    list_val_f1,
) = train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=N_EPOCHS,
    idx2target_vocab=idx2target,
    scheduler=None,
    checkpoint=True,
    early_stop=early_stop,
)


Epoch 1: train loss - 9.40172, validation loss - 11.09501
	 Validation: precision - 0.06527, recall - 0.10652, f1_score - 0.08094
	 Test: precision - 0.0768, recall - 0.11506, f1_score - 0.09212
Elapsed time: 288.962
----------------------------------------------------------------------
Epoch 2: train loss - 7.82712, validation loss - 10.95321
	 Validation: precision - 0.09305, recall - 0.13375, f1_score - 0.10975
	 Test: precision - 0.1032, recall - 0.15269, f1_score - 0.12316
Elapsed time: 286.182
----------------------------------------------------------------------
Epoch 3: train loss - 6.85154, validation loss - 10.85459
	 Validation: precision - 0.10101, recall - 0.14914, f1_score - 0.12044
	 Test: precision - 0.11474, recall - 0.16783, f1_score - 0.1363
Elapsed time: 285.944
----------------------------------------------------------------------
Epoch 4: train loss - 6.12865, validation loss - 10.87316
	 Validation: precision - 0.12971, recall - 0.18171, f1_score - 0.15137
	 Test