# Download and unpack data

In [1]:
# Fetch the java-small code2vec dataset and unpack it under ./data.
# The cell is safe to re-run: -p tolerates an existing directory, -nc skips the
# download when the archive is already present, and -C extracts straight into
# data/ (yielding data/java-small) so no separate move step is needed.
!mkdir -p data
!wget -nc https://s3.amazonaws.com/code2vec/data/java-small_data.tar.gz
!tar -xvzf java-small_data.tar.gz -C data

--2020-06-03 00:34:06--  https://s3.amazonaws.com/code2vec/data/java-small_data.tar.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.110.245
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.110.245|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 382731969 (365M) [application/x-tar]
Saving to: ‘java-small_data.tar.gz’


2020-06-03 00:34:20 (26.6 MB/s) - ‘java-small_data.tar.gz’ saved [382731969/382731969]

java-small/
java-small/java-small.dict.c2v
java-small/java-small.train.c2v
java-small/java-small.val.c2v
java-small/java-small.test.c2v


# Clone the project code from GitHub

In [2]:
# Clone only the `dataload` branch of the project repository.
# The directory test skips the clone when a checkout already exists, so re-running
# the cell does not abort with "destination path already exists".
!test -d DL_project_skoltech || git clone --single-branch --branch dataload https://github.com/Leggerla/DL_project_skoltech

Cloning into 'DL_project_skoltech'...
remote: Enumerating objects: 61, done.[K
remote: Counting objects: 100% (61/61), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 258 (delta 33), reused 2 (delta 0), pack-reused 197[K
Receiving objects: 100% (258/258), 52.61 KiB | 10.52 MiB/s, done.
Resolving deltas: 100% (145/145), done.


In [3]:
# Switch into the cloned repository: later cells import the project's modules by
# bare name and refer to the dataset as ../data/...
# The explicit %cd magic is used so the cell does not depend on IPython's automagic
# setting or on no variable named `cd` being defined.
%cd DL_project_skoltech/

/content/DL_project_skoltech


# Import our modules and torch


In [0]:
import create_vocab
import data_to_tensors
import model_implementation
from train import *

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import * 

# The code2vec data is organised around 3 main dictionaries:
* a dictionary of all words found in the leaves (terminal tokens) of the syntax tree
* a dictionary of all paths (they are already hashed and converted to numbers)
* a dictionary of targets (method names)


In [0]:
# Location of the dataset's dictionary file, relative to the cloned repository directory.
dict_path = '../data/java-small/java-small.dict.c2v'

# create_vocab returns four lookup tables, unpacked in the order it returns them.
# idx2target is presumably the inverse of target2idx - confirm in create_vocab.py.
(
    word2idx,
    path2idx,
    target2idx,
    idx2target,
) = create_vocab.create_vocab(dict_path)

* 1) `data_iterator` returns batches of 100 items each (the size can be changed when calling the function). Note: the cell below builds `DataLoader`s over `TextDataset` objects rather than calling `data_iterator`.
* 2) The cell below shows how the data is accessed; three vectors are then fed to the model (sorry for the names, I think the model's arguments make it clear what is what).
* 3) At the output we have `code_vector` and a second vector: a probability distribution over all labels in `target_vocab` (the dictionary of method names).

In [0]:
# Mini-batch size shared by the training and validation loaders.
BATCH_SIZE = 1024

# Dataset splits, relative to the cloned repository directory.
path_for_train = '../data/java-small/java-small.train.c2v'
path_for_val = '../data/java-small/java-small.val.c2v'


def _load_split(split_path):
    """Build a TextDataset for one .c2v split file using the shared vocabularies.

    Reads the notebook-level word2idx, path2idx and target2idx tables, so both
    splits are built with the same lookup tables.
    """
    return data_to_tensors.TextDataset(split_path, word2idx, path2idx, target2idx)


train_dataset = _load_split(path_for_train)
val_dataset = _load_split(path_for_val)

# Only the training data is shuffled; validation keeps the file order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Train

In [13]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import random 

# Fix the random number generators used in this notebook so runs can be reproduced.
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)
# Seed every visible GPU, not just the current one; silently ignored without CUDA.
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and turn off its autotuner, which may
# otherwise pick different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the first GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda:0


In [0]:
# Size the model from the vocabularies: one entry per known word, per known path,
# and per target label.
# NOTE(review): the model is not moved to DEVICE here - presumably train() handles
# device placement; confirm in train.py.
model = model_implementation.code2vec_model(
    values_vocab_size=len(word2idx),
    paths_vocab_size=len(path2idx),
    labels_num=len(target2idx),
)

In [0]:
# Training hyper-parameters.
N_EPOCHS = 1000
LR = 1e-4

optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss().to(DEVICE)

# Set this to True to train on a single batch only.
early_stop = False

# train() returns eight values, unpacked here in the order it returns them
# (named as train/validation loss, precision, recall and F1 lists).
(
    list_train_loss,
    list_val_loss,
    list_train_precision,
    list_val_precision,
    list_train_recall,
    list_val_recall,
    list_train_f1,
    list_val_f1,
) = train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=N_EPOCHS,
    idx2target_vocab=idx2target,
    scheduler=None,
    checkpoint=True,
    early_stop=early_stop,
)

1th epoch processed in 214.070
Epoch 1: train loss - 11.20019, validation loss - 12.31062
	 precision - 0.00411, recall - 0.0034, f1_score - 0.00372
----------------------------------------------------------------------
2th epoch processed in 209.883
Epoch 2: train loss - 10.2799, validation loss - 12.7469
	 precision - 0.02662, recall - 0.02576, f1_score - 0.02618
----------------------------------------------------------------------
3th epoch processed in 209.701
Epoch 3: train loss - 9.99948, validation loss - 13.00541
	 precision - 0.03739, recall - 0.04031, f1_score - 0.0388
----------------------------------------------------------------------
4th epoch processed in 210.010
Epoch 4: train loss - 9.74964, validation loss - 13.19532
	 precision - 0.0355, recall - 0.04171, f1_score - 0.03836
----------------------------------------------------------------------
5th epoch processed in 209.207
Epoch 5: train loss - 9.52889, validation loss - 13.3776
	 precision - 0.03555, recall - 0.0