# **Imports**

In [None]:
!pip install tokenizers
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 31.0 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.12.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 13.0 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 52.0 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 60.1 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-p

In [None]:
import os
import gzip
import json
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import plotly.express as ex
from pathlib import Path
from sklearn.model_selection import train_test_split
from tokenizers.implementations.byte_level_bpe import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from datasets import load_dataset
from torch.utils.data import Dataset,DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import RobertaForSequenceClassification
from transformers.data.metrics import simple_accuracy
from sklearn.metrics import f1_score

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [None]:
# Mount Google Drive at /content/gdrive; every dataset, tokenizer and model path
# used in this notebook lives under /content/gdrive/MyDrive/MLA Project.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# **Data Setting**

### Data Collection

**Constants**

In [None]:
# Number of files taken per language by the extraction cell (passed to dataset.take).
Size = 30000

In [None]:
# Class-label map: language name -> integer class id.
# NOTE: the "Removing Comments" section rebinds LANGUAGES to a comment-delimiter
# table and the "Training Script" section binds it to this map again, so the
# meaning of LANGUAGES depends on which cell ran last.
LANGUAGES = {
    "GO": 0,
    "Java": 1,
    "JavaScript": 2,
    "PHP": 3,
    "Python": 4,
    "Ruby": 5,
    "HTML": 6,
    "SQL": 7,
    "C": 8,
    "Rust": 9,
}

In [None]:
# Language names requested one at a time from the dataset in the extraction cell.
Languages = ['GO','Java','JavaScript','PHP','Python','Ruby','HTML','SQL','C','Rust']

**Data Extraction**

In [None]:
# Stream the first `Size` files of every language from codeparrot/github-code,
# keep only the remaining columns after dropping repo metadata, and persist the
# combined frame to Drive.
language_frames = []
for l in Languages:
  dataset = load_dataset("codeparrot/github-code", streaming=True, split='train', languages = [l])
  dataset = dataset.remove_columns(['repo_name', 'path', 'license', 'size'])
  dataset = dataset.take(Size)
  language_frames.append(pd.DataFrame(dataset))
# One concat at the end instead of DataFrame.append per language: append copied
# the whole accumulated frame on every call and no longer exists in pandas 2.x.
# ignore_index is deliberately NOT set, so each language keeps its own 0..Size-1
# index exactly as before; the comment-removal cell looks rows up by those labels.
DataFrame = pd.concat(language_frames)
DataFrame.to_parquet('/content/gdrive/MyDrive/MLA Project/10Languages.parquet')



**Data** **Visualization**

In [None]:
# Count the files per language and plot each language's share.
dataset_count = DataFrame.groupby('language').count()
# The chart previously received only the index of language names, with no
# `values`, so slice sizes did not reflect the counts. Pass the counted frame
# and use the per-language file count ('code') as the slice size.
ex.pie(dataset_count.reset_index(),names='language',values='code',title='Proportion Of Different Languages',hole=0.33)

### Reading Data

In [None]:
# Load the balanced dataset. The file at this path is written by the last cell of
# the "Balancing Data" section further down, so it must already exist on Drive.
DataFrame = pd.read_parquet('/content/gdrive/MyDrive/MLA Project/10LanguagesbalancedData.parquet')

In [None]:
# Pull the source text and its language name out as two parallel NumPy arrays.
Data, Label = (DataFrame[column].to_numpy() for column in ('code', 'language'))

### Splitting Data

In [None]:
# First cut: 70 % train / 30 % hold-out. Second cut: the hold-out is divided 1:2,
# giving 10 % validation and 20 % test overall. Both cuts are stratified by
# language and use a fixed seed.
train_data, holdout_data, train_labels, holdout_labels = train_test_split(
    Data, Label, test_size=0.3, random_state=0, stratify=Label)
validation_data, test_data, validation_labels, test_labels = train_test_split(
    holdout_data, holdout_labels, test_size=(2/3), random_state=0, stratify=holdout_labels)

In [None]:
# Report the number of samples in the train, validation and test splits (in that order).
for split in (train_data, validation_data, test_data):
  print(len(split))

196154
28022
56044


### Saving Data

In [None]:
# Persist every split to Drive, one .npy file per array, named after the split.
split_arrays = {
    'trainData': train_data,
    'trainLabels': train_labels,
    'validationData': validation_data,
    'validationLabels': validation_labels,
    'testData': test_data,
    'testLabels': test_labels,
}
for stem, array in split_arrays.items():
  np.save(f'/content/gdrive/MyDrive/MLA Project/10LanguagesBalancedData/{stem}.npy', array)

# **Removing** **Comments**

### Script

In [None]:
# Reload the raw (comments still present) dataset written by the extraction cell.
DataFrame = pd.read_parquet('/content/gdrive/MyDrive/MLA Project/10Languages.parquet')

In [None]:
# Comment delimiters per language, stored as a flat list of (start, end) pairs:
# [start0, end0, start1, end1, ...]. RemoveComments reads the list two items at a
# time; '\n' as an end marker denotes a comment that runs to the end of the line.
# NOTE: this rebinds LANGUAGES, which earlier held the language -> class-id map.
# NOTE(review): the Lua block terminator is listed as '--]]' although a plain ']]'
# also closes a Lua block comment - confirm whether that is intended.
LANGUAGES = {
    "Assembly": [';','\n'],
    "Batchfile": ['::','\n','REM','\n'],
    "C": ['/*','*/','//','\n'],
    "C#": ['/*','*/','//','\n'],
    "C++": ['/*','*/','//','\n'],
    "CMake": ['#[[',']]','#','\n'],            
    "CSS": ['/*','*/'],
    "Dockerfile": ['#','\n'],
    "FORTRAN": ['!','\n'],
    "GO": ['/*','*/','//','\n'],
    "Haskell": ['--','\n','{-','-}'],
    "HTML": ['<!--', '-->'],
    "Java": ['/*','*/','//','\n'],
    "JavaScript": ['/*','*/','//','\n'],
    "Julia": ['#','\n','#=','=#'],
    "Lua": ['--[[','--]]','--','\n'], 
    "Markdown": ['[',']: #','[//]: # (',')','[//]: # "','"','[]: # (',')','[]: # "','"','<!--','-->'],
    "Makefile": ['#','\n'],
    "PHP": ['/*','*/','//','\n'],
    "Perl": ['#','\n','=begin','=end'],
    "PowerShell": ['#','\n','<#','#>'],
    "Python": ['#','\n'],
    "Ruby": ['#','\n','=begin','=end'],
    "Rust": ['/*','*/','//','\n'],
    "SQL": ['/*','*/','--','\n'],
    "Scala": ['/*','*/','//','\n'],
    "Shell": [": '", " '"],
    "TypeScript": ['//','\n','/*','*/'],
    "TeX": ['%','\n'],
    "Visual Basic": ["'","\n"]
}

In [None]:
def removeSpecialComments(code, specialChars, shell=False):
  """Drop every line whose first character marks it as a comment.

  Args:
    code: Source text; it is split with str.splitlines().
    specialChars: Collection of single characters that start a comment line.
    shell: When True, leading/trailing whitespace is ignored while looking for
      the first character (an indented marker still counts). When False only
      the literal first character of the line is inspected.

  Returns:
    The remaining lines joined with '\n'. Empty lines (and, with shell=True,
    whitespace-only lines) are always kept.
  """
  kept = []
  for line in code.splitlines():
    probe = line.strip() if shell else line
    # Filter in a single pass. The previous version collected comment lines and
    # then called list.remove() for each one, which rescans the list every time.
    if probe and probe[0] in specialChars:
      continue
    kept.append(line)
  return "\n".join(kept)

In [None]:
def _stripDelimited(code, start, end):
  """Remove every non-overlapping `start ... end` span from `code`, scanning left to right."""
  if not start:
    return code
  pieces = []
  cursor = 0
  while True:
    begin = code.find(start, cursor)
    if begin == -1:
      break
    # Look for the terminator right after the opening marker so that empty
    # comments such as '/**/' are recognised.
    finish = code.find(end, begin + len(start))
    if finish == -1:
      if end == '\n':
        # A line comment on the last line has no trailing newline: cut to EOF.
        pieces.append(code[cursor:begin])
        cursor = len(code)
      # An unterminated block comment is left untouched.
      break
    pieces.append(code[cursor:begin])
    # The newline that ends a line comment belongs to the code, not to the
    # comment; keeping it stops the next line from being glued onto this one.
    cursor = finish if end == '\n' else finish + len(end)
  pieces.append(code[cursor:])
  return ''.join(pieces)


def RemoveComments(code, language):
  """Strip comments from `code` using the delimiter table in LANGUAGES.

  LANGUAGES[language] is read as consecutive (start, end) pairs; each pair is
  removed from the text in turn, in table order. FORTRAN and Shell additionally
  get their whole-line comments removed through removeSpecialComments.

  Fixes over the previous implementation:
    * the deleted range no longer extends len(end) characters past the comment
      (the end offset used to be added twice, eating real code);
    * markers that appear inside an already-open comment no longer create
      overlapping deletions;
    * empty comments and comments whose last character equals the start marker
      are terminated correctly;
    * line comments keep their newline and are also removed on the last line.

  Limitation: the scan is purely textual, so a marker inside a string literal
  (for example '//' in a URL) is still treated as the start of a comment.

  Args:
    code: Source text of one file.
    language: Key into LANGUAGES.

  Returns:
    The source text without the recognised comments.

  Raises:
    KeyError: if `language` is not a key of LANGUAGES.
  """
  identifier = LANGUAGES[language]
  for k in range(0, len(identifier), 2):
    code = _stripDelimited(code, identifier[k], identifier[k+1])
  if language == "FORTRAN":
    code = removeSpecialComments(code, ['*', 'C', 'c', 'd', 'D'])
  elif language == 'Shell':
    code = removeSpecialComments(code, ['#'], True)
  return code

In [None]:
# Strip comments from every file, language by language (in order of first
# appearance), then persist the result.
languages=DataFrame.language.unique()
records = []
for language in languages:
  # One boolean mask per language instead of one per row, and no reliance on
  # every language having exactly len(DataFrame)/len(languages) rows.
  for source in DataFrame.loc[DataFrame['language'] == language, 'code']:
    records.append({'code': RemoveComments(source, language), 'language': language})
# Build the frame once from the collected rows; DataFrame.append per row copied
# the whole frame each time and no longer exists in pandas 2.x.
new_df = pd.DataFrame(records, columns=['code', 'language'])

new_df.to_parquet('/content/gdrive/MyDrive/MLA Project/10LanguagesWithoutComments.parquet')

### Balancing Data

In [None]:
# Reload the comment-free dataset written by the comment-removal cell above.
DataFrame = pd.read_parquet('/content/gdrive/MyDrive/MLA Project/10LanguagesWithoutComments.parquet')

In [None]:
# Number of files per language before empty files are dropped.
DataFrame.groupby('language').count()

Unnamed: 0_level_0,code
language,Unnamed: 1_level_1
C,30000
GO,30000
HTML,30000
Java,30000
JavaScript,30000
PHP,30000
Python,30000
Ruby,30000
Rust,30000
SQL,30000


In [None]:
# Drop files that became empty once their comments were removed.
languages=DataFrame.language.unique()
# A single boolean mask keeps the original row order and each row's own language
# label. The previous loop appended row by row (quadratic, and DataFrame.append
# no longer exists in pandas 2.x) and derived the label from the row position.
new_df = DataFrame.loc[DataFrame['code'].str.len() != 0, ['code', 'language']].reset_index(drop=True)

In [None]:
# Number of files per language after empty files were dropped.
new_df.groupby('language').count()

Unnamed: 0_level_0,code
language,Unnamed: 1_level_1
C,29497
GO,29686
HTML,29989
Java,29928
JavaScript,29781
PHP,29999
Python,28931
Ruby,29440
Rust,28022
SQL,28919


In [None]:
# Persist the dataset without empty files.
new_df.to_parquet('/content/gdrive/MyDrive/MLA Project/10LanguagesWithoutEmptyFiles.parquet')

In [None]:
# Reload the dataset without empty files (same path as the save cell above).
new_df = pd.read_parquet('/content/gdrive/MyDrive/MLA Project/10LanguagesWithoutEmptyFiles.parquet')

In [None]:
# Total number of non-empty files across all languages.
len(new_df)

294192

In [None]:
# Files available per language, keyed by language name. Computed from new_df so
# it cannot drift from the data; the values used to be copied by hand from the
# groupby output shown above.
files_count_map = new_df.groupby('language').size().to_dict()

In [None]:
# Balance the dataset: keep the same number of files for every language, namely
# the size of the smallest language, taking the first rows of each language.
languages=new_df.language.unique()
per_language = new_df.groupby('language', sort=False)
# Derived from the data instead of the previously hard-coded 28022.
min_count = int(per_language.size().min())

# Making new df with balanced data
# GroupBy.head keeps the original row order, so the languages stay in their
# existing order. This replaces a row-by-row DataFrame.append loop (quadratic,
# and no longer available in pandas 2.x) that also relied on every language
# occupying one contiguous block of rows.
final_df = per_language.head(min_count)[['code', 'language']].reset_index(drop=True)

In [None]:
# Number of files per language after balancing.
final_df.groupby('language').count()

Unnamed: 0_level_0,code
language,Unnamed: 1_level_1
C,28022
GO,28022
HTML,28022
Java,28022
JavaScript,28022
PHP,28022
Python,28022
Ruby,28022
Rust,28022
SQL,28022


In [None]:
# Count the files per language in the balanced set and plot each language's share.
dataset_count = final_df.groupby('language').count()
# The chart previously received only the index of language names, with no
# `values`, so slice sizes did not reflect the counts. Pass the counted frame
# and use the per-language file count ('code') as the slice size.
ex.pie(dataset_count.reset_index(),names='language',values='code',title='Proportion Of Different Languages',hole=0.33)

In [None]:
# Persist the balanced dataset; the "Reading Data" cell near the top loads this file.
final_df.to_parquet('/content/gdrive/MyDrive/MLA Project/10LanguagesbalancedData.parquet')

# **Tokenizer**

### Load data

In [None]:
# Reload the saved text splits and their labels. allow_pickle=True is passed for
# every file, exactly as when they were first loaded one statement at a time.
(trainData, trainLabels,
 testData, testLabels,
 validationData, validationLabels) = (
    np.load(f'/content/gdrive/MyDrive/MLA Project/10LanguagesBalancedData/{stem}.npy', allow_pickle=True)
    for stem in ('trainData', 'trainLabels', 'testData', 'testLabels', 'validationData', 'validationLabels')
)

KeyboardInterrupt: ignored

### Training

In [None]:
# Write the training snippets to numbered text files (1.txt, 2.txt, ...) holding
# at most 19600 snippets each; these files are the tokenizer's training corpus.
chunk_size = 19600
for file_count, begin in enumerate(tqdm(range(0, trainData.size, chunk_size)), start=1):
  text = trainData[begin:begin + chunk_size]
  # Explicit UTF-8 so that non-ASCII source code is written the same way on
  # every platform instead of depending on the locale's default encoding.
  with open(f'/content/gdrive/MyDrive/MLA Project/dump/{file_count}.txt', 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(text))
# Slicing by chunk also avoids writing an empty extra file when the number of
# snippets is an exact multiple of chunk_size.

100%|██████████| 196154/196154 [00:09<00:00, 21106.70it/s]


In [None]:
from pathlib import Path
# Collect every .txt file below the dump folder (recursive search) as plain
# string paths, which is the form passed to tokenizer.train.
dump_dir = Path('/content/gdrive/MyDrive/MLA Project/dump')
paths = list(map(str, dump_dir.rglob('*.txt')))

In [None]:
# Show which dump files will be used to train the tokenizer.
print(paths)

['/content/gdrive/MyDrive/MLA Project/dump/1.txt', '/content/gdrive/MyDrive/MLA Project/dump/2.txt', '/content/gdrive/MyDrive/MLA Project/dump/3.txt', '/content/gdrive/MyDrive/MLA Project/dump/4.txt', '/content/gdrive/MyDrive/MLA Project/dump/5.txt', '/content/gdrive/MyDrive/MLA Project/dump/6.txt', '/content/gdrive/MyDrive/MLA Project/dump/7.txt', '/content/gdrive/MyDrive/MLA Project/dump/8.txt', '/content/gdrive/MyDrive/MLA Project/dump/9.txt', '/content/gdrive/MyDrive/MLA Project/dump/10.txt', '/content/gdrive/MyDrive/MLA Project/dump/11.txt']


In [None]:
from tokenizers import ByteLevelBPETokenizer
# Train a new byte-level BPE tokenizer on the dumped training files.
# NOTE(review): presumably the special tokens receive ids in list order
# (<|endoftext|> first) - confirm before relying on specific ids downstream.
special_tokens = ['<|endoftext|>', '<s>', '<pad>', '</s>', '<unk>', '<mask>']
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=paths,
    vocab_size=50000,
    min_frequency=2,
    special_tokens=special_tokens,
)

In [None]:
# Save vocab.json and merges.txt. The folder name was misspelled
# ('10LangaugesTokenizer'), while the "Tokenizing Data" cell loads the files from
# '10LanguagesTokenizer'; save to the folder that is actually read back, and
# create it first in case it does not exist yet.
tokenizer_dir = '/content/gdrive/MyDrive/MLA Project/10LanguagesTokenizer'
os.makedirs(tokenizer_dir, exist_ok=True)
tokenizer.save_model(tokenizer_dir)

['/content/gdrive/MyDrive/MLA Project/10LangaugesTokenizer/vocab.json',
 '/content/gdrive/MyDrive/MLA Project/10LangaugesTokenizer/merges.txt']

### Tokenizing Data

In [None]:
# Rebuild the trained tokenizer from its saved vocabulary and merge rules.
tokenizer_dir = "/content/gdrive/MyDrive/MLA Project/10LanguagesTokenizer"
tokenizer = ByteLevelBPETokenizer(f"{tokenizer_dir}/vocab.json", f"{tokenizer_dir}/merges.txt")
# Post-processor that wraps every sequence with the <s> / </s> tokens.
sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
cls_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)
# Every encoding is cut or padded to exactly 512 ids.
tokenizer.enable_truncation(max_length=512)
# NOTE(review): no pad token/id is passed here, so the library default is used
# rather than the '<pad>' token this tokenizer was trained with - confirm that
# this is intended.
tokenizer.enable_padding(length=512)

In [None]:
# Tokenize the training split in chunks of 10000 snippets and save each chunk
# to its own numbered file.
chunk_size = 10000
# The chunk count follows from the data. Previously 19 full chunks were
# hard-coded and the remainder was handled after the loop through the leaked
# loop variable, which silently breaks as soon as the split size changes.
for i, begin in enumerate(tqdm(range(0, len(trainData), chunk_size))):
  encoding = tokenizer.encode_batch(trainData[begin:begin + chunk_size])
  en_train_data = [e.ids for e in encoding]
  np.save(f'/content/gdrive/MyDrive/MLA Project/10LanguagesBalancedData/TOKENIZEDtrainData{i}.npy', en_train_data)

100%|██████████| 19/19 [17:38<00:00, 55.72s/it]


In [None]:
# Stitch the 20 tokenized training chunks back together into a single file.
chunks = [
    np.load(f'/content/gdrive/MyDrive/MLA Project/10LanguagesBalancedData/TOKENIZEDtrainData{i}.npy')
    for i in tqdm(range(20))
]
temp = [row for chunk in chunks for row in chunk]
np.save('/content/gdrive/MyDrive/MLA Project/10LanguagesBalancedData/TOKENIZEDtrainData.npy', temp)

100%|██████████| 20/20 [00:04<00:00,  4.54it/s]


In [None]:
# Tokenize the validation split in chunks of 10000 snippets and save each chunk
# to its own numbered file.
chunk_size = 10000
# The chunk count follows from the data. Previously 2 full chunks were
# hard-coded and the remainder was handled after the loop through the leaked
# loop variable, which silently breaks as soon as the split size changes.
for i, begin in enumerate(tqdm(range(0, len(validationData), chunk_size))):
  encoding = tokenizer.encode_batch(validationData[begin:begin + chunk_size])
  en_validation_data = [e.ids for e in encoding]
  np.save(f'/content/gdrive/MyDrive/MLA Project/10LanguagesBalancedData/TOKENIZEDvalidationData{i}.npy', en_validation_data)

100%|██████████| 2/2 [01:57<00:00, 58.97s/it]


In [None]:
# Stitch the 3 tokenized validation chunks back together into a single file.
chunks = [
    np.load(f'/content/gdrive/MyDrive/MLA Project/10LanguagesBalancedData/TOKENIZEDvalidationData{i}.npy')
    for i in tqdm(range(3))
]
temp = [row for chunk in chunks for row in chunk]
np.save('/content/gdrive/MyDrive/MLA Project/10LanguagesBalancedData/TOKENIZEDvalidationData.npy', temp)

100%|██████████| 3/3 [00:00<00:00, 12.95it/s]


In [None]:
# Release the raw validation text and its last tokenized chunk to free memory.
del validationData, en_validation_data

In [None]:
# Tokenize the test split in chunks of 10000 snippets and save each chunk to its
# own numbered file.
chunk_size = 10000
# The chunk count follows from the data. Previously 5 full chunks were
# hard-coded and the remainder was handled after the loop through the leaked
# loop variable, which silently breaks as soon as the split size changes.
for i, begin in enumerate(tqdm(range(0, len(testData), chunk_size))):
  encoding = tokenizer.encode_batch(testData[begin:begin + chunk_size])
  en_test_data = [e.ids for e in encoding]
  np.save(f'/content/gdrive/MyDrive/MLA Project/10LanguagesBalancedData/TOKENIZEDtestData{i}.npy', en_test_data)

100%|██████████| 5/5 [04:49<00:00, 57.92s/it]


In [None]:
# Stitch the 6 tokenized test chunks back together into a single file.
chunks = [
    np.load(f'/content/gdrive/MyDrive/MLA Project/10LanguagesBalancedData/TOKENIZEDtestData{i}.npy')
    for i in tqdm(range(6))
]
temp = [row for chunk in chunks for row in chunk]
np.save('/content/gdrive/MyDrive/MLA Project/10LanguagesBalancedData/TOKENIZEDtestData.npy', temp)

100%|██████████| 6/6 [00:00<00:00, 10.73it/s]


In [None]:
# Release the raw test text and its last tokenized chunk to free memory.
del testData, en_test_data

# **Transformer**

### Loading Data

In [None]:
# Tokenized inputs for the three splits, loaded without pickling.
trainData, validationData, testData = (
    np.load(f'/content/gdrive/MyDrive/MLA Project/10LanguagesBalancedData/TOKENIZED{split}Data.npy')
    for split in ('train', 'validation', 'test')
)
# Language-name labels for the three splits; these are loaded with allow_pickle=True.
trainLabels, testLabels, validationLabels = (
    np.load(f'/content/gdrive/MyDrive/MLA Project/10LanguagesBalancedData/{split}Labels.npy', allow_pickle=True)
    for split in ('train', 'test', 'validation')
)

### Loading Model

In [None]:
# Load the checkpoint that the training loop writes with model.save_pretrained.
# NOTE: the "Model" cell in the training script rebinds `model` to the CodeBERTa
# checkpoint, so on a top-to-bottom run this load is overwritten before training.
model = RobertaForSequenceClassification.from_pretrained('/content/gdrive/MyDrive/MLA Project/10LanguagesModel/model')

## Training Script

**Constants**

In [None]:
# Hub id of the pretrained checkpoint loaded by the "Model" cell.
CODEBERTA_PRETRAINED = "huggingface/CodeBERTa-language-id"
# Not referenced by any other cell visible in this notebook.
EVALUATE = False
# Language name -> class id. This rebinds LANGUAGES back to the label map after
# the comment-removal section used the same name for its delimiter table.
LANGUAGES = {
    "GO": 0,
    "Java": 1,
    "JavaScript": 2,
    "PHP": 3,
    "Python": 4,
    "Ruby": 5,
    "HTML": 6,
    "SQL": 7,
    "C": 8,
    "Rust": 9,
}

**Data Class**

In [None]:
class CodeDataset(Dataset):
    """Dataset that pairs each token-id sequence with the integer id of its language."""

    def __init__(self, data, label, language):
        # data: indexable collection of token-id sequences
        # label: indexable collection of language names, parallel to `data`
        # language: mapping from language name to integer class id
        self.data, self.label, self.language = data, label, language

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return (token ids, class id) for one sample, both as torch tensors."""
        token_ids = torch.tensor(self.data[index])
        class_id = torch.tensor(self.language[self.label[index]])
        return token_ids, class_id

In [None]:
# Wrap each split in a CodeDataset; all three share the same language -> id map.
train_dataset, eval_dataset, test_dataset = (
    CodeDataset(tokens, names, LANGUAGES)
    for tokens, names in (
        (trainData, trainLabels),
        (validationData, validationLabels),
        (testData, testLabels),
    )
)

**Model**

In [None]:
# Load the pretrained CodeBERTa checkpoint with one output label per entry of
# LANGUAGES. This rebinds `model`, replacing whatever the "Loading Model" cell loaded.
model = RobertaForSequenceClassification.from_pretrained(CODEBERTA_PRETRAINED, num_labels=len(LANGUAGES), ignore_mismatched_sizes=True)

**DataLoader**

In [None]:
# Shuffled mini-batches of 128 samples for training.
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)

**Miscellaneous**

In [None]:
# Move the model to the GPU and put it in training mode.
model.to("cuda")
model.train()
# Freeze every parameter of the RoBERTa encoder so that only the parameters
# outside model.roberta (the classification head) stay trainable.
for frozen_param in model.roberta.parameters():
    frozen_param.requires_grad_(False)

In [None]:
# Report the total parameter count, then the count of trainable parameters only.
for caption, trainable_only in (("num params:", False), ("num trainable params:", True)):
  print(caption, model.num_parameters(only_trainable=trainable_only))

num params: 83473950
num trainable params: 613662


**Validation & Test**

In [None]:
def evaluate(test = False):
    """Run the global `model` over the validation or test set and print its metrics.

    Args:
        test: When equal to False (the default) the validation set
            (`eval_dataset`) is scored and results are printed under "Eval";
            any other value scores `test_dataset` and prints under "Test".

    Returns:
        Tuple (mean batch loss, accuracy). Macro F1 is printed but not returned.
    """
    use_validation = test == False
    tag = "Eval" if use_validation else "Test"

    model.eval()
    dataloader = DataLoader(eval_dataset if use_validation else test_dataset, batch_size=128)

    loss_sum = 0.0
    nb_eval_steps = 0
    # Per-batch results are collected and joined once after the loop; the empty
    # seed arrays keep the int64 dtype even when there are no batches.
    pred_batches = [np.empty((0), dtype=np.int64)]
    label_batches = [np.empty((0), dtype=np.int64)]
    for input_ids, labels in tqdm(dataloader, desc="Eval"):
        with torch.no_grad():
            outputs = model(input_ids=input_ids.to("cuda"), labels=labels.to("cuda"))
            loss, logits = outputs[0], outputs[1]
            loss_sum += loss.mean().item()
            nb_eval_steps += 1
        pred_batches.append(logits.argmax(dim=1).detach().cpu().numpy())
        label_batches.append(labels.detach().cpu().numpy())
        # Drop the batch references and release cached GPU memory before the next batch.
        del input_ids, labels, outputs
        torch.cuda.empty_cache()

    preds = np.concatenate(pred_batches, axis=0)
    out_label_ids = np.concatenate(label_batches, axis=0)
    eval_loss = loss_sum / nb_eval_steps
    acc = simple_accuracy(preds, out_label_ids)
    f1 = f1_score(y_true=out_label_ids, y_pred=preds, average="macro")

    print(f"=== {tag}: loss ===", eval_loss)
    print(f"=== {tag}: acc. ===", acc)
    print(f"=== {tag}: f1 ===", f1)
    return (eval_loss, acc)

**Training**

In [None]:
# Per-epoch [loss, accuracy] history from earlier runs; the training loop appends
# one row per epoch and saves the arrays back to the same files.
train_history_path = '/content/gdrive/MyDrive/MLA Project/10LanguagesModel/results/trainLoss_Accuracy.npy'
validation_history_path = '/content/gdrive/MyDrive/MLA Project/10LanguagesModel/results/validationLoss_Accuracy.npy'
# On a first run there is no history yet: start from an empty (0, 2) array
# instead of failing with FileNotFoundError. np.append(..., axis=0) in the
# training loop accepts this shape.
trainLA = np.load(train_history_path) if os.path.exists(train_history_path) else np.empty((0, 2))
validationLA = np.load(validation_history_path) if os.path.exists(validation_history_path) else np.empty((0, 2))

In [None]:
# Train for two epochs. After each epoch: print the training metrics, save a
# checkpoint, score the validation set, and append both results to the history
# files on Drive.
train_iterator = trange(0, 2, desc="Epoch")
optimizer = torch.optim.AdamW(model.parameters())
for _ in train_iterator:
    loss_sum, nb_train_steps = 0.0, 0
    # Per-batch predictions/labels are collected and joined once per epoch; the
    # empty seed arrays fix the dtype to int64.
    pred_batches = [np.empty((0), dtype=np.int64)]
    label_batches = [np.empty((0), dtype=np.int64)]
    for input_ids, labels in tqdm(train_dataloader, desc="Iteration"):
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids.to("cuda"), labels=labels.to("cuda"))
        loss, logits = outputs[0], outputs[1]
        loss.backward()
        loss_sum += loss.mean().item()
        nb_train_steps += 1
        pred_batches.append(logits.argmax(dim=1).detach().cpu().numpy())
        label_batches.append(labels.detach().cpu().numpy())
        optimizer.step()
        # Drop the batch references and release cached GPU memory before the next batch.
        del input_ids, labels, outputs
        torch.cuda.empty_cache()

    preds = np.concatenate(pred_batches, axis=0)
    out_label_ids = np.concatenate(label_batches, axis=0)
    train_loss = loss_sum / nb_train_steps
    acc = simple_accuracy(preds, out_label_ids)
    f1 = f1_score(y_true=out_label_ids, y_pred=preds, average="macro")
    print("=== Train: loss ===", train_loss)
    print("=== Train: acc. ===", acc)
    print("=== Train: f1 ===", f1)

    # Checkpoint first, then validate; evaluate() switches the model to eval mode.
    model.save_pretrained("/content/gdrive/MyDrive/MLA Project/10LanguagesModel/model")
    val = evaluate()

    trainLA = np.append(trainLA, [[train_loss, acc]],axis=0)
    validationLA = np.append(validationLA, [val],axis=0)
    np.save('/content/gdrive/MyDrive/MLA Project/10LanguagesModel/results/trainLoss_Accuracy.npy', trainLA)
    np.save('/content/gdrive/MyDrive/MLA Project/10LanguagesModel/results/validationLoss_Accuracy.npy', validationLA)
    # Back to training mode for the next epoch.
    model.train()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]
Iteration:   0%|          | 0/1533 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/1533 [00:04<2:01:28,  4.76s/it][A
Iteration:   0%|          | 2/1533 [00:06<1:21:01,  3.18s/it][A
Iteration:   0%|          | 3/1533 [00:08<1:08:05,  2.67s/it][A
Iteration:   0%|          | 4/1533 [00:10<1:02:03,  2.44s/it][A
Iteration:   0%|          | 5/1533 [00:13<58:52,  2.31s/it]  [A
Iteration:   0%|          | 6/1533 [00:15<57:02,  2.24s/it][A
Iteration:   0%|          | 7/1533 [00:17<55:54,  2.20s/it][A
Iteration:   1%|          | 8/1533 [00:19<55:21,  2.18s/it][A
Iteration:   1%|          | 9/1533 [00:21<55:07,  2.17s/it][A
Iteration:   1%|          | 10/1533 [00:23<54:57,  2.16s/it][A
Iteration:   1%|          | 11/1533 [00:25<54:49,  2.16s/it][A
Iteration:   1%|          | 12/1533 [00:28<54:47,  2.16s/it][A
Iteration:   1%|          | 13/1533 [00:30<54:51,  2.17s/it][A
Iteration:   1%|          | 14/1533 [00:32<54:50,  2.17s/it][A
Ite

=== Train: loss === 0.7205470244400811
=== Train: acc. === 0.7582817582103857
=== Train: f1 === 0.7598821172948759



Eval:   0%|          | 0/219 [00:00<?, ?it/s][A
Eval:   0%|          | 1/219 [00:02<08:22,  2.30s/it][A
Eval:   1%|          | 2/219 [00:04<08:17,  2.29s/it][A
Eval:   1%|▏         | 3/219 [00:06<08:16,  2.30s/it][A
Eval:   2%|▏         | 4/219 [00:09<08:14,  2.30s/it][A
Eval:   2%|▏         | 5/219 [00:11<08:12,  2.30s/it][A
Eval:   3%|▎         | 6/219 [00:13<08:11,  2.31s/it][A
Eval:   3%|▎         | 7/219 [00:16<08:09,  2.31s/it][A
Eval:   4%|▎         | 8/219 [00:18<08:07,  2.31s/it][A
Eval:   4%|▍         | 9/219 [00:20<08:04,  2.31s/it][A
Eval:   5%|▍         | 10/219 [00:23<08:02,  2.31s/it][A
Eval:   5%|▌         | 11/219 [00:25<08:00,  2.31s/it][A
Eval:   5%|▌         | 12/219 [00:27<07:58,  2.31s/it][A
Eval:   6%|▌         | 13/219 [00:30<07:57,  2.32s/it][A
Eval:   6%|▋         | 14/219 [00:32<07:55,  2.32s/it][A
Eval:   7%|▋         | 15/219 [00:34<07:53,  2.32s/it][A
Eval:   7%|▋         | 16/219 [00:37<07:52,  2.33s/it][A
Eval:   8%|▊         | 17/219 [

=== Eval: loss === 0.5416612482234223
=== Eval: acc. === 0.83755620583827
=== Eval: f1 === 0.8411714830192605



Iteration:   0%|          | 0/1533 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/1533 [00:02<1:02:14,  2.44s/it][A
Iteration:   0%|          | 2/1533 [00:04<1:02:10,  2.44s/it][A
Iteration:   0%|          | 3/1533 [00:07<1:02:07,  2.44s/it][A
Iteration:   0%|          | 4/1533 [00:09<1:01:59,  2.43s/it][A
Iteration:   0%|          | 5/1533 [00:12<1:01:55,  2.43s/it][A
Iteration:   0%|          | 6/1533 [00:14<1:01:49,  2.43s/it][A
Iteration:   0%|          | 7/1533 [00:17<1:01:45,  2.43s/it][A
Iteration:   1%|          | 8/1533 [00:19<1:01:47,  2.43s/it][A
Iteration:   1%|          | 9/1533 [00:21<1:01:45,  2.43s/it][A
Iteration:   1%|          | 10/1533 [00:24<1:01:45,  2.43s/it][A
Iteration:   1%|          | 11/1533 [00:26<1:01:40,  2.43s/it][A
Iteration:   1%|          | 12/1533 [00:29<1:01:42,  2.43s/it][A
Iteration:   1%|          | 13/1533 [00:31<1:01:39,  2.43s/it][A
Iteration:   1%|          | 14/1533 [00:34<1:01:38,  2.43s/it][A
Iteration:   1%|          | 1

=== Train: loss === 0.7235844297223983
=== Train: acc. === 0.7578025428999664
=== Train: f1 === 0.7594172847415417



Eval:   0%|          | 0/219 [00:00<?, ?it/s][A
Eval:   0%|          | 1/219 [00:02<08:23,  2.31s/it][A
Eval:   1%|          | 2/219 [00:04<08:20,  2.31s/it][A
Eval:   1%|▏         | 3/219 [00:06<08:18,  2.31s/it][A
Eval:   2%|▏         | 4/219 [00:09<08:16,  2.31s/it][A
Eval:   2%|▏         | 5/219 [00:11<08:14,  2.31s/it][A
Eval:   3%|▎         | 6/219 [00:13<08:12,  2.31s/it][A
Eval:   3%|▎         | 7/219 [00:16<08:09,  2.31s/it][A
Eval:   4%|▎         | 8/219 [00:18<08:07,  2.31s/it][A
Eval:   4%|▍         | 9/219 [00:20<08:05,  2.31s/it][A
Eval:   5%|▍         | 10/219 [00:23<08:03,  2.31s/it][A
Eval:   5%|▌         | 11/219 [00:25<08:01,  2.31s/it][A
Eval:   5%|▌         | 12/219 [00:27<07:58,  2.31s/it][A
Eval:   6%|▌         | 13/219 [00:30<07:57,  2.32s/it][A
Eval:   6%|▋         | 14/219 [00:32<07:55,  2.32s/it][A
Eval:   7%|▋         | 15/219 [00:34<07:52,  2.32s/it][A
Eval:   7%|▋         | 16/219 [00:37<07:51,  2.32s/it][A
Eval:   8%|▊         | 17/219 [

=== Eval: loss === 0.5059552247121454
=== Eval: acc. === 0.8410891442438084
=== Eval: f1 === 0.8450940006834597





**Final** **Test**

In [None]:
# Score the held-out test set; the metrics are printed, the returned tuple is discarded.
_ = evaluate(test=True)

# **Plots**

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
def Plotting(trainLA,validationLA):
    """Show two line charts: loss per epoch, then accuracy per epoch.

    Args:
        trainLA: 2-D array indexed as [:, 0] for loss and [:, 1] for accuracy,
            one row per training epoch.
        validationLA: Same layout for the validation set.

    Returns:
        None. Both figures are displayed with fig.show().
    """
    # (y-axis label, validation series, training series, training trace name, title)
    panels = (
        ("Loss", validationLA[:,0], trainLA[:,0], "Training Loss", "Loss of Model"),
        ("Accuracy", validationLA[:,1], trainLA[:,1], "Train Accuracy", "Accuracy of Model"),
    )

    figures = []
    for quantity, validation_series, train_series, train_name, title in panels:
        # Created with a secondary y-axis spec; both traces go on the primary axis.
        figure = make_subplots(specs=[[{"secondary_y": True}]])
        figure.add_trace(go.Scatter(y=validation_series, name=f"Validation {quantity}"), secondary_y=False)
        figure.add_trace(go.Scatter(y=train_series, name=train_name), secondary_y=False)
        figure.update_layout(title_text=title)
        figure.update_xaxes(title_text="Epoch")
        figure.update_yaxes(title_text=quantity)
        figures.append(figure)

    # Display only after both figures are fully built: loss first, then accuracy.
    for figure in figures:
        figure.show()
    return

## 10 Programming Languages without comments

In [None]:
# Loss/accuracy history saved by the training loop in this notebook (10LanguagesModel/results).
trainLA = np.load('/content/gdrive/MyDrive/MLA Project/10LanguagesModel/results/trainLoss_Accuracy.npy')
validationLA = np.load('/content/gdrive/MyDrive/MLA Project/10LanguagesModel/results/validationLoss_Accuracy.npy')

In [None]:
# Plot the loss and accuracy curves for the history loaded in the previous cell.
Plotting(trainLA,validationLA)

## 10 Programming Languages: training the whole transformer

In [None]:
# Loss/accuracy history stored under 'training whole model'; the run that wrote
# these files is not part of the cells shown in this notebook.
trainLA = np.load('/content/gdrive/MyDrive/MLA Project/training whole model/trainLoss_Accuracy.npy')
validationLA = np.load('/content/gdrive/MyDrive/MLA Project/training whole model/validationLoss_Accuracy.npy')

In [None]:
# Plot the loss and accuracy curves for the history loaded in the previous cell.
Plotting(trainLA,validationLA)