# **Imports**

In [None]:
!pip install tokenizers
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 7.0 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.12.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 8.6 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 55.9 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 55.8 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none

In [None]:
import os
import gzip
import json
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import plotly.express as ex
from pathlib import Path
from sklearn.model_selection import train_test_split
from tokenizers.implementations.byte_level_bpe import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from datasets import load_dataset
from torch.utils.data import Dataset,DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import RobertaForSequenceClassification
from transformers.data.metrics import simple_accuracy
from sklearn.metrics import f1_score

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [None]:
# Mount Google Drive so the '/content/gdrive/MyDrive/...' paths used throughout the notebook resolve.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# **Data Setting**

### Data Collection

**Constants**

In [None]:
# Number of files taken per language by the data-extraction cell (`dataset.take(Size)`).
Size = 10000

In [None]:
# Maps each of the 30 language names to an integer id (0-29).
# NOTE: the name LANGUAGES is rebound later in the notebook (comment delimiters in the
# "Removing Comments" section, then the id map again in the training constants).
LANGUAGES = {
    "Assembly": 0,
    "Batchfile": 1,
    "C": 2,
    "C#": 3,
    "C++": 4,
    "CMake": 5,
    "CSS": 6,
    "Dockerfile": 7,
    "FORTRAN": 8,
    "GO": 9,
    "Haskell": 10,
    "HTML":11,
    "Java": 12,
    "JavaScript": 13,
    "Julia": 14,
    "Lua": 15,
    "Makefile": 16,
    "Markdown": 17,
    "PHP": 18,
    "Perl": 19,
    "PowerShell": 20,
    "Python": 21,
    "Ruby": 22,
    "Rust": 23,
    "SQL": 24,
    "Scala": 25,
    "Shell": 26,
    "TypeScript": 27,
    "TeX": 28,
    "Visual Basic": 29
}

**Data Extraction**

In [None]:
# Stream `Size` files for every language in LANGUAGES from the codeparrot/github-code
# dataset and store them, with only the 'code' and 'language' columns, as one parquet file.
frames = []
for l in LANGUAGES:  # was `Languages`, which is not defined anywhere in the notebook
  dataset = load_dataset("codeparrot/github-code", streaming=True, split='train', languages = [l])
  dataset = dataset.remove_columns(['repo_name', 'path', 'license', 'size'])
  dataset = dataset.take(Size)
  frames.append(pd.DataFrame(dataset))
# Concatenate once instead of DataFrame.append per language (append copies the whole frame
# each time and no longer exists in pandas 2). Like append, concat keeps each frame's own
# 0..Size-1 index, so the stored index layout is unchanged.
DataFrame = pd.concat(frames) if frames else pd.DataFrame(columns = ['code', 'language'])
DataFrame.to_parquet('/content/gdrive/MyDrive/MLA Project/data.parquet')

**Data** **Visualization**

In [None]:
# Files per language: after count() the remaining 'code' column holds the number of rows per language.
dataset_count = DataFrame.groupby('language').count()
# Pass the counts as `values` so each slice is proportional to the number of files.
# With `names` alone every language occurs exactly once in the index, so all slices
# come out equal no matter what the counts are.
ex.pie(dataset_count.reset_index(),names='language',values='code',title='Proportion Of Different Languages',hole=0.33)

### Reading Data

In [None]:
# Load the class-balanced dataset (balancedData.parquet).
# NOTE(review): that file is written at the end of the "Balancing Data" section further
# down, so this cell only works once that section has been run — confirm the intended order.
DataFrame = pd.read_parquet('/content/gdrive/MyDrive/MLA Project/balancedData.parquet')

In [None]:
# Split the frame into parallel arrays: file contents and their language names.
Data = DataFrame['code'].to_numpy()
Label = DataFrame['language'].to_numpy()

### Splitting Data

In [None]:
# First cut: 70% train, 30% held out, stratified by language.
train_data, holdout_data, train_labels, holdout_labels = train_test_split(
    Data,
    Label,
    test_size=0.3,
    random_state=0,
    stratify=Label,
)
# Second cut: one third of the held-out part becomes validation, two thirds become test
# (10% / 20% of the full dataset), again stratified by language.
validation_data, test_data, validation_labels, test_labels = train_test_split(
    holdout_data,
    holdout_labels,
    test_size=(2/3),
    random_state=0,
    stratify=holdout_labels,
)

In [None]:
# Sanity check: sizes of the train / validation / test splits.
print(len(train_data))
print(len(validation_data))
print(len(test_data))

159705
22815
45630


### Saving Data

In [None]:
# Persist every split to Drive so later sections can reload them without re-splitting.
split_files = [
    ('/content/gdrive/MyDrive/MLA Project/balancedData/trainData.npy', train_data),
    ('/content/gdrive/MyDrive/MLA Project/balancedData/trainLabels.npy', train_labels),
    ('/content/gdrive/MyDrive/MLA Project/balancedData/validationData.npy', validation_data),
    ('/content/gdrive/MyDrive/MLA Project/balancedData/validationLabels.npy', validation_labels),
    ('/content/gdrive/MyDrive/MLA Project/balancedData/testData.npy', test_data),
    ('/content/gdrive/MyDrive/MLA Project/balancedData/testLabels.npy', test_labels),
]
for npy_path, split_array in split_files:
  np.save(npy_path, split_array)

# **Removing** **Comments**

### Script

In [None]:
# Load the raw per-language dataset that comments will be stripped from.
# NOTE(review): the extraction cell writes 'data.parquet' but this reads 'dataset.parquet' — confirm the file name.
DataFrame = pd.read_parquet('/content/gdrive/MyDrive/MLA Project/dataset.parquet')

In [None]:
# Comment delimiters per language as a flat list of (opener, closer) pairs:
# [open1, close1, open2, close2, ...]. A closer of '\n' marks a line comment.
# RemoveComments applies the pairs in list order, so a block-comment pair must come
# before a line-comment pair whose marker is contained in the block delimiters
# (Julia '#=' ... '=#', PowerShell '<#' ... '#>'); otherwise the '#' pass destroys the
# block opener before it can be matched.
# NOTE: this rebinds LANGUAGES, which earlier in the notebook held the language -> id map.
LANGUAGES = {
    "Assembly": [';','\n'],
    "Batchfile": ['::','\n','REM','\n'],
    "C": ['/*','*/','//','\n'],
    "C#": ['/*','*/','//','\n'],
    "C++": ['/*','*/','//','\n'],
    "CMake": ['#[[',']]','#','\n'],
    "CSS": ['/*','*/'],
    "Dockerfile": ['#','\n'],
    "FORTRAN": ['!','\n'],
    "GO": ['/*','*/','//','\n'],
    "Haskell": ['--','\n','{-','-}'],
    "HTML": ['<!--', '-->'],
    "Java": ['/*','*/','//','\n'],
    "JavaScript": ['/*','*/','//','\n'],
    "Julia": ['#=','=#','#','\n'],
    # A Lua long comment ends at the first ']]'; '--]]' is only a common way of writing it,
    # so matching on ']]' covers both spellings.
    "Lua": ['--[[',']]','--','\n'],
    "Markdown": ['[',']: #','[//]: # (',')','[//]: # "','"','[]: # (',')','[]: # "','"','<!--','-->'],
    "Makefile": ['#','\n'],
    "PHP": ['/*','*/','//','\n'],
    "Perl": ['#','\n','=begin','=end'],
    "PowerShell": ['<#','#>','#','\n'],
    "Python": ['#','\n'],
    "Ruby": ['#','\n','=begin','=end'],
    "Rust": ['/*','*/','//','\n'],
    "SQL": ['/*','*/','--','\n'],
    "Scala": ['/*','*/','//','\n'],
    "Shell": [": '", " '"],
    "TypeScript": ['//','\n','/*','*/'],
    "TeX": ['%','\n'],
    "Visual Basic": ["'","\n"]
}

In [None]:
def removeSpecialComments(code, specialChars, shell=False):
  """Drop every line of `code` whose first character is one of `specialChars`.

  Args:
    code: source text; it is split with str.splitlines().
    specialChars: collection of single characters that mark a comment line.
    shell: when True, leading/trailing whitespace is ignored while testing the
      first character (the kept lines themselves are not stripped).

  Returns:
    The remaining lines joined with '\n'. Empty lines are always kept; a trailing
    newline of the input is not preserved because of the split/join round trip.
  """
  kept = []
  # Single filtering pass: the previous collect-then-list.remove() approach rescanned
  # the line list for every comment line (quadratic) while producing the same result.
  for line in code.splitlines():
    probe = line.strip() if shell else line
    if probe != '' and probe[0] in specialChars:
      continue
    kept.append(line)
  return "\n".join(kept)

In [None]:
def _strip_delimited(code, start, end):
  """Return `code` with every span from `start` through the next `end` removed."""
  kept = []
  cursor = 0
  while True:
    begin = code.find(start, cursor)
    if begin == -1:
      break
    stop = code.find(end, begin + len(start))
    if stop == -1:
      if end == '\n':
        # Line comment on the last line of a file that has no trailing newline.
        kept.append(code[cursor:begin])
        cursor = len(code)
      # An unterminated block comment is left untouched.
      break
    kept.append(code[cursor:begin])
    # Keep the newline that terminates a line comment so the following line is not
    # glued onto the current one; for block comments the closer is removed as well.
    cursor = stop if end == '\n' else stop + len(end)
  kept.append(code[cursor:])
  return ''.join(kept)


def RemoveComments(code, language, delimiters=None):
  """Strip comments from `code` using the delimiter pairs registered for `language`.

  Args:
    code: source text of one file.
    language: key into LANGUAGES; also selects the extra line-based pass that is
      applied for "FORTRAN" and "Shell".
    delimiters: optional flat list [open1, close1, open2, close2, ...] that
      overrides LANGUAGES[language]. Defaults to None (use the table).

  Returns:
    The text with every matched comment span removed. Pairs are applied in list
    order, each one scanning the result of the previous pair.

  Raises:
    KeyError: if `delimiters` is None and `language` is not a key of LANGUAGES.

  Compared with the previous implementation, which deleted
  `temp[l:m + len(end)]` with `m` already pointing past the closer:
    * no characters after the closer are deleted any more;
    * a second opener inside an already matched comment no longer triggers a
      second, overlapping deletion;
    * empty comments (opener immediately followed by closer) are recognised;
    * a line comment that runs to end-of-file without a newline is removed.
  """
  identifier = LANGUAGES[language] if delimiters is None else delimiters
  for start, end in zip(identifier[0::2], identifier[1::2]):
    code = _strip_delimited(code, start, end)
  if language == "FORTRAN":
    # NOTE(review): any line starting with 'C', 'c', 'd' or 'D' in column 1 is dropped,
    # which also matches free-form statements such as CALL or DO — confirm inputs are fixed-form.
    code = removeSpecialComments(code, ['*', 'C', 'c', 'd', 'D'])
  elif language == 'Shell':
    code = removeSpecialComments(code, ['#'], True)
  return code

In [None]:
# Strip the comments from every file, language by language (languages in order of first
# appearance), and store the result as a new parquet file.
languages=DataFrame.language.unique()
records = []
for language in languages:
  # Select the language's files once with a boolean mask. The previous version rebuilt
  # this mask for every single row and then looked rows up by index label, which only
  # works when each language's rows carry the labels 0..n-1.
  for code in DataFrame.loc[DataFrame['language'] == language, 'code']:
    records.append({'code': RemoveComments(code, language), 'language': language})
# Build the frame in one go: DataFrame.append inside the loop copied the whole frame per
# row (quadratic) and no longer exists in pandas 2.
new_df = pd.DataFrame(records, columns = ['code', 'language'])

new_df.to_parquet('/content/gdrive/MyDrive/MLA Project/dataWithoutComments.parquet')

### Balancing Data

In [None]:
# Reload the comment-free dataset written by the previous section.
DataFrame = pd.read_parquet('/content/gdrive/MyDrive/MLA Project/dataWithoutComments.parquet')

In [None]:
# Number of files per language before empty files are dropped.
DataFrame.groupby('language').count()

Unnamed: 0_level_0,code
language,Unnamed: 1_level_1
Assembly,10000
Batchfile,10000
C,10000
C#,10000
C++,10000
CMake,10000
CSS,10000
Dockerfile,10000
FORTRAN,10000
GO,10000


In [None]:
languages=DataFrame.language.unique()
# Keep only the files that still contain text after comment removal.
# A vectorised filter replaces the row-by-row DataFrame.append (quadratic, removed in
# pandas 2) and takes each row's language from the row itself instead of deriving it
# from the row position, which assumed equally sized, contiguous language blocks.
non_empty = DataFrame['code'].str.len() != 0
new_df = DataFrame.loc[non_empty, ['code', 'language']].reset_index(drop=True)

In [None]:
# Number of non-empty files left per language.
new_df.groupby('language').count()

Unnamed: 0_level_0,code
language,Unnamed: 1_level_1
Assembly,9397
Batchfile,9955
C,9815
C#,9954
C++,9776
CMake,9149
CSS,9892
Dockerfile,9862
FORTRAN,9769
GO,9894


In [None]:
# Checkpoint the dataset without empty files.
new_df.to_parquet('/content/gdrive/MyDrive/MLA Project/dataWithoutEmptyFiles.parquet')

In [None]:
# Reload the checkpoint so the balancing cells below can be run on their own.
new_df = pd.read_parquet('/content/gdrive/MyDrive/MLA Project/dataWithoutEmptyFiles.parquet')

In [None]:
# Total number of non-empty files across all languages.
len(new_df)

290298

In [None]:
# Number of files per language in `new_df`, keyed by language name.
# Computed from the data instead of being copied by hand from the groupby output of an
# earlier run, so the counts cannot drift out of sync with the frame they describe.
files_count_map = new_df.groupby('language').size().to_dict()

In [None]:
# Balance the classes: every language keeps as many files as the smallest class has
# (7605, Lua, in the recorded run) instead of a hard-coded number.
samples_per_language = int(new_df['language'].value_counts().min())

# groupby(...).head(n) keeps the first n rows of each language in their original order.
# This replaces the positional start/end bookkeeping (which assumed contiguous language
# blocks matching hand-copied counts) and the row-by-row DataFrame.append.
final_df = (
    new_df.groupby('language', sort=False)
    .head(samples_per_language)[['code', 'language']]
    .reset_index(drop=True)
)

In [None]:
# Verify that every language now has the same number of files.
final_df.groupby('language').count()

Unnamed: 0_level_0,code
language,Unnamed: 1_level_1
Assembly,7605
Batchfile,7605
C,7605
C#,7605
C++,7605
CMake,7605
CSS,7605
Dockerfile,7605
FORTRAN,7605
GO,7605


In [None]:
# Files per language after balancing: the 'code' column holds the per-language row count.
dataset_count = final_df.groupby('language').count()
# Pass the counts as `values` so each slice reflects the number of files; with `names`
# alone every language appears once and the slices are equal regardless of the data.
ex.pie(dataset_count.reset_index(),names='language',values='code',title='Proportion Of Different Languages',hole=0.33)

In [None]:
# Store the balanced dataset; the "Reading Data" cell loads this file.
final_df.to_parquet('/content/gdrive/MyDrive/MLA Project/balancedData.parquet')

# **Tokenizer**

### Load data

In [None]:
# Reload the raw-text splits and their labels saved by the "Saving Data" cell.
# allow_pickle=True is presumably required because the arrays hold Python strings
# (object dtype) — it unpickles arbitrary objects, so only load files this notebook wrote.
trainData = np.load('/content/gdrive/MyDrive/MLA Project/balancedData/trainData.npy', allow_pickle=True)
trainLabels = np.load('/content/gdrive/MyDrive/MLA Project/balancedData/trainLabels.npy', allow_pickle=True)
testData = np.load('/content/gdrive/MyDrive/MLA Project/balancedData/testData.npy', allow_pickle=True)
testLabels = np.load('/content/gdrive/MyDrive/MLA Project/balancedData/testLabels.npy', allow_pickle=True)
validationData = np.load('/content/gdrive/MyDrive/MLA Project/balancedData/validationData.npy', allow_pickle=True)
validationLabels = np.load('/content/gdrive/MyDrive/MLA Project/balancedData/validationLabels.npy', allow_pickle=True)

### Training

In [None]:
# Dump the training files into plain-text shards (1.txt, 2.txt, ...) that the tokenizer
# trainer reads; each shard holds up to `files_per_shard` files joined by newlines.
dump_dir = '/content/gdrive/MyDrive/MLA Project/dump'
files_per_shard = 15900
for file_count, first in enumerate(tqdm(range(0, trainData.size, files_per_shard)), start=1):
  # Source files may contain non-ASCII characters, so state the encoding explicitly
  # instead of relying on the platform default.
  with open(f'{dump_dir}/{file_count}.txt', 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(trainData[first:first + files_per_shard]))

100%|██████████| 159705/159705 [00:07<00:00, 21834.56it/s]


In [None]:
from pathlib import Path
# Every .txt shard below the dump directory, as plain strings for the tokenizer trainer.
dump_root = Path('/content/gdrive/MyDrive/MLA Project/dump')
paths = list(map(str, dump_root.glob('**/*.txt')))

In [None]:
# Show which shard files were found.
print(paths)

['/content/gdrive/MyDrive/MLA Project/dump/1.txt', '/content/gdrive/MyDrive/MLA Project/dump/2.txt', '/content/gdrive/MyDrive/MLA Project/dump/3.txt', '/content/gdrive/MyDrive/MLA Project/dump/4.txt', '/content/gdrive/MyDrive/MLA Project/dump/5.txt', '/content/gdrive/MyDrive/MLA Project/dump/6.txt', '/content/gdrive/MyDrive/MLA Project/dump/7.txt', '/content/gdrive/MyDrive/MLA Project/dump/8.txt', '/content/gdrive/MyDrive/MLA Project/dump/9.txt', '/content/gdrive/MyDrive/MLA Project/dump/10.txt', '/content/gdrive/MyDrive/MLA Project/dump/11.txt']


In [None]:
from tokenizers import ByteLevelBPETokenizer
# initialize
tokenizer = ByteLevelBPETokenizer()
# and train
# Learn a byte-level BPE vocabulary of up to 50000 entries from the dumped shards; a merge
# must occur at least twice (min_frequency=2). The listed special tokens are added to the
# vocabulary — presumably receiving the first ids in the order given; confirm in vocab.json.
tokenizer.train(files=paths, vocab_size=50000 , min_frequency=2,
                special_tokens=['<|endoftext|>', '<s>', '<pad>', '</s>', '<unk>', '<mask>'])

In [None]:
# Write vocab.json and merges.txt so the tokenizer can be rebuilt without retraining.
tokenizer.save_model('/content/gdrive/MyDrive/MLA Project/tokenizer/without comments')

['/content/gdrive/MyDrive/MLA Project/tokenizer/without comments/vocab.json',
 '/content/gdrive/MyDrive/MLA Project/tokenizer/without comments/merges.txt']

### Tokenizing Data

In [None]:
# Set up tokenizer
tokenizer = ByteLevelBPETokenizer("/content/gdrive/MyDrive/MLA Project/tokenizer/without comments/vocab.json", "/content/gdrive/MyDrive/MLA Project/tokenizer/without comments/merges.txt",)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding(length=512)

In [None]:
# Tokenize the training set in slices of `chunk_size` files and save each slice separately,
# so a single batch never has to hold the whole split. The number of slices is derived
# from the data (it was hard-coded to 15 full slices plus one remainder).
chunk_size = 10000
for i in tqdm(range(len(trainData) // chunk_size + 1)):
  encoding = tokenizer.encode_batch(trainData[chunk_size*i:chunk_size*(i+1)])
  en_train_data = [e.ids for e in encoding]
  np.save(f'/content/gdrive/MyDrive/MLA Project/balancedData/TOKENIZEDtrainData{i}.npy', en_train_data)

100%|██████████| 15/15 [11:07<00:00, 44.51s/it]


In [None]:
# Stitch the per-slice files back into one array of token ids. The number of slice files
# follows from the split size (10000 files per slice) instead of a hard-coded 16.
temp = []
for i in tqdm(range(len(trainData) // 10000 + 1)):
  temp.extend(np.load(f'/content/gdrive/MyDrive/MLA Project/balancedData/TOKENIZEDtrainData{i}.npy'))
np.save('/content/gdrive/MyDrive/MLA Project/balancedData/TOKENIZEDtrainData.npy', temp)

100%|██████████| 16/16 [00:02<00:00,  7.73it/s]


In [None]:
# Tokenize the validation set in slices of `chunk_size` files, one .npy file per slice.
# The number of slices is derived from the data (it was hard-coded to 2 plus a remainder).
chunk_size = 10000
for i in tqdm(range(len(validationData) // chunk_size + 1)):
  encoding = tokenizer.encode_batch(validationData[chunk_size*i:chunk_size*(i+1)])
  en_validation_data = [e.ids for e in encoding]
  np.save(f'/content/gdrive/MyDrive/MLA Project/balancedData/TOKENIZEDvalidationData{i}.npy', en_validation_data)

100%|██████████| 2/2 [01:28<00:00, 44.05s/it]


In [None]:
# Stitch the validation slices back together; the slice count follows from the split
# size (10000 files per slice) instead of a hard-coded 3.
temp = []
for i in tqdm(range(len(validationData) // 10000 + 1)):
  temp.extend(np.load(f'/content/gdrive/MyDrive/MLA Project/balancedData/TOKENIZEDvalidationData{i}.npy'))
np.save('/content/gdrive/MyDrive/MLA Project/balancedData/TOKENIZEDvalidationData.npy', temp)

100%|██████████| 3/3 [00:00<00:00, 18.55it/s]


In [None]:
# Release the raw validation text and its last tokenized slice to free kernel memory.
del validationData
del en_validation_data

In [None]:
# Tokenize the test set in slices of `chunk_size` files, one .npy file per slice.
# The number of slices is derived from the data (it was hard-coded to 4 plus a remainder).
chunk_size = 10000
for i in tqdm(range(len(testData) // chunk_size + 1)):
  encoding = tokenizer.encode_batch(testData[chunk_size*i:chunk_size*(i+1)])
  en_test_data = [e.ids for e in encoding]
  np.save(f'/content/gdrive/MyDrive/MLA Project/balancedData/TOKENIZEDtestData{i}.npy', en_test_data)

100%|██████████| 4/4 [03:22<00:00, 50.64s/it]


In [None]:
# Stitch the test slices back together; the slice count follows from the split size
# (10000 files per slice) instead of a hard-coded 5.
temp = []
for i in tqdm(range(len(testData) // 10000 + 1)):
  temp.extend(np.load(f'/content/gdrive/MyDrive/MLA Project/balancedData/TOKENIZEDtestData{i}.npy'))
np.save('/content/gdrive/MyDrive/MLA Project/balancedData/TOKENIZEDtestData.npy', temp)

100%|██████████| 5/5 [00:00<00:00, 15.70it/s]


In [None]:
# Release the raw test text and its last tokenized slice to free kernel memory.
del testData
del en_test_data

# **Transformer**

### Loading Data

In [None]:
# Load the tokenized splits (arrays of token ids) and the language labels for training.
# NOTE(review): the tokenizer section saves its outputs under 'balancedData/', while these
# paths read from 'data/' — confirm which dataset this directory holds.
trainData = np.load('/content/gdrive/MyDrive/MLA Project/data/TOKENIZEDtrainData.npy')
validationData = np.load('/content/gdrive/MyDrive/MLA Project/data/TOKENIZEDvalidationData.npy')
testData = np.load('/content/gdrive/MyDrive/MLA Project/data/TOKENIZEDtestData.npy')
trainLabels = np.load('/content/gdrive/MyDrive/MLA Project/data/trainLabels.npy', allow_pickle=True)
testLabels = np.load('/content/gdrive/MyDrive/MLA Project/data/testLabels.npy', allow_pickle=True)
validationLabels = np.load('/content/gdrive/MyDrive/MLA Project/data/validationLabels.npy', allow_pickle=True)

### Loading Model

In [None]:
# Load a previously saved classifier from Drive.
# NOTE(review): `model` is rebound by the from_pretrained(CODEBERTA_PRETRAINED, ...) cell in
# the training script below — confirm which of the two cells is meant to run.
model = RobertaForSequenceClassification.from_pretrained('/content/gdrive/MyDrive/MLA Project/model/weights')

## Training Script

**Constants**

In [None]:
# Hub id of the pretrained checkpoint the classifier is initialised from.
CODEBERTA_PRETRAINED = "huggingface/CodeBERTa-language-id"
# NOTE(review): EVALUATE is not read by any cell visible in this notebook — confirm it is still needed.
EVALUATE = False
# Language name -> integer class id; CodeDataset uses it to turn label strings into targets.
LANGUAGES = {
    "Assembly": 0,
    "Batchfile": 1,
    "C": 2,
    "C#": 3,
    "C++": 4,
    "CMake": 5,
    "CSS": 6,
    "Dockerfile": 7,
    "FORTRAN": 8,
    "GO": 9,
    "Haskell": 10,
    "HTML":11,
    "Java": 12,
    "JavaScript": 13,
    "Julia": 14,
    "Lua": 15,
    "Makefile": 16,
    "Markdown": 17,
    "PHP": 18,
    "Perl": 19,
    "PowerShell": 20,
    "Python": 21,
    "Ruby": 22,
    "Rust": 23,
    "SQL": 24,
    "Scala": 25,
    "Shell": 26,
    "TypeScript": 27,
    "TeX": 28,
    "Visual Basic": 29
}

**Data Class**

In [None]:
class CodeDataset(Dataset):
    """Pairs each tokenized file with the integer id of its language.

    `data[index]` is converted to a tensor as-is; `label[index]` is looked up in
    `language` (a name -> id mapping) and the resulting id is converted to a tensor.
    """

    def __init__(self, data, label, language):
        self.data, self.label, self.language = data, label, language

    def __len__(self):
        # One example per entry of `data`.
        return len(self.data)

    def __getitem__(self, index):
        token_ids = torch.tensor(self.data[index])
        class_id = torch.tensor(self.language[self.label[index]])
        return token_ids, class_id

In [None]:
# Wrap each split; LANGUAGES supplies the label-name -> class-id lookup.
train_dataset = CodeDataset(trainData, trainLabels, LANGUAGES)
eval_dataset = CodeDataset(validationData, validationLabels, LANGUAGES)
test_dataset = CodeDataset(testData, testLabels, LANGUAGES)

**Model**

In [None]:
# Start from the CodeBERTa language-id checkpoint with a classification head sized to
# len(LANGUAGES); ignore_mismatched_sizes lets the head be newly initialised when its
# shape differs from the checkpoint's (see the loading message below).
model = RobertaForSequenceClassification.from_pretrained(CODEBERTA_PRETRAINED, num_labels=len(LANGUAGES), ignore_mismatched_sizes=True)

Downloading:   0%|          | 0.00/756 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/336M [00:00<?, ?B/s]

Some weights of the model checkpoint at huggingface/CodeBERTa-language-id were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at huggingface/CodeBERTa-language-id and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([6, 768]) in the checkpoint and torch.Size([10, 768]) 

**DataLoader**

In [None]:
# Shuffled mini-batches of 128 examples for training.
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)

**Miscellaneous**

In [None]:
model.to("cuda")
model.train()
# Freeze the RoBERTa encoder so that only the classification head receives gradients.
# NOTE(review): the tokenizer in this notebook is trained from scratch, so its token ids
# presumably do not match the embedding table of the pretrained encoder frozen here — confirm.
for encoder_param in model.roberta.parameters():
  encoder_param.requires_grad = False

In [None]:
# Total parameter count versus the part that is still trainable after freezing.
total_params = model.num_parameters()
trainable_params = model.num_parameters(only_trainable=True)
print("num params:", total_params)
print("num trainable params:", trainable_params)

num params: 83473950
num trainable params: 613662


**Validation & Test**

In [None]:
def evaluate(test = False):
    """Run the global `model` over the validation or test set and print its metrics.

    Args:
        test: when equal to False (the default) `eval_dataset` is scored and the
            printed lines are tagged "Eval"; otherwise `test_dataset` is scored and
            the lines are tagged "Test".

    Returns:
        (mean batch loss, accuracy). Macro-averaged F1 is printed but not returned.

    Side effects:
        Puts `model` into eval mode and leaves it there; the caller is responsible
        for calling model.train() again.
    """
    use_validation = (test == False)
    dataloader = DataLoader(eval_dataset if use_validation else test_dataset, batch_size=128)
    # Send batches to wherever the model currently lives instead of assuming "cuda".
    device = next(model.parameters()).device

    eval_loss = 0.0
    nb_eval_steps = 0
    # Collect per-batch arrays and join them once at the end; np.append copied the whole
    # accumulated array on every batch.
    batch_preds = []
    batch_labels = []

    model.eval()
    for step, (input_ids, labels) in enumerate(tqdm(dataloader, desc="Eval")):
        with torch.no_grad():
            # NOTE(review): no attention_mask is passed, so padded positions are attended to — confirm this is intended.
            outputs = model(input_ids=input_ids.to(device), labels=labels.to(device))
            loss = outputs[0]
            logits = outputs[1]
            eval_loss += loss.mean().item()
            nb_eval_steps += 1
        batch_preds.append(logits.argmax(dim=1).detach().cpu().numpy())
        batch_labels.append(labels.detach().cpu().numpy())
        del input_ids
        del labels
        del outputs
        torch.cuda.empty_cache()
    eval_loss = eval_loss / nb_eval_steps
    preds = np.concatenate(batch_preds)
    out_label_ids = np.concatenate(batch_labels)
    acc = simple_accuracy(preds, out_label_ids)
    f1 = f1_score(y_true=out_label_ids, y_pred=preds, average="macro")
    tag = "Eval" if use_validation else "Test"
    print(f"=== {tag}: loss ===", eval_loss)
    print(f"=== {tag}: acc. ===", acc)
    print(f"=== {tag}: f1 ===", f1)
    return (eval_loss, acc)

**Training**

In [None]:
# Resume the per-epoch [loss, accuracy] history from Drive, or start with an empty
# (0, 2) array on the very first run, when no history file has been written yet.
train_history_path = '/content/gdrive/MyDrive/MLA Project/training whole model/trainLoss_Accuracy.npy'
validation_history_path = '/content/gdrive/MyDrive/MLA Project/training whole model/validationLoss_Accuracy.npy'
trainLA = np.load(train_history_path) if os.path.exists(train_history_path) else np.empty((0, 2))
validationLA = np.load(validation_history_path) if os.path.exists(validation_history_path) else np.empty((0, 2))

In [None]:
# Train for two epochs. After each epoch: print the training metrics, save the model,
# run validation, and append both [loss, accuracy] rows to the history files on Drive.
train_iterator = trange(0, 2, desc="Epoch")
optimizer = torch.optim.AdamW(model.parameters())
# Send batches to wherever the model currently lives instead of assuming "cuda".
device = next(model.parameters()).device
for _ in train_iterator:
    train_loss = 0.0
    nb_train_steps = 0
    # Collect per-batch arrays and join them once per epoch; np.append copied the whole
    # accumulated array on every step.
    batch_preds = []
    batch_labels = []
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")
    for step, (input_ids, labels) in enumerate(epoch_iterator):
        optimizer.zero_grad()
        # NOTE(review): no attention_mask is passed, so padded positions are attended to — confirm this is intended.
        outputs = model(input_ids=input_ids.to(device), labels=labels.to(device))
        loss = outputs[0]
        loss.backward()
        logits = outputs[1]
        train_loss += loss.mean().item()
        nb_train_steps += 1
        batch_preds.append(logits.argmax(dim=1).detach().cpu().numpy())
        batch_labels.append(labels.detach().cpu().numpy())
        optimizer.step()
        del input_ids
        del labels
        del outputs
        torch.cuda.empty_cache()
    train_loss = train_loss / nb_train_steps
    preds = np.concatenate(batch_preds)
    out_label_ids = np.concatenate(batch_labels)
    acc = simple_accuracy(preds, out_label_ids)
    f1 = f1_score(y_true=out_label_ids, y_pred=preds, average="macro")
    print("=== Train: loss ===", train_loss)
    print("=== Train: acc. ===", acc)
    print("=== Train: f1 ===", f1)
    model.save_pretrained("/content/gdrive/MyDrive/MLA Project/training whole model/model")
    val = evaluate()
    trainLA = np.append(trainLA, [[train_loss, acc]],axis=0)
    validationLA = np.append(validationLA, [val],axis=0)
    np.save('/content/gdrive/MyDrive/MLA Project/training whole model/trainLoss_Accuracy.npy', trainLA)
    np.save('/content/gdrive/MyDrive/MLA Project/training whole model/validationLoss_Accuracy.npy', validationLA)
    # evaluate() leaves the model in eval mode; switch back before the next epoch.
    model.train()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Iteration:  18%|█▊        | 1133/6130 [32:54<2:25:29,  1.75s/it][A
Iteration:  18%|█▊        | 1134/6130 [32:55<2:25:39,  1.75s/it][A
Iteration:  19%|█▊        | 1135/6130 [32:57<2:25:53,  1.75s/it][A
Iteration:  19%|█▊        | 1136/6130 [32:59<2:25:59,  1.75s/it][A
Iteration:  19%|█▊        | 1137/6130 [33:01<2:26:12,  1.76s/it][A
Iteration:  19%|█▊        | 1138/6130 [33:02<2:26:01,  1.76s/it][A
Iteration:  19%|█▊        | 1139/6130 [33:04<2:26:13,  1.76s/it][A
Iteration:  19%|█▊        | 1140/6130 [33:06<2:26:18,  1.76s/it][A
Iteration:  19%|█▊        | 1141/6130 [33:08<2:25:59,  1.76s/it][A
Iteration:  19%|█▊        | 1142/6130 [33:09<2:26:11,  1.76s/it][A
Iteration:  19%|█▊        | 1143/6130 [33:11<2:26:03,  1.76s/it][A
Iteration:  19%|█▊        | 1144/6130 [33:13<2:25:57,  1.76s/it][A
Iteration:  19%|█▊        | 1145/6130 [33:15<2:25:46,  1.75s/it][A
Iteration:  19%|█▊        | 1146/6130 [33:17<2:25:5

=== Train: loss === 2.3956004555034793
=== Train: acc. === 0.10034462718068457
=== Train: f1 === 0.10032644166224432



Eval:   0%|          | 0/876 [00:00<?, ?it/s][A
Eval:   0%|          | 1/876 [00:00<08:11,  1.78it/s][A
Eval:   0%|          | 2/876 [00:01<08:25,  1.73it/s][A
Eval:   0%|          | 3/876 [00:01<08:13,  1.77it/s][A
Eval:   0%|          | 4/876 [00:02<08:22,  1.74it/s][A
Eval:   1%|          | 5/876 [00:02<08:16,  1.75it/s][A
Eval:   1%|          | 6/876 [00:03<08:15,  1.75it/s][A
Eval:   1%|          | 7/876 [00:03<08:15,  1.75it/s][A
Eval:   1%|          | 8/876 [00:04<08:14,  1.76it/s][A
Eval:   1%|          | 9/876 [00:05<08:13,  1.76it/s][A
Eval:   1%|          | 10/876 [00:05<08:13,  1.76it/s][A
Eval:   1%|▏         | 11/876 [00:06<08:14,  1.75it/s][A
Eval:   1%|▏         | 12/876 [00:06<08:14,  1.75it/s][A
Eval:   1%|▏         | 13/876 [00:07<08:14,  1.75it/s][A
Eval:   2%|▏         | 14/876 [00:07<08:12,  1.75it/s][A
Eval:   2%|▏         | 15/876 [00:08<08:11,  1.75it/s][A
Eval:   2%|▏         | 16/876 [00:09<08:11,  1.75it/s][A
Eval:   2%|▏         | 17/876 [

=== Eval: loss === 2.313496868904323
=== Eval: acc. === 0.09999286275069588
=== Eval: f1 === 0.01818063846353491



Iteration:   0%|          | 0/6130 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/6130 [00:01<2:58:15,  1.75s/it][A
Iteration:   0%|          | 2/6130 [00:03<2:59:27,  1.76s/it][A
Iteration:   0%|          | 3/6130 [00:05<2:59:04,  1.75s/it][A
Iteration:   0%|          | 4/6130 [00:07<2:58:57,  1.75s/it][A
Iteration:   0%|          | 5/6130 [00:08<2:59:14,  1.76s/it][A
Iteration:   0%|          | 6/6130 [00:10<2:58:56,  1.75s/it][A
Iteration:   0%|          | 7/6130 [00:12<2:58:39,  1.75s/it][A
Iteration:   0%|          | 8/6130 [00:14<2:58:35,  1.75s/it][A
Iteration:   0%|          | 9/6130 [00:15<2:58:50,  1.75s/it][A
Iteration:   0%|          | 10/6130 [00:17<2:58:41,  1.75s/it][A
Iteration:   0%|          | 11/6130 [00:19<2:58:37,  1.75s/it][A
Iteration:   0%|          | 12/6130 [00:21<2:58:54,  1.75s/it][A
Iteration:   0%|          | 13/6130 [00:22<2:58:47,  1.75s/it][A
Iteration:   0%|          | 14/6130 [00:24<2:58:36,  1.75s/it][A
Iteration:   0%|          | 1

**Final** **Test**

In [None]:
#Using Test
# Score the held-out test split (test=True); the metrics are printed and the returned tuple is discarded.
_ = evaluate(True)

Eval: 100%|██████████| 469/469 [17:14<00:00,  2.21s/it]

=== Test: loss === 0.9508065186075564
=== Test: acc. === 0.7295833333333334
=== Test: f1 === 0.7406745598779096





# **Plots**

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
def Plotting(trainLA,validationLA):
    """Show two Plotly line charts: loss per epoch and accuracy per epoch.

    Both arguments are 2-D arrays with one row per epoch; column 0 is read as the
    loss and column 1 as the accuracy. Each chart draws the validation curve first
    and the training curve second. Nothing is returned.
    """
    def _curve_figure(title, y_title, curves):
        # One figure with the given (name, values) curves on the primary y-axis.
        figure = make_subplots(specs=[[{"secondary_y": True}]])
        for curve_name, curve_values in curves:
            figure.add_trace(go.Scatter(y=curve_values, name=curve_name), secondary_y=False)
        figure.update_layout(title_text=title)
        figure.update_xaxes(title_text="Epoch")
        figure.update_yaxes(title_text=y_title)
        return figure

    loss_figure = _curve_figure(
        "Loss of Model",
        "Loss",
        [("Validation Loss", validationLA[:,0]), ("Training Loss", trainLA[:,0])],
    )
    accuracy_figure = _curve_figure(
        "Accuracy of Model",
        "Accuracy",
        [("Validation Accuracy", validationLA[:,1]), ("Train Accuracy", trainLA[:,1])],
    )
    loss_figure.show()
    accuracy_figure.show()
    return

## 30 Programming Languages with comments

In [None]:
# Per-epoch [loss, accuracy] histories of the run stored under 'results with'.
trainLA = np.load('/content/gdrive/MyDrive/MLA Project/model/results with/trainLoss_Accuracy.npy')
validationLA = np.load('/content/gdrive/MyDrive/MLA Project/model/results with/validationLoss_Accuracy.npy')

In [None]:
# Loss and accuracy curves for the histories loaded in the previous cell.
Plotting(trainLA,validationLA)

## 30 Programming Languages without comments

In [None]:
# Per-epoch [loss, accuracy] histories of the run stored under 'results without'.
trainLA = np.load('/content/gdrive/MyDrive/MLA Project/model/results without/trainLoss_Accuracy.npy')
validationLA = np.load('/content/gdrive/MyDrive/MLA Project/model/results without/validationLoss_Accuracy.npy')

In [None]:
# Loss and accuracy curves for the histories loaded in the previous cell.
Plotting(trainLA,validationLA)