In [None]:
!pip install sentence-transformers

In [None]:
!pip install transformers
!pip install pandas
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
import os,json
import pandas as pd
import torch
from torch.utils.data import DataLoader, RandomSampler
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import urlextract
import re


In [None]:
# Shared URL finder; check_usable calls its find_urls() to locate links in a docstring.
extractor = urlextract.URLExtract()

In [None]:
def check_diff_lang(string):
    """Tell whether ``string`` contains at least one non-ASCII character.

    Returns True as soon as a character with a code point above 127 is
    found, and False otherwise (including for an empty string).
    """
    for symbol in string:
        if ord(symbol) > 127:
            return True
    return False

def check_usable(data):
    """Decide whether a docstring sentence must be dropped from the dataset.

    Despite its name, this returns True when the text should be *skipped*
    and False when it can be kept.

    The text is skipped when any of the following holds:
      * it is None, empty, or whitespace-only;
      * the raw text has at most two space-separated words and, once URLs
        have been removed, one of the remaining words contains a digit;
      * after URL removal it still contains a non-ASCII character.

    Relies on the module-level ``extractor`` (URL finder) and on
    ``check_diff_lang``.

    Args:
        data: candidate text, normally the first line of a docstring.

    Returns:
        bool: True to skip the text, False to keep it.
    """
    # The previous emptiness test compared the *list* returned by split()
    # against None / "", which can never be true, so blank sentences were
    # silently accepted. Test the text itself, up front.
    if data is None or not data.strip():
        return True

    # The word count is taken on the raw text, before URLs are stripped.
    length = len(data.split(" "))

    # Remove every detected URL so its characters do not trigger the
    # digit / non-ASCII filters below.
    for url in extractor.find_urls(data):
        data = data.replace(url, '')

    # Very short sentences that carry a number are rejected.
    if length <= 2:
        for word in data.split(" "):
            if any(char.isdigit() for char in word):
                # Trace of the rejected fragment (kept from the original).
                print(word," here ",data)
                return True

    return check_diff_lang(data)

def read_files(path,file):
    """Append the usable docstring sentences found under ``path`` to ``file``.

    Every ``*.jsonl`` file in ``path`` (visited in sorted name order) is read
    line by line as JSON. For each record, the first line of its
    ``"docstring"`` value is taken, filtered through ``check_usable`` and, if
    kept, written to ``file`` on its own line after ``unicode_escape``
    escaping. A running total is printed after each input file.

    Blank lines and records without a string ``"docstring"`` are skipped
    instead of aborting the whole run.

    Args:
        path: directory containing the ``.jsonl`` files.
        file: writable text file object receiving one sentence per line.

    Returns:
        int: number of sentences written for this directory.
    """
    count = 0
    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.jsonl'):
            continue
        file_path = os.path.join(path, filename)
        # Read as UTF-8 explicitly: the output file is written as UTF-8 and
        # the platform default encoding is not guaranteed to match.
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank (e.g. trailing) line is not a JSON document.
                if not line.strip():
                    continue
                docstring = json.loads(line).get("docstring")
                if not isinstance(docstring, str):
                    continue
                sentence = docstring.split("\n")[0]
                # check_usable returns True for sentences to be skipped.
                if check_usable(sentence):
                    continue
                # Escape backslashes/control characters so each sentence
                # stays on a single physical line of the output file.
                escaped = sentence.encode('unicode_escape').decode()
                file.write(escaped + "\n")
                count+=1
        print(file_path," ",count)
    return count
    
    

In [None]:

def prep_data(folder_paths,text_file):
    """Write the sentences of several folders into one text file.

    ``text_file`` is (re)created as UTF-8 and each folder in
    ``folder_paths`` is passed, in order, to ``read_files`` together with
    the open file.

    Returns:
        The sum of the counts returned by ``read_files`` (0 when
        ``folder_paths`` is empty).
    """
    with open(text_file, 'w', encoding='utf-8') as out_file:
        return sum(read_files(folder, out_file) for folder in folder_paths)
            
# Build one text file per split and report how many sentences each received.
# Only the Python corpus is used here; to include the Java corpus as well, add
# the matching r'./java/java/final/jsonl/<split>' folder to each folder list.
for label, folders, text_file in (
    ("Number of training dataset sentences = ", [r'./python/python/final/jsonl/train'], 'train_data.txt'),
    ("Number of test dataset sentences = ", [r'./python/python/final/jsonl/test'], 'test_data.txt'),
    ("Number of validation dataset sentences = ", [r'./python/python/final/jsonl/valid'], 'valid_data.txt'),
):
    print(label, prep_data(folders, text_file))



In [None]:
# Checkpoint to fine-tune. Replace with 'sentence-transformers/stsb-roberta-base'
# to start from that model instead.
checkpoint = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaForMaskedLM.from_pretrained(checkpoint)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Tokenize each prepared text file line by line with the model's tokenizer.
# The train and validation splits use a block size of 128; the test split uses 56.
train_dataset, valid_dataset, test_dataset = (
    LineByLineTextDataset(tokenizer=tokenizer, file_path=text_path, block_size=max_tokens)
    for text_path, max_tokens in (
        ("train_data.txt", 128),
        ("valid_data.txt", 128),
        ("test_data.txt", 56),
    )
)


In [None]:
import math

# Fine-tune on a reduced dataset: keep only the leading 15% of every split.
# NOTE: each name is reassigned from itself, so re-running this cell shrinks
# the already-reduced data again; reload the datasets before re-running.
keep_ratio = 0.15
train_dataset = train_dataset[:math.floor(len(train_dataset) * keep_ratio)]
valid_dataset = valid_dataset[:math.floor(len(valid_dataset) * keep_ratio)]
test_dataset = test_dataset[:math.floor(len(test_dataset) * keep_ratio)]
print(len(train_dataset), len(valid_dataset), len(test_dataset))

In [None]:
# Data collator for the masked-language-modelling task: 15% of the tokens of
# every batch are selected for masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Training configuration: 2 epochs, batches of 16, evaluation every 10000
# steps and a checkpoint every 50000 steps; the best checkpoint is reloaded
# when training ends.
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_steps=10000,
    save_steps=50000,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_accumulation_steps=3,
    load_best_model_at_end=True,
)

# Show GPU allocator statistics before training. Querying them requires a
# CUDA device, so skip the report on CPU-only machines (the notebook supports
# a CPU fallback). print() renders the multi-line table instead of an escaped
# one-line repr.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [None]:
# Show which device (GPU or CPU) was selected.
print(device)

In [None]:
# Train the model.
def compute_mlm_accuracy(p):
    """Token accuracy over the masked positions only.

    The MLM data collator labels every position that was not selected for
    masking with -100 (padding is also -100), and no predicted token id can
    equal -100. Comparing all positions therefore counts every unmasked token
    as an error and reports a far too low accuracy. Only positions whose
    label is not -100 are scored here.

    Args:
        p: evaluation prediction with ``predictions`` (logits) and
            ``label_ids`` arrays.

    Returns:
        dict: ``{"accuracy": float}``; 0.0 when no position is scored.
    """
    predicted_ids = p.predictions.argmax(axis=-1)
    scored = p.label_ids != -100
    if not scored.any():
        return {"accuracy": 0.0}
    return {"accuracy": (predicted_ids[scored] == p.label_ids[scored]).mean().item()}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    eval_dataset=valid_dataset,
    compute_metrics=compute_mlm_accuracy,
)

trainer.model.to(device)
trainer.train()

# Persist the fine-tuned weights for the separate evaluation script.
model.save_pretrained('./fin_modelmlm')


In [None]:
# Caution: evaluating on the test split makes the system try to allocate a
# large amount of memory; if the allocation fails, an error is thrown.
# A failure here is acceptable: the idea is to fine-tune for a few steps and
# save the model, then run the main evaluation in the Evaluation script.
del train_dataset, valid_dataset
torch.cuda.empty_cache()
eval_result = trainer.evaluate(test_dataset)

print(eval_result)