In [1]:
import warnings
import os
warnings.filterwarnings("ignore")

import torch
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Report the selected device (fixes the "Usuing" typo in the message).
print(f'Using device: {device}')

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer
from transformers import TextDataset, DataCollatorForLanguageModeling

Usuing device: cuda


In [2]:
# Run the `nvidia-smi` shell command (IPython `!` escape) to show the GPU/driver status.
!nvidia-smi

Sat Apr 20 18:57:01 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 552.22                 Driver Version: 552.22         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 Ti   WDDM  |   00000000:01:00.0  On |                  N/A |
| 30%   29C    P8             15W /  225W |    7666MiB /   8192MiB |      6%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
def contar_caracteres(archivo, max_block_size=128):
    """Return a block size for ``archivo``, capped at ``max_block_size``.

    Despite the name, this counts whitespace-separated words (``str.split``),
    not characters.

    Args:
        archivo: Path of a UTF-8 text file to read.
        max_block_size: Upper bound for the returned value.

    Returns:
        The smaller of the file's word count and ``max_block_size``.
        An empty (or whitespace-only) file yields 0.
    """
    with open(archivo, 'r', encoding='utf-8') as handle:
        n_words = len(handle.read().split())

    # Equivalent to min(n_words, max_block_size), spelled out explicitly.
    return max_block_size if max_block_size < n_words else n_words

def load_dataset(file_path, tokenizer):
    """Build a ``TextDataset`` for one text file.

    The block size is derived from the file itself via ``contar_caracteres``
    (word count capped at that function's default maximum) and is printed
    for visibility.

    Args:
        file_path: Path of the UTF-8 text file to load.
        tokenizer: Tokenizer handed to ``TextDataset``.

    Returns:
        The constructed ``TextDataset``.
    """
    # Compute the block size once: the helper re-reads and re-splits the whole
    # file on every call, so calling it again just to print the value is wasteful.
    block_size = contar_caracteres(file_path)
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
        cache_dir=None,
    )
    print(block_size)
    return dataset

def load_data_collator(tokenizer, mlm = False):
    """Create the ``DataCollatorForLanguageModeling`` used for batching.

    Args:
        tokenizer: Tokenizer forwarded to the collator.
        mlm: Forwarded as the collator's ``mlm`` flag; defaults to ``False``.

    Returns:
        The configured ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [6]:
def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs,
          logging_dir="C:/Users/User/DiTransformer-Code/logs"):
    """Train a GPT-2 LM-head model on one text file and save it to ``output_dir``.

    Args:
        train_file_path: Path of the text file used as training data.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory where the tokenizer, the initial model, and the
            trained model are written.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        logging_dir: Forwarded to ``TrainingArguments``. The default is the
            path that was previously hard-coded, so existing callers are
            unaffected.

    Returns:
        None. The results are the files written to ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # A tokenizer with added special tokens can have more entries than the
    # model has embedding rows; any such token id in the training data would
    # index out of range. Grow the embedding matrix when that is the case.
    if len(tokenizer) > model.get_input_embeddings().num_embeddings:
        model.resize_token_embeddings(len(tokenizer))

    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        logging_dir=logging_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [6]:
# GPT-2 checkpoint sizes, for reference:
#   GPT-2 Small  ('gpt2'):        124 million parameters.
#   GPT-2 Medium ('gpt2-medium'): 345 million parameters.
#   GPT-2 Large  ('gpt2-large'):  774 million parameters.
#   GPT-2 XL     ('gpt2-xl'):     1.5 billion parameters.
# Special-token keyword arguments used when the tokenizer is created:
#   bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>'

# Name or path that train() passes to `from_pretrained` for tokenizer and model.
# NOTE(review): presumably the same directory as `output_dir` when the notebook
# runs from the project root — confirm the working directory.
model_name = 'Models'
# Directory where the tokenizer and model are saved.
output_dir = 'C:/Users/User/DiTransformer-Code/Models'
# The remaining values are passed to train() by the training loop.
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 1.0
# NOTE(review): `save_steps` is not referenced anywhere else in the visible
# code, so it currently has no effect on training.
save_steps = 50000

In [7]:
# Create the starting tokenizer/model pair from 'gpt2-medium' and store both in
# `output_dir`, so later cells can load them from disk.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
tokenizer.save_pretrained(output_dir)
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
# The tokenizer above registers special tokens beyond the pretrained
# vocabulary; resize the embeddings so the saved model and tokenizer agree on
# the vocabulary size (otherwise the new token ids have no embedding row).
model.resize_token_embeddings(len(tokenizer))
model.save_pretrained(output_dir)

In [None]:
# Train on every non-empty .txt file found one directory level below `path`.
path = "Data/Dato_Bloques/Bloque 1/Code"
# Only descend into sub-directories: calling os.listdir() on a stray plain file
# would raise NotADirectoryError.
archivos = [archivo for archivo in os.listdir(path) if os.path.isdir(f"{path}/{archivo}")]
for archivo in archivos:
    print(archivo)
    # TextDataset (cache_dir=None) writes its feature cache next to the source
    # file as "cached_lm_<Tokenizer>_<block>_<name>.txt". Skip those binary
    # caches so a re-run does not try to read them as training text.
    archivos_txt = [
        nombre for nombre in os.listdir(f"{path}/{archivo}")
        if nombre.endswith('.txt') and not nombre.startswith('cached_lm_')
    ]
    for archivo_txt in archivos_txt:
        ruta_txt = f"{path}/{archivo}/{archivo_txt}"
        # A result of 0 means the file has no words; skip it.
        if contar_caracteres(ruta_txt) != 0:
            print(archivo_txt)
            train(ruta_txt,model_name,output_dir,overwrite_output_dir,per_device_train_batch_size,num_train_epochs)

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the tokenizer and model from the local 'Models' directory; generate()
# below uses these two module-level objects.
# NOTE(review): presumably these are the weights written by the training cells
# above — confirm that 'Models' resolves to the training `output_dir`.
tokenizer = GPT2Tokenizer.from_pretrained('Models')
model = GPT2LMHeadModel.from_pretrained('Models')

def generate(code, max_length=50):
    """Sample a continuation of ``code`` from the module-level ``model``.

    Args:
        code: Prompt text to encode with the module-level ``tokenizer``.
        max_length: Passed to ``model.generate`` as ``max_length``.

    Returns:
        The decoded text of the first generated sequence.
    """
    prompt_ids = tokenizer.encode(code, return_tensors='pt')

    # Sampling configuration: top-k / nucleus sampling, with the EOS id used
    # as the padding id.
    sampling_options = dict(
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    sequences = model.generate(prompt_ids, **sampling_options)

    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence)

# Smoke test: sample a continuation for a short Python prompt and print it.
print(generate("import numpy as np"))