<a href="https://colab.research.google.com/github/Tobb-e/machine-learning/blob/main/Training_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title # Connect Google Drive

# Mount Google Drive inside the Colab VM at /content/gdrive. Later cells read
# the CSV dataset from, and write the tokenizer/checkpoints/logs to, relative
# paths under gdrive/MyDrive/machine_learning.
# NOTE(review): those relative paths only resolve if the working directory is
# /content — confirm, or switch them to absolute paths.
from google.colab import drive

drive.mount('/content/gdrive')

In [None]:
# Set WANDB_LOG_MODEL for the kernel process. The training cell sets the same
# variable again via os.environ, with the stated intent of saving the trained
# model checkpoint to W&B. Keep this comment on its own line: text after the
# value on the %env line would become part of the value.
%env WANDB_LOG_MODEL=true

In [None]:
# Install the notebook's dependencies quietly (-q). Only huggingface_hub is
# version-pinned.
# NOTE(review): transformers is unpinned, so pip may resolve a release that
# expects a newer huggingface_hub than 0.11.0 — confirm the pin is still wanted.
%pip install huggingface_hub==0.11.0 -q
%pip install transformers -q
%pip install scikit-learn -q
%pip install wandb -q
# wandb is imported only after the install above has run. The extension loaded
# next is presumably what provides the %%wandb cell magic used by the training
# cell. wandb.login() authenticates this session.
import wandb
%load_ext wandb
wandb.login()

In [None]:
%%time
%%wandb

import os
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments, TrainerCallback
from torch.optim import AdamW  # Import AdamW from torch.optim
from tqdm import tqdm

class WandbCallback(TrainerCallback):
    """Trainer callback that forwards each log event to the active W&B run."""

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """Send ``logs`` to W&B at the trainer's current global step.

        A missing or empty ``logs`` value is sent as an empty dict.
        """
        payload = {} if not logs else logs
        wandb.log(payload, step=state.global_step)

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of equally long, index-aligned tensors.

    ``encodings`` is any mapping from field name to a tensor whose first
    dimension indexes the examples. It must contain an ``"input_ids"`` entry,
    which defines the dataset length. Both a tokenizer ``BatchEncoding`` and a
    plain ``dict`` of tensors are accepted.
    """

    def __init__(self, encodings):
        """Store the encodings without copying them."""
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of detached tensor copies.

        Each field is cloned so that callers mutating a returned example
        cannot modify the stored encodings.
        """
        return {key: val[idx].clone().detach() for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples (rows of ``input_ids``)."""
        # Key access rather than attribute access: ``__getitem__`` already
        # treats ``encodings`` as a mapping, and a plain dict has no
        # ``input_ids`` attribute.
        return len(self.encodings["input_ids"])

# Hyperparameters are defined once so that the values recorded in the W&B run
# config are the ones actually used by the optimizer and the Trainer.
LEARNING_RATE = 2e-5
BATCH_SIZE = 32
NUM_TRAIN_EPOCHS = 3
WEIGHT_DECAY = 0.01

# Set the W&B environment variables before the run is created so they are in
# place when wandb.init() and the Trainer's W&B integration read them.
os.environ["WANDB_NOTEBOOK_NAME"] = "./train_model_wandb.ipynb"
# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="true"

run = wandb.init(
    project="python_model",
    config={
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "dataset": "python_code",
    },
)

# Load the GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Load the GPT-2 model
model = AutoModelForCausalLM.from_pretrained("gpt2")

# GPT-2 ships without a padding token; add one so batches can be padded.
special_tokens_dict = {'pad_token': '[PAD]'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

# Grow the embedding matrix to cover the newly added token id.
model.resize_token_embeddings(len(tokenizer))

# Save the tokenizer so the extended vocabulary is stored next to the checkpoints
tokenizer.save_pretrained('gdrive/MyDrive/machine_learning/models')

# Load the dataset
data = pd.read_csv('gdrive/MyDrive/machine_learning/csv/python_code02.csv')

# Empty CSV cells are read as NaN, which the tokenizer cannot encode; drop them.
data = data.dropna(subset=['Snippet'])

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.25, random_state=42)

# Tokenize the training and validation datasets with a max_length.
# astype(str) guards against cells pandas parsed as numbers.
max_length = 512
train_encodings = tokenizer(train_data['Snippet'].astype(str).tolist(), truncation=True, padding=True, max_length=max_length, return_tensors='pt')
val_encodings = tokenizer(val_data['Snippet'].astype(str).tolist(), truncation=True, padding=True, max_length=max_length, return_tensors='pt')

# Create the custom dataset objects
train_dataset = CustomDataset(train_encodings)
val_dataset = CustomDataset(val_encodings)

# Causal-LM collator (mlm=False): labels are derived from the input ids.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up the training arguments
# NOTE(review): `evaluation_strategy` may have been renamed in newer
# transformers releases — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='gdrive/MyDrive/machine_learning/models',
    evaluation_strategy="steps",
    save_strategy="steps",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_checkpointing=True,
    num_train_epochs=NUM_TRAIN_EPOCHS,  # Number of complete passes through the training dataset
    weight_decay=WEIGHT_DECAY,
    push_to_hub=False,
    logging_dir='gdrive/MyDrive/machine_learning/logs',
    report_to="wandb",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=2,
    save_steps=1000,
    eval_steps=1000,
    # Mixed precision needs a CUDA device; fall back to fp32 when none is present.
    fp16=torch.cuda.is_available(),
)

# Create the Trainer object
# NOTE(review): report_to="wandb" already enables the Trainer's own W&B
# reporting, so the custom WandbCallback probably logs every event a second
# time — confirm whether both are wanted.
trainer = Trainer(
    callbacks=[WandbCallback()],
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    eval_dataset=val_dataset,
    # Explicit optimizer; None lets the Trainer build its default LR scheduler.
    optimizers=(AdamW(model.parameters(), lr=LEARNING_RATE), None),
)

try:
    # A single train() call already runs all NUM_TRAIN_EPOCHS epochs; calling
    # it once per epoch would repeat the whole schedule each time.
    train_result = trainer.train()

    # Log a final summary from values the Trainer actually returns.
    eval_metrics = trainer.evaluate()
    wandb.log({
        "steps": train_result.global_step,
        "loss": train_result.training_loss,
        "eval": eval_metrics["eval_loss"],
    })
finally:
    # Finish the wandb run even if training fails; necessary in notebooks.
    wandb.finish()