<a href="https://colab.research.google.com/github/TheRealMatri/SEO-Testful/blob/main/Testful_for_Github.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries for Hugging Face Transformers, Datasets, JAX, and JAXLib
!pip install transformers datasets jax jaxlib

In [None]:
# Show what pip has recorded for the installed Hugging Face `transformers`
# package (including its version) to confirm the install step worked.
!pip show transformers

In [None]:
# If needed, upgrade transformers to the latest version
!pip install --upgrade transformers datasets jax jaxlib

In [None]:
# Import necessary libraries for model training
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset


In [None]:
# Load BOTH the tokenizer and the language-model weights from the same
# pre-trained checkpoint; the weights are requested in float32.
BASE_CHECKPOINT = 'ai-forever/rugpt3small_based_on_gpt2'

tokenizer = GPT2Tokenizer.from_pretrained(BASE_CHECKPOINT)
model = GPT2LMHeadModel.from_pretrained(
    BASE_CHECKPOINT,
    torch_dtype=torch.float32,
)

In [None]:
# Locations of the two JSON-lines files.
TRAIN_JSONL = '/content/dataset_train.jsonl'
TEST_JSONL = '/content/dataset_test.jsonl'

# Each file is loaded on its own through the generic 'json' loader. Note that
# split='train' is requested for the test file as well: the split name here
# refers to how the loader labels the data it read, not to the file's role.
train_dataset = load_dataset('json', data_files=TRAIN_JSONL, split='train')
test_dataset = load_dataset('json', data_files=TEST_JSONL, split='train')



In [None]:
# Mapping from every known category name to its integer class id.
label_map = {
    "Дети": 0,
    "Локализация": 1,
    "Эстетика": 2,
    "Акции": 3,
    "Пенсионеры": 4,
    "Технологии": 5,
    "Ортодонтия": 6,
    "Экстренная помощь": 7,
}


def encode_labels(example):
    """Replace the example's 'label' category name with its integer id.

    The function is idempotent: a label that is already one of the integer ids
    in `label_map` is left as it is. Without this, re-running the cell that
    maps this function over a dataset would look the integers up as if they
    were category names and silently turn every label into -1.

    Args:
        example: A single dataset record (dict-like) with a 'label' entry.

    Returns:
        The same record, modified in place, with 'label' set to the integer id
        of the category, or -1 when the label is not recognised.

    Raises:
        KeyError: If the record has no 'label' entry.
    """
    raw_label = example['label']

    # bool is a subclass of int, so exclude it explicitly from the
    # "already encoded" path.
    already_encoded = isinstance(raw_label, int) and not isinstance(raw_label, bool)

    if already_encoded:
        # Keep valid ids untouched; anything outside the known ids is unknown.
        example['label'] = raw_label if raw_label in label_map.values() else -1
    else:
        example['label'] = label_map.get(raw_label, -1)  # -1 marks an unknown category
    return example

# Swap the category names for integer ids in both splits.
train_dataset = train_dataset.map(encode_labels)
test_dataset = test_dataset.map(encode_labels)

# Sanity check: print the first record of each split, train first, then test.
for encoded_split in (train_dataset, test_dataset):
    print(encoded_split[0])



In [None]:
import os

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Resume from a previously saved fine-tuned model when one is available.
# On a fresh runtime './fine_tuned_model' does not exist yet, and loading from
# it unconditionally would abort a top-to-bottom run of the notebook. When
# nothing has been saved, the base model and tokenizer loaded earlier in the
# notebook simply stay in use.
FINE_TUNED_DIR = './fine_tuned_model'

if os.path.isfile(os.path.join(FINE_TUNED_DIR, 'config.json')):
    model = GPT2LMHeadModel.from_pretrained(FINE_TUNED_DIR)
    print(f"Loaded fine-tuned model from {FINE_TUNED_DIR}")
else:
    print(f"No saved model found in {FINE_TUNED_DIR}; keeping the current model.")

# The tokenizer is written to this folder by its own `save_pretrained` call
# later in the notebook, so its presence is checked independently of the model.
if os.path.isfile(os.path.join(FINE_TUNED_DIR, 'vocab.json')):
    tokenizer = GPT2Tokenizer.from_pretrained(FINE_TUNED_DIR)


In [None]:
def tokenize_function(examples):
    """Tokenize a batch of prompt/completion pairs for causal-LM fine-tuning.

    Each prompt is joined to its completion with a newline, the joined texts
    are tokenized with truncation and padding enabled, and a `labels` entry is
    attached to the tokenizer output.

    Args:
        examples: A batch with parallel 'prompt' and 'completion' sequences of
            strings (what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output with an extra 'labels' tensor: a copy of
        'input_ids' in which every padding position is replaced by -100.
    """
    # One training text per record: prompt, newline separator, completion.
    inputs = [
        prompt + "\n" + completion
        for prompt, completion in zip(examples['prompt'], examples['completion'])
    ]

    # NOTE(review): padding=True pads only to the longest text of the batch this
    # function is called with. If the dataset spans several map batches, rows
    # can end up with different lengths — confirm the collator used for
    # training can handle that.
    model_inputs = tokenizer(inputs, truncation=True, padding=True, return_tensors="pt")

    # Labels are an unshifted copy of the input ids; no shifting is done here.
    labels = model_inputs['input_ids'].detach().clone()

    # -100 is the index the loss ignores. Masking the padded positions (where
    # the attention mask is 0) keeps the model from being trained to predict
    # padding tokens.
    labels[model_inputs['attention_mask'] == 0] = -100

    model_inputs['labels'] = labels
    return model_inputs

# Tokenize both splits, train first and then test; batched=True hands
# `tokenize_function` many records per call instead of one.
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)


In [None]:
# Training configuration, sized for a single GPU (about 14GB VRAM, 12GB RAM).
training_args = TrainingArguments(
    output_dir='./results',                # Where checkpoints are written
    logging_dir='./logs',                  # Where logs are written
    logging_steps=500,                     # Log every 500 steps
    save_steps=5000,                       # Save a checkpoint every 5000 steps
    save_total_limit=3,                    # Keep only the 3 most recent checkpoints
    per_device_train_batch_size=48,        # Training batch size per device
    per_device_eval_batch_size=48,         # Evaluation batch size per device
    weight_decay=0.01,                     # Strength of weight decay
    num_train_epochs=3,                    # Number of epochs
    warmup_steps=500,                      # Warmup steps for the learning-rate scheduler
    logging_first_step=True,               # Also log the very first step
    load_best_model_at_end=True,           # Reload the best checkpoint when training finishes
    metric_for_best_model='loss',          # "Best" is decided by the loss
    save_strategy="steps",                 # Save on a step schedule (see save_steps)
    eval_strategy="steps",                 # Evaluate on a step schedule (see eval_steps)
    eval_steps=200,                        # Evaluate every 200 steps
    dataloader_num_workers=2,              # Worker processes for data loading
    # Mixed precision needs a CUDA device; hard-coding True makes this cell
    # fail on a CPU-only runtime, so enable it only when a GPU is present.
    fp16=torch.cuda.is_available(),
    learning_rate=5e-5,                    # Learning rate
    report_to="wandb",                     # Send metrics to Weights & Biases (change if not wanted)
)

In [None]:
# Bundle the model, the training configuration and both tokenized splits into
# a Trainer. The test split is passed as the evaluation dataset.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)



In [None]:
import os
import time

from transformers import TrainerCallback
from transformers.trainer_utils import get_last_checkpoint

# Wall-clock budget for this training session, in seconds (10 minutes).
TRAINING_TIME_LIMIT = 60*10

# Becomes True once the time budget has been used up.
stop_training = False


class TimeLimitCallback(TrainerCallback):
    """Ask the Trainer to stop once a wall-clock budget is exhausted.

    A flag flipped by a background timer is only looked at between complete
    `trainer.train()` calls, so it can never interrupt a run that is in
    progress. This callback instead checks the clock after every training step
    and requests a clean stop through the Trainer's control object.
    """

    def __init__(self, limit_seconds):
        self.limit_seconds = limit_seconds
        self.deadline = None  # set when training actually begins

    def on_train_begin(self, args, state, control, **kwargs):
        # Start the clock when training starts, not when the cell is defined.
        self.deadline = time.monotonic() + self.limit_seconds

    def on_step_end(self, args, state, control, **kwargs):
        global stop_training
        if self.deadline is not None and time.monotonic() >= self.deadline:
            stop_training = True
            control.should_training_stop = True
            print(f"⏰ Time's up! Stopping training after {self.limit_seconds // 60} minutes.")
        return control


# Resume only when a checkpoint is actually present: passing
# resume_from_checkpoint=True with an empty output directory raises an error,
# which would break the very first run.
checkpoint_dir = trainer.args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None

time_limit_callback = TimeLimitCallback(TRAINING_TIME_LIMIT)
trainer.add_callback(time_limit_callback)
try:
    trainer.train(resume_from_checkpoint=last_checkpoint)
finally:
    # Detach the callback so re-running this cell does not stack up copies.
    trainer.remove_callback(time_limit_callback)

trainer.save_model("./fine_tuned_model")

print("✅ Training session ended.")



In [None]:
# Run an evaluation pass. Called without arguments, `evaluate()` uses the
# evaluation dataset the trainer was constructed with (the test split).
eval_results = trainer.evaluate()

# Print the evaluation results returned above.
print(eval_results)

In [None]:
# Write the model and the tokenizer into the same folder so that both can be
# loaded back from it with `from_pretrained`.
SAVE_DIR = "./fine_tuned_model"

model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

In [None]:
from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel

# Read the fine-tuned weights and tokenizer back from disk.
model_path = "./fine_tuned_model"
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Wrap both in a text-generation pipeline.
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Prompt to complete (edit this to try other requests).
prompt = "Сгенерируй рекламу для стамоталогии Smile предлагающей очистку зубов со скидкой 20%"

# Sampling settings, kept together so they are easy to tune.
sampling_options = dict(
    max_length=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.8,
)

# Generate one continuation and print its text.
output = generator(prompt, **sampling_options)
print(output[0]['generated_text'])
