In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
# Placeholder path to the Excel workbook holding the training articles; replace "...." before running.
data_path = "...." 
# Load the workbook into a DataFrame. Later cells read the 'Titlu', 'Lead' and 'Continut' columns from it.
data = pd.read_excel(data_path)


In [3]:

# Build one training document per row: title, lead and body joined by a literal " [SEP] " marker.
# Empty spreadsheet cells are read as NaN, and str + NaN evaluates to NaN (a float), which would
# later make "\n".join(...) fail with a TypeError. Replace missing parts with "" and coerce every
# part to str before concatenating.
text_parts = data[['Titlu', 'Lead', 'Continut']].fillna('').astype(str)
data['text'] = text_parts['Titlu'] + " [SEP] " + text_parts['Lead'] + " [SEP] " + text_parts['Continut']

# Plain Python list of documents, consumed by the train/validation split.
texts = data['text'].tolist()

In [4]:

# Hold out 10% of the documents for validation.
# random_state pins the shuffle so that Restart & Run All reproduces the same train/validation
# partition (and therefore comparable training runs).
train_texts, val_texts = train_test_split(texts, test_size=0.1, random_state=42)


In [5]:
# Dump each split to a UTF-8 text file (one document per "\n"-joined entry); these files are
# what TextDataset reads in the pre-processing cell.
# (A stray bare `t` expression that preceded this code raised NameError on a fresh kernel and
# has been removed.)
with open('train_texts.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(train_texts))
with open('val_texts.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(val_texts))


In [6]:
# Load the tokenizer that ships with the pretrained 'gpt2' checkpoint.
# It is used below to tokenize the train/validation text files and by the data collator.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


In [None]:
# Pre-processing: tokenize each split file into blocks of 128 tokens.
# Both splits are built with identical settings, so construct them in a single pass
# (train first, then validation).
train_dataset, val_dataset = (
    TextDataset(tokenizer=tokenizer, file_path=split_file, block_size=128)
    for split_file in ("train_texts.txt", "val_texts.txt")
)

# mlm=False selects causal language-modeling batches (no masked-token corruption),
# which is the objective GPT-2 is trained with.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)


In [8]:

# Load the pretrained 'gpt2' checkpoint with its language-modeling head; this is the model handed to the Trainer below.
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [9]:

# Hyper-parameters and checkpoint policy for the fine-tuning run.
# NOTE(review): no evaluation schedule is configured here, so the eval batch size presumably
# only matters if evaluation is triggered explicitly (e.g. trainer.evaluate()) — confirm.
training_args = TrainingArguments(
    # --- where results go ---
    output_dir='./...',            # placeholder: folder receiving checkpoints and other outputs
    overwrite_output_dir=True,     # reuse the folder even if it already has content
    # --- checkpointing ---
    save_steps=10_000,             # write a checkpoint every 10,000 optimizer steps
    save_total_limit=2,            # retain at most the 2 newest checkpoints
    # --- optimisation schedule ---
    num_train_epochs=15,           # full passes over the training set
    per_device_train_batch_size=4, # training samples per device per step
    per_device_eval_batch_size=4,  # evaluation samples per device per step
    # --- evaluation output ---
    prediction_loss_only=True,     # evaluation/prediction reports just the loss
)


In [10]:

# Wire the model, its training configuration, the two datasets and the batch collator
# into a single Trainer object that drives the fine-tuning loop.
trainer = Trainer(
    args=training_args,           # hyper-parameters and checkpoint policy
    model=model,                  # GPT-2 LM-head model being fine-tuned
    train_dataset=train_dataset,  # token blocks from train_texts.txt
    eval_dataset=val_dataset,     # token blocks from val_texts.txt
    data_collator=data_collator,  # causal-LM batching (mlm=False)
)


In [None]:

# Start fine-tuning with the Trainer built above (model, training arguments, datasets and collator).
trainer.train()

In [12]:

# Placeholder: directory the fine-tuned model and tokenizer are saved to and later reloaded from; replace './...' before running.
model_path = './...' # the path for saving the model

In [None]:

# Persist the fine-tuned model and the tokenizer into the same directory so that both can be
# reloaded together via from_pretrained(model_path) in the generation cell.
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


In [None]:

# Reload the fine-tuned artifacts from disk so this cell exercises exactly what was saved.
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)


prompt_text = "Lege noua financiara"  # Prompt that seeds the generated news article

# Calling the tokenizer (rather than .encode) returns the attention mask alongside the input
# ids, so generate() does not have to guess which positions are real tokens.
encoded_prompt = tokenizer(prompt_text, return_tensors='pt')
input_ids = encoded_prompt['input_ids']

# Deterministic beam search. The former `temperature=0.7` argument was dropped: temperature is
# only applied when do_sample=True, so it was silently ignored here (and newer transformers
# versions warn about it). To get sampled output instead, pass do_sample=True together with
# a temperature.
generated_outputs = model.generate(
    input_ids,
    attention_mask=encoded_prompt['attention_mask'],
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 defines no pad token; reuse EOS explicitly to avoid the fallback warning
    max_length=600,  # Upper bound on total sequence length in tokens (prompt included)
    num_beams=2,  # Beam search keeping the 2 most likely hypotheses at each step
    no_repeat_ngram_size=2,  # Never emit the same pair of consecutive tokens twice
    early_stopping=True,  # Stop once enough finished beam hypotheses have been found
    num_return_sequences=2,  # Return both beams (must not exceed num_beams)
)

# Only the first (highest-scoring) returned sequence is shown.
generated_text = tokenizer.decode(generated_outputs[0], skip_special_tokens=True)
print("Text generat:")
print(generated_text)
