**POSITIVE WORDS GENERATOR**

In [None]:
pip install transformers datasets torch # Install prerequisites



In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import torch

file_path = "/positive.txt"  # Path to the lexicon file; usable lines hold two whitespace-separated fields: word and numeric score

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
### 1. Load and process the Dataset ###
def load_positive_words(file_path):
    """Collect the positively scored words from a lexicon file.

    The file is read as UTF-8. A line is used only when it consists of exactly
    two whitespace-separated fields, ``word`` and ``score``; any other line,
    or a line whose score is not a valid float, is skipped.

    Args:
        file_path: Path of the lexicon file to read.

    Returns:
        list[str]: Words whose score is strictly greater than zero, in file order.
    """
    kept = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            if len(fields) != 2:
                continue  # Not in "word score" form
            token, raw_score = fields
            try:
                value = float(raw_score)
            except ValueError:
                continue  # Score is not numeric
            if value > 0:
                kept.append(token)
    return kept
positive_words = load_positive_words(file_path)  # Load dataset from file

# Fail fast with a clear message: an empty word list (wrong path contents,
# unexpected file format) would otherwise only surface further down the pipeline.
if not positive_words:
    raise ValueError(f"No positively scored words were loaded from {file_path!r}")

# Wrap the words in a single-column ("text") dataset
dataset = Dataset.from_dict({"text": positive_words})

In [None]:
### 2. Load GPT-2 Tokenizer & Model ###
base_checkpoint = "gpt2"  # Pretrained checkpoint shared by the tokenizer and the model

tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): pad and EOS now share one id; if the training collator masks
# padding out of the labels it would mask EOS as well — confirm the model can
# still learn where a word ends.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(base_checkpoint)

In [None]:
### 3. Tokenize the Data ###
def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the module-level tokenizer.

    Each entry is padded or truncated to a fixed length of 10 tokens.

    Args:
        examples: Mapping that provides the texts under the ``"text"`` key.

    Returns:
        The tokenizer's output for those texts.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=10,
    )

# Run the tokenizer over the whole dataset, passing it batches of examples
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Collator that assembles training batches. mlm=False turns off masked language
# modeling, since the goal here is text generation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map:   0%|          | 0/3059 [00:00<?, ? examples/s]

In [None]:
### 4. Fine-Tune GPT-2 ###
from sklearn.model_selection import train_test_split # Used to split the data into training data and evaluation data
import numpy as np

# train_test_split shuffles on its own (shuffle=True is its default), so the
# alphabetical order of the word list does not bias the split. A separate,
# unseeded np.random.permutation beforehand would only make the split differ
# from run to run and defeat random_state, so plain ordered indices are used.
all_indices = np.arange(len(tokenized_datasets))

# Split the indices into training and evaluation indices
train_indices, eval_indices = train_test_split(
    all_indices,
    test_size=0.2,  # Use 20% of the data for evaluation
    random_state=42,  # Fixed seed: identical split on every run, fair comparisons when tuning hyperparameters
)

# Select the data based on the split indices
train_dataset = tokenized_datasets.select(train_indices)
eval_dataset = tokenized_datasets.select(eval_indices)

# Sanity check: every example landed in exactly one of the two subsets
assert len(train_dataset) + len(eval_dataset) == len(tokenized_datasets)

In [None]:
# The validation loss can start rising while the training loss keeps falling
# (overfitting), so a checkpoint is written after every epoch and the one with
# the lowest validation loss is restored into `model` when training finishes.
training_args = TrainingArguments(
    output_dir="./gpt2-positive-words",  # Directory to save checkpoints
    # NOTE(review): recent transformers releases name this argument `eval_strategy` — confirm against the installed version
    evaluation_strategy="epoch",  # Evaluate on eval_dataset after every epoch
    save_strategy="epoch",  # Checkpoint on the same schedule as evaluation (needed to pick the best epoch)
    load_best_model_at_end=True,  # Reload the best checkpoint instead of keeping the last epoch's weights
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # Lower validation loss is better
    save_total_limit=2,  # Bound disk usage from per-epoch checkpoints
    learning_rate=5e-5,  # Learning rate hyperparameter
    per_device_train_batch_size=8,  # Batch size for training
    num_train_epochs=5,  # Upper bound; the best epoch is selected by validation loss
    weight_decay=0.01,  # Regularization hyperparameter
    logging_dir="./logs",
    report_to="none",
)

trainer = Trainer(  # Trainer to train the model
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()  # Start training

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,4.340729
2,3.025100,4.090266
3,3.025100,4.095213
4,2.135700,4.245148
5,1.830000,4.300557


TrainOutput(global_step=1530, training_loss=2.320963985468048, metrics={'train_runtime': 4445.0814, 'train_samples_per_second': 2.752, 'train_steps_per_second': 0.344, 'total_flos': 62439609600000.0, 'train_loss': 2.320963985468048, 'epoch': 5.0})

In [None]:
### 5. Generate New Positive Words ###
def generate_positive_words(
    model,
    tokenizer,
    prompt="",
    max_new_tokens=3,
    num_return_sequences=5,
    temperature=1.5,
    top_p=0.9,
):
    """Sample candidate words from the model, each starting with ``prompt``.

    Args:
        model: Causal language model exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Starting characters of the words to generate. When empty, the
            tokenizer's BOS token (or EOS as a fallback) seeds the generation.
        max_new_tokens: Number of tokens to generate after the prompt. The
            default of 3 equals the former ``max_length=4`` for a one-token prompt.
        num_return_sequences: How many samples to return.
        temperature: Sampling temperature; higher values give more randomness.
        top_p: Nucleus-sampling probability threshold.

    Returns:
        list[str]: The decoded samples with special tokens removed.
    """
    model.eval()  # Set model to evaluation mode

    # An empty prompt would tokenize to zero tokens and give generate() nothing
    # to continue from, so seed with a special token instead.
    seed_text = prompt or tokenizer.bos_token or tokenizer.eos_token

    # Calling the tokenizer (rather than .encode) also yields the attention mask.
    encoded = tokenizer(seed_text, return_tensors="pt")
    # Inputs must live on the same device as the model (e.g. the GPU after training).
    device = model.device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        pad_token_id=pad_token_id,  # Passed explicitly instead of relying on generate()'s fallback
        max_new_tokens=max_new_tokens,  # Budget excludes the prompt, so long prompts still get new tokens
        num_return_sequences=num_return_sequences,
        do_sample=True,  # Use sampling to choose words
        temperature=temperature,
        top_p=top_p,
    )

    return [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in output]

# Example: sample new words that start with the given prefix
seed_prefix = " "  # Put the starting characters for the generated words here
new_words = generate_positive_words(model, tokenizer, seed_prefix)
print("Generated Positive Words:", new_words)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Positive Words: [' elcomesomer', ' ighnessesome', ' italityet', ' iredally,', ' irturalist']
