In [1]:
# Import necessary libraries
from google.colab import drive
# Mount Google Drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
!pip install transformers torch pandas
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets


Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [6]:
import pandas as pd

# Sample data
convo_list = ['They do not!', 'They do to!', 'I hope so.', 'She okay?', "Let's go.", 'Wow', "Okay -- you're gonna need to learn how to lie.", 'No', None, 'Like my fear of wearing pastels?', None, 'What good stuff?', "I figured you'd get to the good stuff eventually.", 'Thank God!  If I had to hear one more story about your coiffure...', "Me.  This endless ...blonde babble. I'm like boring myself.", 'What crap?', 'do you listen to this crap?', 'No...', None, 'You always been this selfish?']

# Convert list to DataFrame
df = pd.DataFrame(convo_list, columns=['dialogue'])

# Drop rows with None or NaN values
df.dropna(inplace=True)

# Convert DataFrame back to list
cleaned_convo_list = df['dialogue'].tolist()


In [7]:
# Save cleaned conversation to a text file
with open('cleaned_convo.txt', 'w') as f:
    for line in cleaned_convo_list:
        f.write(line + '\n')


In [9]:
!pip install transformers torch pandas datasets accelerate




In [10]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

# Load pre-trained model and tokenizer
model_name = 'gpt2'  # Ensure this is the smallest GPT-2 model
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

# Load dataset
dataset = load_dataset('text', data_files={'train': 'cleaned_convo.txt'})

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Reduce batch size to 1
    gradient_accumulation_steps=8,  # Accumulate gradients over 8 steps
    fp16=True,  # Enable mixed precision training
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train']
)

# Train the model
trainer.train()


Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Step,Training Loss


TrainOutput(global_step=6, training_loss=11.474019368489584, metrics={'train_runtime': 172.0745, 'train_samples_per_second': 0.296, 'train_steps_per_second': 0.035, 'total_flos': 3135504384000.0, 'train_loss': 11.474019368489584, 'epoch': 2.8235294117647056})

In [12]:
import torch

# Load the fine-tuned model
model.eval()

# Generate text
input_text = "They do not!"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Create attention mask
attention_mask = torch.ones(input_ids.shape, device=input_ids.device)

# Generate text with attention mask and additional parameters to reduce repetition
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=50,
    num_return_sequences=1,
    no_repeat_ngram_size=2,  # Avoid repeating n-grams
    repetition_penalty=2.0,  # Penalize repetition
    pad_token_id=tokenizer.eos_token_id
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


They do not!
The only thing that is wrong with this story? The fact that the guy who was supposed to be a hero, and then got killed by an assassin. He's just like you guys are: he doesn't have any redeem
