In [None]:
!pip install -q transformers datasets accelerate peft sentencepiece

from datasets import load_dataset


ds = load_dataset("Programmer-RD-AI/genz-slang-pairs-1k")

print(ds)
print(ds["train"][0])



README.md: 0.00B [00:00, ?B/s]

genz_dataset.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1005 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['normal', 'gen_z'],
        num_rows: 1005
    })
})
{'normal': "I'm really tired today, I think I need some rest.", 'gen_z': "I'm totally drained today, need to catch some Z's."}


Split train/validation:



In [None]:
from datasets import DatasetDict

# Hold out 10% of the rows; the fixed seed keeps the split the same on every run.
raw_ds = ds["train"].train_test_split(test_size=0.1, seed=42)
# train_test_split calls the held-out part "test"; expose it as "validation".
dataset = DatasetDict({
    "train": raw_ds["train"],
    "validation": raw_ds["test"]
})


In [None]:
import random

# Very simple Gen Z style emoji map.
# Keys are matched as lowercase substrings of the input text; each value lists
# the emojis one of which is picked at random when the key matches.
emoji_map = {
    "good": ["🔥", "✨", "🤌"],
    "great": ["🔥", "🤯"],
    "awesome": ["🔥", "😭"],
    "fun": ["🤣", "🥳"],
    "party": ["🥳", "🔥"],
    "love": ["❤️‍🔥", "🥺", "😍"],
    "sad": ["😭", "💔"],
    "cry": ["😭", "💀"],
    "angry": ["😡", "🤬"],
    "boring": ["💀", "🫠"],
    "tired": ["😪", "🫠"],
    "school": ["💀", "😭"],
    "exam": ["💀", "😭"],
    "college": ["😭", "🔥"],
    "dead": ["💀"],
}

# Picked from when no keyword of emoji_map occurs in the text.
fallback_emojis = ["😭", "🔥", "💀", "🤯", "✨", "😭✨"]

def add_emojis_to_text(text):
    """Append Gen Z style emojis to ``text``.

    One emoji is drawn at random for every ``emoji_map`` keyword that occurs
    in the lower-cased text (substring match, in dictionary order). If no
    keyword occurs, a single entry of ``fallback_emojis`` is drawn instead.

    Args:
        text: Sentence to decorate.

    Returns:
        ``text`` without surrounding whitespace, followed by one space and the
        chosen emojis joined by single spaces.
    """
    text_lower = text.lower()

    # keyword-based emojis
    used = [random.choice(ems) for k, ems in emoji_map.items() if k in text_lower]

    if not used:
        used.append(random.choice(fallback_emojis))

    # NOTE(review): the previous version tested whether `text` already held an
    # emoji, but both branches appended the emojis in exactly the same way, so
    # the test had no effect and is removed here without changing the result.
    # If already-decorated text was meant to be left alone, return
    # `text.strip()` early in that case instead.
    return text.strip() + " " + " ".join(used)


Tokenization for T5

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Checkpoint name used to load both the tokenizer and the seq2seq model.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
base_model = T5ForConditionalGeneration.from_pretrained(model_name)

# Text prepended to every source sentence in `preprocess`.
prefix = "translate to genz: "  # simple instruction prefix
# Token-length caps (applied with truncation) for sources and targets in `preprocess`.
max_source_len = 64
max_target_len = 64

def preprocess(examples):
    """Tokenize a batch of sentence pairs for seq2seq fine-tuning.

    Args:
        examples: Batched mapping with a ``"normal"`` list (source sentences)
            and a ``"gen_z"`` list (target sentences), as supplied by
            ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prefixed sources, with an extra
        ``"labels"`` entry holding the token ids of the targets. Sequences are
        truncated to ``max_source_len`` / ``max_target_len`` and left unpadded.
    """
    inputs = [prefix + s for s in examples["normal"]]
    targets = examples["gen_z"]

    model_inputs = tokenizer(
        inputs, max_length=max_source_len, truncation=True
    )

    # `text_target=` is the supported way to encode labels; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets, max_length=max_target_len, truncation=True
    )["input_ids"]

    model_inputs["labels"] = labels
    return model_inputs

# Tokenize both splits in batches and drop the raw text columns, so only the
# entries returned by `preprocess` remain.
tokenized_ds = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names
)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/904 [00:00<?, ? examples/s]



Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [None]:
from peft import LoraConfig, get_peft_model
import torch

# LoRA adapter settings: rank 16, alpha 32, 5% dropout, no bias training,
# applied to the modules named "q" and "v" for a seq2seq LM task.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],  # T5 attention projections
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)

# Wrap the base T5 model with the adapter and report the trainable-parameter share.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()


trainable params: 589,824 || all params: 61,096,448 || trainable%: 0.9654


Trainer setup:

In [None]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer
import torch

# Pad every batch to its longest sequence. The collator is given the
# PEFT-wrapped `model` (not `base_model`) so that the collator, the Trainer,
# inference and `save_pretrained` all work on the same object.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest"
)

training_args = TrainingArguments(
    output_dir="./t5-genz-lora",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-4,
    weight_decay=0.01,
    logging_steps=50,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to="none"
)

# Train the PEFT model returned by `get_peft_model`. Passing `base_model` here
# only worked because the adapter layers had been injected into it; using
# `model` makes the intent explicit and lets Trainer treat it as a PEFT model.
# `processing_class` replaces the deprecated `tokenizer` keyword of Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    processing_class=tokenizer,
    data_collator=data_collator
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
50,3.5785
100,2.8785
150,2.5633
200,2.36
250,2.2378
300,2.2306


TrainOutput(global_step=339, training_loss=2.59374394599667, metrics={'train_runtime': 497.9693, 'train_samples_per_second': 5.446, 'train_steps_per_second': 0.681, 'total_flos': 17703547699200.0, 'train_loss': 2.59374394599667, 'epoch': 3.0})

Inference function

In [None]:
import math
import random

# Emojis to choose from when the generated text has none of the known ones.
# Note: this rebinds the `fallback_emojis` name used by the emoji-augmentation cell.
fallback_emojis = ["😭", "🔥", "💀", "🤯", "✨", "😍", "🥳", "😡", "💔"]

def ensure_emoji(text: str) -> str:
    """Return ``text`` trimmed and carrying at least one known emoji.

    If any of the nine known emoji characters already occurs in ``text`` the
    trimmed text is returned unchanged; otherwise a space and one randomly
    chosen entry of ``fallback_emojis`` are appended to the trimmed text.
    """
    trimmed = text.strip()
    known_emojis = frozenset("😭🔥💀🤯✨😍🥳😡💔")

    # No known emoji anywhere in the text: add a random fallback.
    if known_emojis.isdisjoint(text):
        return f"{trimmed} {random.choice(fallback_emojis)}"

    # An emoji is already there, so keep the text as it is.
    return trimmed


def to_genz(
    text: str,
    max_new_tokens: int | None = None,
    temperature: float = 0.8,
    top_p: float = 0.9,
    mode: str = "medium",  # "light", "medium", "full"
):
    # Simple length-based control if not provided
    if max_new_tokens is None:
        base = max(12, len(text.split()) + 5)
        max_new_tokens = min(64, base)

    # Style intensity prompt
    if mode == "light":
        style_hint = "Rewrite in slightly Gen Z style, keep it polite, not too cringe, keep meaning same. Add at most one emoji.\n"
    elif mode == "full":
        style_hint = "Rewrite in EXTREME Gen Z slang, super dramatic, very casual, but keep the core meaning same. Use 1-2 emojis.\n"
    else:
        style_hint = "Rewrite in clear Gen Z slang, casual tone, keep meaning same, avoid offensive words. Add 1-2 emojis.\n"

    inp = style_hint + "Normal: " + text + "\nGen Z:"

    inputs = tokenizer(inp, return_tensors="pt").to(model.device)

    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    out = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Take only the part after "Gen Z:" if present
    if "Gen Z:" in out:
        out = out.split("Gen Z:", 1)[-1]
    out = out.strip()
    out = ensure_emoji(out)
    return out


In [None]:
!pip install -q gradio
import gradio as gr

def genz_interface(text, mode, temperature, max_len):
    """Gradio callback: coerce the widget values and delegate to ``to_genz``.

    ``max_len`` is converted to ``int`` and ``temperature`` to ``float``
    before the call; ``top_p`` is left at the default of ``to_genz``.
    """
    token_budget = int(max_len)
    sampling_temperature = float(temperature)
    return to_genz(
        text,
        max_new_tokens=token_budget,
        temperature=sampling_temperature,
        mode=mode,
    )

# Example rows for the UI; each row follows the order of `inputs` below:
# [text, mode, temperature, max new tokens].
examples = [
    ["I am going to college right now.", "medium", 0.8, 32],
    ["I am very tired after studying all day.", "medium", 0.9, 40],
    ["The party was amazing and we had a lot of fun.", "full", 0.9, 40],
    ["I failed my exam and I feel bad.", "light", 0.7, 32],
]

# The four input widgets map positionally onto the parameters of
# `genz_interface(text, mode, temperature, max_len)`.
demo = gr.Interface(
    fn=genz_interface,
    inputs=[
        gr.Textbox(lines=3, label="Normal text", placeholder="Type any normal sentence…"),
        gr.Dropdown(
            choices=["light", "medium", "full"],
            value="medium",
            label="Gen Z intensity",
        ),
        gr.Slider(0.3, 1.2, value=0.8, step=0.05, label="Creativity (temperature)"),
        gr.Slider(16, 80, value=40, step=4, label="Max new tokens"),
    ],
    outputs=gr.Textbox(lines=5, label="Gen Z style"),
    title="Normal ➜ Gen Z Style Transfer",
    description="Convert boring normal English into Gen Z slang with different intensity levels.",
    examples=examples,
)

# share=True requests a public gradio.live URL (see the cell output), so anyone
# holding the link can call the model while the notebook is running.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7e4711af0eee12d901.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Write the PEFT-wrapped model to ./my_genz_t5_model (relative to the current
# working directory). The tokenizer is not saved by this call.
model.save_pretrained("my_genz_t5_model")
print("Model saved to: my_genz_t5_model")

Model saved to: my_genz_t5_model


### 1. Authenticate with GitHub

To push to GitHub, you'll need to use a [Personal Access Token (PAT)](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token).

**Steps to create a PAT:**
1. Go to your GitHub settings.
2. Navigate to `Developer settings` > `Personal access tokens` > `Tokens (classic)`.
3. Click `Generate new token (classic)`.
4. Give it a descriptive name (e.g., `Colab-Access`).
5. Set an expiration (e.g., 30 days or longer if needed).
6. Grant the `repo` scope (this is essential for pushing).
7. Click `Generate token` and **copy it immediately** as you won't be able to see it again.

Once you have your PAT, you can store it securely in Colab's secrets manager (click the '🔑' icon on the left panel) with a name like `GH_TOKEN`. Then, run the following cell to make it accessible to your environment.

In [None]:
from google.colab import userdata

# Replace 'GH_TOKEN' with the name you gave your PAT in Colab secrets.
# The token is kept in a Python variable only; never print it or paste it into a cell.
GH_TOKEN = userdata.get('GH_TOKEN')

# Configure Git with your email and username (stored in the global git config,
# so they apply to every repository used in this session).
!git config --global user.email "your_email@example.com" # Replace with your email
!git config --global user.name "Your GitHub Username" # Replace with your GitHub username

print("Git configured with your email and username.")
print("Personal Access Token loaded from Colab secrets.")

### 2. Clone your GitHub repository

Choose the repository where you want to save your model. Replace `your-username` and `your-repo-name` with your actual GitHub username and repository name.

In [None]:
import os

repo_owner = "your-username"  # Replace with your GitHub username
repo_name = "your-repo-name"    # Replace with your repository name

# Construct the clone URL using the PAT for authentication
repo_url = f"https://{GH_TOKEN}@github.com/{repo_owner}/{repo_name}.git"

# Clone the repository
!git clone {repo_url}

# Change into the repository directory
%cd {repo_name}

print(f"Successfully cloned repository: {repo_name}")

### 3. Copy your saved model files into the repository

Your LoRA adapter was saved to a directory named `my_genz_t5_model` in the Colab working directory (the folder the notebook was in before the `%cd` into the repository). Now copy those files into your cloned repository.

In [None]:
# './t5-genz-lora' is only the Trainer output directory; the files to publish
# are the ones written by model.save_pretrained("my_genz_t5_model").

# The clone cell ended inside the repository directory; step back to its parent,
# where my_genz_t5_model lives.
%cd ..

# Create a directory for your model within the cloned repo if it doesn't exist
model_target_path = f"{repo_name}/my_genz_t5_model"
!mkdir -p {model_target_path}

# Copy everything that save_pretrained wrote into the repository.
# As noted when this cell was written, these are the adapter weights and
# config, not the full base model.
!cp -r my_genz_t5_model/* {model_target_path}/

print(f"Model files copied to {model_target_path}")

### 4. Add, Commit, and Push Changes

Now, you can add the new files, commit them, and push them to your GitHub repository.

In [None]:
# Change into the repository directory to perform git operations
%cd {repo_name}

# Add all new/modified files
!git add .

# Commit the changes
!git commit -m "Add Gen Z T5 LoRA model files"

# Push the changes to the remote repository
!git push origin main # Or your default branch name (e.g., master)

print("Model successfully pushed to GitHub!")