In [14]:
!pip -q install transformers datasets peft bitsandbytes accelerate huggingface_hub

In [32]:
# added a pass use_reentrant for pytorch checkpoint - warning that stuff will deprecate otherwise
import torch.utils.checkpoint
torch.utils.checkpoint.USE_REENTRANT = False

import os
import pandas as pd
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)
import random
import numpy as np

In [4]:
# to reproduce
seed = random.randint(1, 10000)
print(seed)

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(seed)

3321


In [6]:
df = pd.read_csv('merged_articles.csv')

In [7]:
df.head()

Unnamed: 0,file_name,content
0,human_0.txt,"LONDON, England (Reuters) -- Harry Potter star..."
1,human_1.txt,Editor's note: In our Behind the Scenes series...
2,human_10.txt,WASHINGTON (CNN) -- As he awaits a crucial pro...
3,human_100.txt,(CNN) -- A man and woman suspected of kidnappi...
4,human_1000.txt,(CNN) -- Rafael Nadal failed to clinch the yea...


In [8]:
df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

In [9]:
# 70% train, 10% val, 10% test, 10% research (for uid analysis)
train_size = int(0.7 * len(df))
val_size = int(0.1 * len(df))
test_size = int(0.1 * len(df))

In [10]:
train_df = df[:train_size]
val_df = df[train_size:train_size+val_size]
test_df = df[train_size+val_size:train_size+val_size+test_size]
research_test_df = df[train_size+val_size+test_size:]

In [11]:
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Research test samples: {len(research_test_df)}")

Training samples: 1400
Validation samples: 200
Test samples: 200
Research test samples: 200


In [12]:
research_test_df.to_csv('/content/research_test_set.csv', index=False)
print("Research test set saved to '/content/research_test_set.csv'")

Research test set saved to '/content/research_test_set.csv'


In [13]:
# convert to hf dataset
def df_to_dataset(df):
    return Dataset.from_pandas(df[['content']])

train_dataset = df_to_dataset(train_df)
val_dataset = df_to_dataset(val_df)

login to huggingface (need to request access from meta)

In [15]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
model_id = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

**Note these parameters are optimized for my Colab setup**

A100 Cluster
* 40gb VRAM
* 83.5 GB RAM

In [17]:
def tokenize_function(examples):
    return tokenizer(
        examples["content"],
        padding="max_length",
        truncation=True,
        max_length=1024 #use 512 otherwise
    )

Remove multi-processing on regular Colab

In [18]:
tokenized_train = train_dataset.map(tokenize_function, batched=True, num_proc=4)
tokenized_val = val_dataset.map(tokenize_function, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/1400 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/200 [00:00<?, ? examples/s]

In [19]:
tokenized_train.column_names

['content', 'input_ids', 'attention_mask']

In [20]:
# format for training
columns_to_remove = [col for col in tokenized_train.column_names if col not in ["input_ids", "attention_mask"]]
tokenized_train = tokenized_train.remove_columns(columns_to_remove)
tokenized_val = tokenized_val.remove_columns(columns_to_remove)

In [21]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_4bit=True,  # 8-bit for A100, 4-bit otherwise
    torch_dtype=torch.bfloat16,  # Changed to bfloat16 which works better on A100s
    device_map="auto"
)

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [22]:
# Prep for LoRA training
model = prepare_model_for_kbit_training(model)

In [23]:
lora_config = LoraConfig(
    r=32,  # Have rank at 16 on regular, 32 on A100
    lora_alpha=64,  # 32 regular, 64 on A100
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # Added more modules
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

In [24]:
# Apply LoRA to model
model = get_peft_model(model, lora_config)
print(f"Trainable parameters: {model.print_trainable_parameters()}")

trainable params: 79,953,920 || all params: 6,818,369,536 || trainable%: 1.1726
Trainable parameters: None


In [34]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  # Changed from "epoch" to "steps" for more frequent evaluation
    eval_steps=100,  # Evaluate every 100 steps
    learning_rate=2e-4,
    per_device_train_batch_size=16,  # Increased from 4 to 16
    per_device_eval_batch_size=16,  # Increased from 4 to 16
    gradient_accumulation_steps=4,  # Added gradient accumulation for effective batch size of 64
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=False,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    fp16=True,  # Enable mixed precision training
    logging_steps=10,
    report_to="tensorboard",  # Enable TensorBoard reporting
    optim="adamw_torch_fused",  # Use fused Adam optimizer for faster training
)

In [35]:
# data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [36]:
# trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)
# ignore the label_names thing, we're not doing classification since its causal

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [37]:
# start training
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
100,1.4562,1.874192


  return fn(*args, **kwargs)


TrainOutput(global_step=110, training_loss=1.6105980613014914, metrics={'train_runtime': 3200.0365, 'train_samples_per_second': 2.187, 'train_steps_per_second': 0.034, 'total_flos': 2.87607292428288e+17, 'train_loss': 1.6105980613014914, 'epoch': 5.0})

In [38]:
trainer.save_model("./final_model")
model.save_pretrained("./lora_adapters")
tokenizer.save_pretrained("./tokenizer")
!tar -czvf model_files.tar.gz ./final_model ./lora_adapters ./tokenizer
print("Model files saved and compressed")

./final_model/
./final_model/tokenizer_config.json
./final_model/special_tokens_map.json
./final_model/adapter_config.json
./final_model/tokenizer.model
./final_model/training_args.bin
./final_model/README.md
./final_model/adapter_model.safetensors
./final_model/tokenizer.json
./lora_adapters/
./lora_adapters/adapter_config.json
./lora_adapters/README.md
./lora_adapters/adapter_model.safetensors
./tokenizer/
./tokenizer/tokenizer_config.json
./tokenizer/special_tokens_map.json
./tokenizer/tokenizer.model
./tokenizer/tokenizer.json
Model files saved and compressed


Parameter notes - can be a bit more optimal potentially:
- since our dataset is on the smaller end, might want more epochs
- potentially higher LoRA rank (64?)
- add more target modules
- if have time, a grid search over learning rates could help (1e-4 to 5e-4)

### Evaluation

In [45]:
# perplexity
from torch.nn import CrossEntropyLoss
import numpy as np

def compute_perplexity(model, test_df, tokenizer, batch_size=4):
    """
    Compute perplexity on raw text data from a dataframe.
    """
    model.eval()
    device = model.device
    total_loss = 0
    total_tokens = 0

    # process in batches to avoid OOM issues
    for i in range(0, len(test_df), batch_size):
        batch_texts = test_df.iloc[i:i+batch_size]['content'].tolist()

        # tokenize with padding and truncation (truncation caused a bunch of errors earlier)
        encodings = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        # move to gpu
        input_ids = encodings['input_ids'].to(device)
        attention_mask = encodings['attention_mask'].to(device)

        # labels (shifted input_ids)
        labels = input_ids.clone()

        # forward pass
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

        # get loss
        loss = outputs.loss

        # count non-padding tokens
        non_padding = attention_mask.sum().item()

        # accumulate
        total_loss += loss.item() * non_padding
        total_tokens += non_padding

        # print progress
        if i % (10 * batch_size) == 0:
            print(f"Processed {i}/{len(test_df)} examples")

    # calculate perplexity
    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)

    return perplexity

import math

test_perplexity = compute_perplexity(model, test_df, tokenizer)
print(f"Test set perplexity: {test_perplexity:.2f}")

Processed 0/200 examples
Processed 40/200 examples
Processed 80/200 examples
Processed 120/200 examples
Processed 160/200 examples
Test set perplexity: 18.90


In [46]:
# Qualitatively looking at some samples
def generate_samples(model, tokenizer, test_df, num_samples=5, max_length=512):
    results = []

    # Select random samples
    sample_indices = np.random.choice(len(test_df), num_samples, replace=False)

    for idx in sample_indices:
        # Get a prompt (first 50 tokens of the article)
        prompt_text = test_df.iloc[idx]['content'][:200] + "..."

        # Tokenize
        inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")

        # Generate
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                max_length=max_length,
                do_sample=True,
                top_p=0.92,
                temperature=0.8,
                num_return_sequences=1
            )

        # Decode
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        results.append({
            "prompt": prompt_text,
            "generated": generated_text,
            "original": test_df.iloc[idx]['content']
        })

    return results

In [47]:
test_samples = generate_samples(model, tokenizer, test_df)
with open("generated_samples.txt", "w") as f:
    for i, sample in enumerate(test_samples):
        f.write(f"Sample {i+1}\n")
        f.write(f"Prompt: {sample['prompt']}\n\n")
        f.write(f"Generated:\n{sample['generated']}\n\n")
        f.write("-" * 80 + "\n\n")

In [48]:
import os
import glob
from google.colab import files

# Download all files!!

def get_files_to_zip():
    all_files = glob.glob("*")
    files_to_zip = []
    excluded_extensions = ['.pyc', '.pyo', '.pyd', '__pycache__']
    excluded_dirs = ['./sample_data']

    for f in all_files:
        # Skip if it's in excluded directories
        if any(f.startswith(d) for d in excluded_dirs):
            continue

        # Skip if it has excluded extension
        if any(f.endswith(e) for e in excluded_extensions):
            continue

        files_to_zip.append(f)

    return files_to_zip

# get files to zip
files_to_zip = get_files_to_zip()
print(f"Found {len(files_to_zip)} files to zip")

# create zip command
zip_command = f"zip -r colab_project.zip {' '.join(files_to_zip)}"
print("Creating zip file...")
!$zip_command

# download the zip file
print("Downloading zip file...")
files.download('colab_project.zip')

print("Done!")

Found 9 files to zip
Creating zip file...
  adding: final_model/ (stored 0%)
  adding: final_model/tokenizer_config.json (deflated 68%)
  adding: final_model/special_tokens_map.json (deflated 73%)
  adding: final_model/adapter_config.json (deflated 56%)
  adding: final_model/tokenizer.model (deflated 55%)
  adding: final_model/training_args.bin (deflated 52%)
  adding: final_model/README.md (deflated 66%)
  adding: final_model/adapter_model.safetensors (deflated 7%)
  adding: final_model/tokenizer.json (deflated 85%)
  adding: lora_adapters_only/ (stored 0%)
  adding: lora_adapters_only/tokenizer_config.json (deflated 68%)
  adding: lora_adapters_only/config/ (stored 0%)
  adding: lora_adapters_only/special_tokens_map.json (deflated 73%)
  adding: lora_adapters_only/adapter_config.json (deflated 56%)
  adding: lora_adapters_only/tokenizer.model (deflated 55%)
  adding: lora_adapters_only/README.md (deflated 66%)
  adding: lora_adapters_only/adapter_model.safetensors (deflated 7%)
  add

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Done!
