In [1]:
import torch
torch.mps.empty_cache()

torch.mps.set_per_process_memory_fraction(0.0)

In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Suppress parallelism warning

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
from peft import get_peft_model, LoraConfig
import evaluate
import numpy as np
import torch

dataset = load_dataset('Sribhuvan/FinanceData')
dataset = dataset["train"].train_test_split(test_size=0.1)
dataset["validation"] = dataset.pop("test")
print(dataset)


model_checkpoint = 'distilgpt2'

model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token for distilgpt2

def tokenize_function(examples):
    combined_text = [title + "\n" + content for title, content in zip(examples['Title'], examples['Content'])]
    return tokenizer(combined_text, truncation=True, max_length=512)

tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["attn.c_attn"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"

)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

metric = evaluate.load("perplexity")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, np.ndarray):
        logits = torch.from_numpy(logits)
    if isinstance(labels, np.ndarray):
        labels = torch.from_numpy(labels)
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    loss_fct = torch.nn.CrossEntropyLoss()
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    return {"perplexity": torch.exp(loss).item()}

# Training arguments
training_args = TrainingArguments(
    output_dir="./distilgpt2-finance",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    logging_steps=100,
    gradient_accumulation_steps=4,
    # report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.save_model("./distilgpt2-finance-final")
tokenizer.save_pretrained("./distilgpt2-finance-final")

DatasetDict({
    train: Dataset({
        features: ['Title', 'Content'],
        num_rows: 332
    })
    validation: Dataset({
        features: ['Title', 'Content'],
        num_rows: 37
    })
})


Map:   0%|          | 0/332 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]



trainable params: 294,912 || all params: 82,207,488 || trainable%: 0.3587


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 3.4527180194854736, 'eval_perplexity': 30.893020629882812, 'eval_runtime': 18.1539, 'eval_samples_per_second': 2.038, 'eval_steps_per_second': 0.275, 'epoch': 0.95}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 3.452033281326294, 'eval_perplexity': 30.872316360473633, 'eval_runtime': 16.2123, 'eval_samples_per_second': 2.282, 'eval_steps_per_second': 0.308, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 3.4518215656280518, 'eval_perplexity': 30.86591339111328, 'eval_runtime': 31.6522, 'eval_samples_per_second': 1.169, 'eval_steps_per_second': 0.158, 'epoch': 2.86}
{'train_runtime': 889.8712, 'train_samples_per_second': 1.119, 'train_steps_per_second': 0.034, 'train_loss': 3.5180763244628905, 'epoch': 2.86}


('./distilgpt2-finance-final/tokenizer_config.json',
 './distilgpt2-finance-final/special_tokens_map.json',
 './distilgpt2-finance-final/vocab.json',
 './distilgpt2-finance-final/merges.txt',
 './distilgpt2-finance-final/added_tokens.json',
 './distilgpt2-finance-final/tokenizer.json')

In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Suppress parallelism warning

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
from peft import get_peft_model, LoraConfig
import evaluate
import numpy as np
import torch

dataset = load_dataset('Sribhuvan/FinanceData')
dataset = dataset["train"].train_test_split(test_size=0.1)
dataset["validation"] = dataset.pop("test")
print(dataset)

model_checkpoint = 'distilgpt2'
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token for distilgpt2

# Increase max_length to 1024 tokens (or your desired length)
def tokenize_function(examples):
    combined_text = [title + "\n" + content for title, content in zip(examples['Title'], examples['Content'])]
    return tokenizer(combined_text, truncation=True, max_length=1024)

tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["attn.c_attn"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

metric = evaluate.load("perplexity")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, np.ndarray):
        logits = torch.from_numpy(logits)
    if isinstance(labels, np.ndarray):
        labels = torch.from_numpy(labels)
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    loss_fct = torch.nn.CrossEntropyLoss()
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    return {"perplexity": torch.exp(loss).item()}

training_args = TrainingArguments(
    output_dir="./distilgpt2-finance",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    logging_steps=100,
    gradient_accumulation_steps=4,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.save_model("./distilgpt2-finance-final-1")
tokenizer.save_pretrained("./distilgpt2-finance-final-1")

DatasetDict({
    train: Dataset({
        features: ['Title', 'Content'],
        num_rows: 332
    })
    validation: Dataset({
        features: ['Title', 'Content'],
        num_rows: 37
    })
})


Map:   0%|          | 0/332 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]



trainable params: 294,912 || all params: 82,207,488 || trainable%: 0.3587


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msribhuvan[0m ([33msribhuvan-org[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 3.450232744216919, 'eval_perplexity': 31.21708106994629, 'eval_runtime': 44.0609, 'eval_samples_per_second': 0.84, 'eval_steps_per_second': 0.113, 'epoch': 0.95}


RuntimeError: MPS backend out of memory (MPS allocated: 17.24 GB, other allocations: 405.98 MB, max allowed: 18.13 GB). Tried to allocate 1.09 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
from huggingface_hub import login

write_key = ''
login(write_key)

hf_name = 'Sribhuvan'
repo_name = 'distilgpt2-finance'
model_id = f"{hf_name}/{repo_name}"

model.push_to_hub(model_id)
tokenizer.push_to_hub(model_id)

## With Language Head

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Suppress parallelism warning

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
from peft import get_peft_model, LoraConfig
import evaluate
import numpy as np
import torch

# Load and split the dataset
dataset = load_dataset('Sribhuvan/FinanceData')
dataset = dataset["train"].train_test_split(test_size=0.1)
dataset["validation"] = dataset.pop("test")
print(dataset)

# Load model and tokenizer from the checkpoint
model_checkpoint = 'distilgpt2'
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token for distilgpt2

# Tokenization function with increased max_length for larger prompts
def tokenize_function(examples):
    combined_text = [title + "\n" + content for title, content in zip(examples['Title'], examples['Content'])]
    return tokenizer(combined_text, truncation=True, max_length=1024)

# Tokenize the dataset
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

# Data collator for causal language modeling (no masking)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["attn.c_attn"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
# Wrap the model with LoRA modifications
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Load evaluation metric
metric = evaluate.load("perplexity")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, np.ndarray):
        logits = torch.from_numpy(logits)
    if isinstance(labels, np.ndarray):
        labels = torch.from_numpy(labels)
    # Shift logits and labels for causal LM loss calculation
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    loss_fct = torch.nn.CrossEntropyLoss()
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    return {"perplexity": torch.exp(loss).item()}

# Training arguments
training_args = TrainingArguments(
    output_dir="./distilgpt2-finance",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    logging_steps=100,
    gradient_accumulation_steps=4,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model("./distilgpt2-finance-final")
tokenizer.save_pretrained("./distilgpt2-finance-final")

# --- Inference Section ---

# Merge the LoRA weights into the base model to restore full generation capability
model = model.merge_and_unload()

# Now test generation
prompt = "What is the best debt strategy?"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
# Adjust max_length as needed for your prompt size
output_ids = model.generate(input_ids, max_length=512)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Generated Text:", output_text)


DatasetDict({
    train: Dataset({
        features: ['Title', 'Content'],
        num_rows: 332
    })
    validation: Dataset({
        features: ['Title', 'Content'],
        num_rows: 37
    })
})


Map:   0%|          | 0/332 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]



trainable params: 294,912 || all params: 82,207,488 || trainable%: 0.3587


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msribhuvan[0m ([33msribhuvan-org[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 3.399294853210449, 'eval_perplexity': 30.194801330566406, 'eval_runtime': 48.1133, 'eval_samples_per_second': 0.769, 'eval_steps_per_second': 0.104, 'epoch': 0.95}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 3.398453950881958, 'eval_perplexity': 30.169889450073242, 'eval_runtime': 20.8469, 'eval_samples_per_second': 1.775, 'eval_steps_per_second': 0.24, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 3.3982129096984863, 'eval_perplexity': 30.162782669067383, 'eval_runtime': 19.8986, 'eval_samples_per_second': 1.859, 'eval_steps_per_second': 0.251, 'epoch': 2.86}
{'train_runtime': 837.3561, 'train_samples_per_second': 1.189, 'train_steps_per_second': 0.036, 'train_loss': 3.5353131612141926, 'epoch': 2.86}


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


RuntimeError: Placeholder storage has not been allocated on MPS device!

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Suppress parallelism warning

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
from peft import get_peft_model, LoraConfig
import evaluate
import numpy as np

# Determine the device
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Using device:", device)

# Load and split the dataset
dataset = load_dataset('Sribhuvan/FinanceData')
dataset = dataset["train"].train_test_split(test_size=0.1)
dataset["validation"] = dataset.pop("test")
print(dataset)

# Load model and tokenizer from the checkpoint
model_checkpoint = 'distilgpt2'
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token for distilgpt2

# Tokenization function with increased max_length for larger prompts
def tokenize_function(examples):
    combined_text = [title + "\n" + content for title, content in zip(examples['Title'], examples['Content'])]
    return tokenizer(combined_text, truncation=True, max_length=1024)

# Tokenize the dataset
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

# Data collator for causal language modeling (no masking)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["attn.c_attn"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the model with LoRA modifications
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Move model to the selected device
model.to(device)

# Load evaluation metric
metric = evaluate.load("perplexity")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, np.ndarray):
        logits = torch.from_numpy(logits)
    if isinstance(labels, np.ndarray):
        labels = torch.from_numpy(labels)
    # Shift logits and labels for causal LM loss calculation
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    loss_fct = torch.nn.CrossEntropyLoss()
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    return {"perplexity": torch.exp(loss).item()}

# Training arguments
training_args = TrainingArguments(
    output_dir="./distilgpt2-finance",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    logging_steps=100,
    gradient_accumulation_steps=4,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model("./distilgpt2-finance-final")
tokenizer.save_pretrained("./distilgpt2-finance-final")

# --- Inference Section ---

# Merge the LoRA weights into the base model to restore full generation capability
model = model.merge_and_unload()
# Move merged model to the selected device
model.to(device)

# Test generation with a proper attention mask
prompt = "What is the best debt strategy?"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
# Create an attention mask that has 1s for every token in the input_ids
attention_mask = torch.ones(input_ids.shape, device=device)
# Adjust max_length as needed for your prompt size
output_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=512)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Generated Text:", output_text)


Using device: mps
DatasetDict({
    train: Dataset({
        features: ['Title', 'Content'],
        num_rows: 332
    })
    validation: Dataset({
        features: ['Title', 'Content'],
        num_rows: 37
    })
})


Map:   0%|          | 0/332 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]



trainable params: 294,912 || all params: 82,207,488 || trainable%: 0.3587


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msribhuvan[0m ([33msribhuvan-org[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 3.3826780319213867, 'eval_perplexity': 28.76781463623047, 'eval_runtime': 31.2229, 'eval_samples_per_second': 1.185, 'eval_steps_per_second': 0.16, 'epoch': 0.95}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 3.381748676300049, 'eval_perplexity': 28.741777420043945, 'eval_runtime': 49.6612, 'eval_samples_per_second': 0.745, 'eval_steps_per_second': 0.101, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 3.3814828395843506, 'eval_perplexity': 28.73432159423828, 'eval_runtime': 39.3296, 'eval_samples_per_second': 0.941, 'eval_steps_per_second': 0.127, 'epoch': 2.86}
{'train_runtime': 1076.5154, 'train_samples_per_second': 0.925, 'train_steps_per_second': 0.028, 'train_loss': 3.5354487101236978, 'epoch': 2.86}


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text: What is the best debt strategy?

























































































































































































































































































































































































































































































































