<a href="https://colab.research.google.com/github/SuhasiniSingh535/PortFolio/blob/main/personal_assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
!pip install torch transformers datasets peft accelerate



In [29]:
import requests
import json
from datasets import Dataset
from transformers import AutoTokenizer

# Download the JSON file from GitHub
url = 'https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json'
response = requests.get(url)
data = json.loads(response.text)

# Create a dataset from the JSON data
dataset = Dataset.from_list(data)

# Split into train and test sets (80% train, 20% test)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Load the tokenizer for DistilGPT-2
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos token

# Preprocess function to format and tokenize the data
def preprocess_function(examples):
    texts = []
    for instruction, input_text, output in zip(examples['instruction'], examples['input'], examples['output']):
        if input_text.strip():
            text = f"Instruction: {instruction}\nInput: {input_text}\nResponse: {output}"
        else:
            text = f"Instruction: {instruction}\nResponse: {output}"
        texts.append(text)
    # Tokenize with padding and truncation
    model_inputs = tokenizer(texts, max_length=512, truncation=True, padding='max_length', return_tensors='pt')
    model_inputs['labels'] = model_inputs['input_ids'].clone()  # Copy input_ids for labels
    return model_inputs

# Apply preprocessing to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['instruction', 'input', 'output'])

Map:   0%|          | 0/880 [00:00<?, ? examples/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

In [30]:
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Load the pre-trained DistilGPT-2 model
model = AutoModelForCausalLM.from_pretrained(model_name)

# Configure LoRA for fine-tuning
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, lora_config)



In [31]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Set up training arguments (optimized for Colab/laptop)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # Effective batch size of 4
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    learning_rate=5e-5,
    fp16=True if torch.cuda.is_available() else False,  # Use GPU if available
    report_to="none",
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
)

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [32]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,4.0034
20,3.9677
30,3.8055
40,3.8177
50,3.6485
60,3.3935
70,3.3037
80,3.2686
90,3.1247
100,3.1148


TrainOutput(global_step=660, training_loss=2.6603891372680666, metrics={'train_runtime': 134.1231, 'train_samples_per_second': 19.683, 'train_steps_per_second': 4.921, 'total_flos': 348200380661760.0, 'train_loss': 2.6603891372680666, 'epoch': 3.0})

In [33]:
trainer.save_model("./fine_tuned_model")

In [34]:
def generate_response(instruction, input_text=""):
    # Format the prompt based on whether input_text is provided
    if input_text.strip():
        prompt = f"Instruction: {instruction}\nInput: {input_text}\nResponse:"
    else:
        prompt = f"Instruction: {instruction}\nResponse:"

    # Tokenize the prompt and generate response
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract and return the generated response
    response = response[len(prompt):].strip()
    return response

# Test the assistant with a sample instruction and input
instruction = "Summarize the following article."
input_text = "The article is about the benefits of exercise."
response = generate_response(instruction, input_text)
print(f"Assistant \n\nAssistant: {response}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Assistant 

Assistant: The benefits of exercise are very simple.
Response: The benefits of exercise are very simple.
Response: The benefits of exercise are very simple.
Response: The benefits of exercise are very simple.
Response: The benefits of exercise are very simple.
Response: The benefits of exercise are very simple.
Response: The benefits of exercise are very simple.
Response: The benefits of exercise are very simple.
Response: The benefits of exercise are very simple.
Response: The


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_dir = "/content/path_to_your_saved_model"  # update this

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir).to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()  # set in inference mode


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/content/path_to_your_saved_model'. Use `repo_type` argument if needed.