In [None]:
!pip install gradio transformers bitsandbytes accelerate peft trl pandas datasets



In [None]:
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
from threading import Thread
import pandas as pd
import bitsandbytes

In [None]:
!pip install --upgrade pandas datasets transformers

import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- Run configuration -------------------------------------------------------
dataset_id = "jojogo9/Food_Recipes"               # dataset handed to load_dataset
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"   # base checkpoint to load
output_model = "llm-007"                          # name for the fine-tuned model
start_row, end_row = 100, 300                     # row window [start_row, end_row)
sample_size = 200                                 # cap on rows kept from the window
max_length = 128                                  # token length used when encoding

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Function to prepare training data with start and end row parameters
def prepare_train_data(data_id, start_row=0, end_row=None, sample_size=200):
    """Load the ``train`` split of a dataset and add a prompt-style ``text`` column.

    Args:
        data_id: Dataset identifier passed to ``load_dataset``.
        start_row: Positional index of the first row to keep.
        end_row: Positional index one past the last row to keep; ``None`` keeps
            everything from ``start_row`` to the end.
        sample_size: Maximum number of rows returned. A larger window is
            down-sampled at random with a fixed seed (``random_state=42``).

    Returns:
        A ``Dataset`` built from the selected rows: the original columns plus
        ``text``, which concatenates ``ingredients``, ``name``, ``id`` and
        ``steps`` between ``im_start`` / ``im_end`` markers.
    """
    source_df = load_dataset(data_id, split="train").to_pandas()

    # A slice bound of None means "to the end", so one iloc call covers both
    # cases. `.copy()` detaches the window from the source frame so adding a
    # column below cannot trigger SettingWithCopyWarning.
    data_df = source_df.iloc[start_row:end_row].copy()

    # Down-sample only when the window is larger than requested.
    if len(data_df) > sample_size:
        data_df = data_df.sample(n=sample_size, random_state=42)

    # Sampling leaves a shuffled, non-range index which Dataset.from_pandas
    # would otherwise persist as an extra `__index_level_0__` column.
    data_df = data_df.reset_index(drop=True)

    # Vectorized concatenation: same text as a row-wise apply, but it also
    # behaves on an empty window (row-wise apply returns a frame there).
    data_df["text"] = (
        "im_start user\n" + data_df["ingredients"] + "\nim_end\n"
        + "im_start assistant\n" + data_df["name"] + "\nim_end\n"
        + "im_start id: " + data_df["id"].astype(str) + "\nim_end\n"
        + "im_start steps: " + data_df["steps"] + "\nim_end\n"
    )

    # Convert the DataFrame back to a Dataset
    return Dataset.from_pandas(data_df)

# Build the training set from the configured row window and sample cap.
data = prepare_train_data(
    dataset_id,
    start_row=start_row,
    end_row=end_row,
    sample_size=sample_size,
)

# Function to get model and tokenizer
def get_model_and_tokenizer(model_id):
    """Load a causal language model in 8-bit precision together with its tokenizer.

    Args:
        model_id: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's pad token is set to its
        EOS token.
    """
    # Imported locally so this definition does not depend on edits to the
    # notebook's import cells.
    from transformers import BitsAndBytesConfig

    # Passing `load_in_8bit=True` straight to from_pretrained is deprecated;
    # the supported route is a BitsAndBytesConfig via `quantization_config`.
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Reuse EOS as the padding token so padded batches can be built.
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

# Instantiate the model/tokenizer pair for the configured checkpoint.
model, tokenizer = get_model_and_tokenizer(model_id=model_id)

# Function to tokenize, encode, and truncate the dataset
def encode_and_truncate_dataset(dataset, tokenizer, max_length=128):
    """Tokenize the ``text`` column of ``dataset`` in batches.

    Args:
        dataset: Object exposing ``map``; its examples must carry a ``text`` field.
        tokenizer: Callable applied to each batch of texts.
        max_length: Length passed to the tokenizer for both padding and truncation.

    Returns:
        The result of ``dataset.map``: the tokenizer's outputs are added and
        only the ``text`` column is removed.
    """
    tokenizer_options = {
        "padding": "max_length",   # pad every sequence up to max_length
        "truncation": True,        # cut longer sequences down to max_length
        "max_length": max_length,
    }

    def _encode_batch(batch):
        return tokenizer(batch["text"], **tokenizer_options)

    return dataset.map(_encode_batch, batched=True, remove_columns=["text"])

# Tokenize the prepared examples to the configured fixed length.
encoded_dataset = encode_and_truncate_dataset(data, tokenizer, max_length=max_length)

# Sanity check: show the first encoded record.
print(encoded_dataset[0])

# Persist the encoded dataset to the local "encoded_dataset" directory.
encoded_dataset.save_to_disk("encoded_dataset")

# Now your encoded_dataset is ready for training with the language model




The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

{'name': 'tide me over   indian chaat  simple veggie salad', 'id': 63793, 'minutes': 2, 'contributor_id': 12657, 'submitted': '2003-06-05', 'tags': "['15-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'for-1-or-2', 'low-protein', 'healthy', 'lunch', 'salads', 'snacks', 'vegetables', 'asian', 'indian', 'easy', 'beginner-cook', 'low-fat', 'summer', 'vegetarian', 'dietary', 'spicy', 'low-sodium', 'low-cholesterol', 'seasonal', 'low-saturated-fat', 'low-calorie', 'low-carb', 'inexpensive', 'healthy-2', 'low-in-something', 'tomatoes', 'taste-mood', 'savory', 'number-of-servings', 'presentation', 'served-cold']", 'nutrition': '[73.1, 1.0, 15.0, 0.0, 5.0, 3.0, 4.0]', 'n_steps': 4, 'steps': "['toss everything well in a serving bowl', 'grab a fork or large spoon and eat immediately , or refrigerate until hunger strikes , and then eat !', 'feel extremely pleased with yourself that you snacked on this instead of that bag of chips you were eyei

Saving the dataset (0/1 shards):   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer

# LoRA adapter settings for causal-LM fine-tuning: rank-8 updates with
# alpha=16, 5% dropout on the adapter path, and bias handling set to "none".
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Define a function to get model and tokenizer
def get_model_and_tokenizer(model_id):
    """Load a causal language model (no quantization arguments) and its tokenizer.

    NOTE(review): this re-defines a function of the same name declared earlier
    in the notebook; from this cell on, this version is the one in effect.

    Args:
        model_id: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple; the tokenizer pads with its EOS token.
    """
    base_model = AutoModelForCausalLM.from_pretrained(model_id)

    tok = AutoTokenizer.from_pretrained(model_id)
    # Reuse EOS as the padding token.
    tok.pad_token = tok.eos_token

    return base_model, tok

# Load the model/tokenizer pair used for fine-tuning.
model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
model, tokenizer = get_model_and_tokenizer(model_id=model_id)


No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [None]:

# The paged bitsandbytes optimizer and fp16 mixed precision both need a CUDA
# device, so they are enabled only when one is present; otherwise the plain
# PyTorch AdamW optimizer and full precision are used.
use_cuda = torch.cuda.is_available()

# Define the training arguments
training_arguments = TrainingArguments(
    output_dir=output_model,          # configured output name, not the literal "output_model"
    per_device_train_batch_size=2,
    gradient_accumulation_steps=32,   # 2 x 32 = 64 sequences per optimizer step (per device)
    optim="paged_adamw_32bit" if use_cuda else "adamw_torch",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=10,
    # max_steps takes precedence over num_train_epochs, so the previously
    # supplied num_train_epochs=3 had no effect and is dropped here.
    # NOTE(review): with the 200-example dataset on a single device, 250 steps
    # is roughly 80 passes over the data (250 x 64 / 200) - confirm intended.
    max_steps=250,
    fp16=use_cuda,
    # push_to_hub=True
)

# `data` is the text dataset prepared earlier; SFTTrainer reads its "text" field.
# NOTE(review): the installed trl warns that dataset_text_field / packing /
# max_seq_length are deprecated here in favour of SFTConfig; migrate once the
# trl version is pinned.
trainer = SFTTrainer(
    model=model,
    train_dataset=data,
    peft_config=peft_config,
    dataset_text_field="text",
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=1024
)


trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


KeyboardInterrupt: 

In [None]:
# Display the module tree of the model. In the recorded output the q_proj and
# v_proj attention projections appear wrapped as lora.Linear layers.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=256, bias=False)
            (lora_dropout): ModuleDic

In [None]:
# The name `dataset` is not defined anywhere in this notebook, so the original
# call raised NameError on a fresh run. No held-out split exists either, so
# evaluate on the dataset the trainer itself prepared. NOTE(review): this
# reports loss on the training examples; pass a validation split instead for a
# meaningful estimate.
trainer.evaluate(eval_dataset=trainer.train_dataset)