In [9]:
import os
import torch
import accelerate
from dataclasses import fields,dataclass
import pandas as pd
from typing import Optional
from dotenv import load_dotenv
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig,prepare_model_for_kbit_training
from transformers import (AutoTokenizer, 
                        AutoModelForCausalLM,
                        pipeline,
                        BitsAndBytesConfig,
                        GemmaTokenizer,
                        HfArgumentParser,
                        TrainingArguments)

In [10]:
from huggingface_hub import interpreter_login

In [11]:
# Authenticate with the Hugging Face Hub; this prompts interactively for an access token.
interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token can be pasted using 'Right-Click'.
Token is valid (permission: read).
Your token has been saved to C:\Users\vish

In [12]:
# Load the raw dataset; later cells read its 'Input' and 'Output' columns.
df = pd.read_csv('./datasets/modify2.csv')

In [13]:
def format_rows(row):
    """Render one dataset row as a single instruction-tuning string.

    Args:
        row: Mapping-like record (for example a pandas Series) exposing the
            keys 'Input' (the prompt) and 'Output' (the expected response).

    Returns:
        str: The text ``'[INST] <Input> [/INST] <Output>'``.
    """
    question = row['Input']
    answer = row['Output']
    # LLaMA-style instruction template: the prompt is wrapped in
    # [INST] ... [/INST] and followed by the target completion.
    # Returned directly so the built-in `format` is no longer shadowed.
    return f'[INST] {question} [/INST] {answer}'

In [14]:
# Render every row into its '[INST] ... [/INST] ...' training string
df['formatted'] = df.apply(format_rows, axis='columns')
df

Unnamed: 0,Input,Output,formatted
0,Masala Karela Recipe,"6 Karela (Bitter Gourd/ Pavakkai) - deseeded,S...",[INST] Masala Karela Recipe [/INST] 6 Karela (...
1,Ragi Semiya Upma Recipe - Ragi Millet Vermicel...,"1-1/2 cups Rice Vermicelli Noodles (Thin),1 On...",[INST] Ragi Semiya Upma Recipe - Ragi Millet V...
2,Gongura Chicken Curry Recipe - Andhra Style Go...,"500 grams Chicken,2 Onion - chopped,1 Tomato -...",[INST] Gongura Chicken Curry Recipe - Andhra S...
3,Pudina Khara Pongal Recipe (Rice and Lentils C...,"1 cup Rice - soaked for 20 minutes,1/2 cup Yel...",[INST] Pudina Khara Pongal Recipe (Rice and Le...
4,Udupi Style Ash Gourd Coconut Curry Recipe,500 grams Vellai Poosanikai (Ash gourd/White P...,[INST] Udupi Style Ash Gourd Coconut Curry Rec...
...,...,...,...
250781,zydeco soup,"['celery', 'onion', 'green sweet pepper', 'gar...","[INST] zydeco soup [/INST] ['celery', 'onion',..."
250782,zydeco spice mix,"['paprika', 'salt', 'garlic powder', 'onion po...","[INST] zydeco spice mix [/INST] ['paprika', 's..."
250783,zydeco ya ya deviled eggs,"['hard-cooked eggs', 'mayonnaise', 'dijon must...",[INST] zydeco ya ya deviled eggs [/INST] ['har...
250784,cookies by design cookies on a stick,"['butter', 'eagle brand condensed milk', 'ligh...",[INST] cookies by design cookies on a stick ...


In [15]:
# Keep only the formatted string and expose it under the column name 'text'
new_df = df[['formatted']].rename(columns={'formatted': 'text'})
new_df.head()

Unnamed: 0,text
0,[INST] Masala Karela Recipe [/INST] 6 Karela (...
1,[INST] Ragi Semiya Upma Recipe - Ragi Millet V...
2,[INST] Gongura Chicken Curry Recipe - Andhra S...
3,[INST] Pudina Khara Pongal Recipe (Rice and Le...
4,[INST] Udupi Style Ash Gourd Coconut Curry Rec...


In [16]:
from sklearn.model_selection import train_test_split

# Hold out 26% of the rows for evaluation; the fixed seed keeps the split reproducible
train_df, eval_df = train_test_split(new_df, random_state=42, test_size=0.26)

# Persist both splits without the pandas index column
train_df.to_csv(path_or_buf='./datasets/train_inst.csv', index=False)
eval_df.to_csv(path_or_buf='./datasets/eval_inst.csv', index=False)

In [18]:
# Reload the CSV splits as Hugging Face datasets. split='train' is used for
# both files because the csv loader places a lone data file under 'train'.
training_ds = load_dataset('csv', data_files='./datasets/train_inst.csv', split='train')
eval_ds = load_dataset('csv', data_files='./datasets/eval_inst.csv', split='train')

# Only the last bare expression of a cell is echoed, so a mid-cell
# `training_ds` shows nothing; print both summaries explicitly instead.
print(training_ds)
print(eval_ds)

Generating train split: 185581 examples [00:01, 120426.16 examples/s]
Generating train split: 65205 examples [00:00, 128570.60 examples/s]


Dataset({
    features: ['text'],
    num_rows: 65205
})

# Fine-tuning phi-2 on the LLaMA-style instruction dataset

In [20]:
from transformers import IntervalStrategy

# Check if CUDA is available and report the device.
# NOTE: `device` is informational only; the model is placed through
# `device_map` when it is loaded below.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model_id = "microsoft/phi-2"
new_model = 'Recipe-Generator'
# Single location for checkpoints, the final model and the tokenizer.
output_dir = f'./{new_model}'

# Load tokenizer; the EOS token doubles as the padding token, padded on the right
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# BitsAndBytes configuration for 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',  # normalizing float 4
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False  # Avoid double quantization for better performance
)

try:
    # Load model with quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        quantization_config=bnb_config,
        # NOTE(review): flash_attn / flash_rotary are not generic
        # from_pretrained arguments; presumably they are consumed by the
        # remote modeling code pinned through `revision` -- confirm.
        flash_attn=True,
        flash_rotary=True,
        low_cpu_mem_usage=True,
        device_map={"": 0},  # put the whole model on GPU index 0
        revision='refs/pr/23'
    )

    # Set model configuration for training
    model.config.use_cache = False
    # NOTE(review): pretraining_tp looks like a Llama-config field and may be
    # a no-op for phi-2 -- confirm.
    model.config.pretraining_tp = 1

    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=1,  # Reduce batch size to fit in memory
        gradient_accumulation_steps=64,  # Increase gradient accumulation steps
        evaluation_strategy=IntervalStrategy.STEPS,
        eval_steps=1500,
        optim='paged_adamw_8bit',
        learning_rate=2e-4,
        lr_scheduler_type='cosine',
        save_steps=1500,
        warmup_ratio=0.05,
        weight_decay=0.01,
        fp16=True,  # Use mixed precision
        max_steps=-1
    )

    # PEFT (LoRA) configuration
    peft_config = LoraConfig(
        r=32,
        lora_alpha=64,
        lora_dropout=0.05,
        bias='none',
        task_type='CAUSAL_LM',
        target_modules=['Wqkv', 'fc1', 'fc2']
    )

    # Initialize trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=training_ds,
        eval_dataset=eval_ds,
        peft_config=peft_config,
        dataset_text_field='text',
        tokenizer=tokenizer,
        args=training_args
    )

    # Train and save the model together with its tokenizer
    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

except Exception as e:
    print('At line:', e.__traceback__.tb_lineno)
    print('________________ERROR________________:', e)
    # Re-raise so a failed run is not mistaken for a completed cell and the
    # full traceback stays visible in the notebook.
    raise

Using device: cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.62s/it]
You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
Map: 100%|██████████| 185581/185581 [00:18<00:00, 9905.89 examples/s] 
Map: 100%|██████████| 65205/65205 [00:07<00:00, 9286.50 examples/s] 
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mvishwateja2684[0m ([33mvishwa-teja[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 4/2899 [06:16<72:53:44, 90.65s/it] 

KeyboardInterrupt: 