In [None]:
!pip install -qqq -U pip
!pip install -qqq bitsandbytes --progress-bar off
!pip install -qqq torch --progress-bar off
!pip install -qqq -U git+https://github.com/huggingface/transformers.git@main --progress-bar off
!pip install -qqq -U git+https://github.com/huggingface/peft.git --progress-bar off
!pip install -qqq accelerate --progress-bar off
!pip install -qqq datasets --progress-bar off
!pip install -qqq loralib --progress-bar off
!pip install -qqq einops --progress-bar off
!pip install -qqq sentencepiece --progress-bar off

In [None]:
import gc
import torch

# Release GPU memory that a previous run in this kernel may still be holding.
# If an old model object is still referenced, drop it first (for example with
# `del model`); tensors that are still reachable cannot be reclaimed below.
#
# Order matters: run the garbage collector first so unreachable tensors are
# destroyed, then empty the CUDA cache so the blocks they occupied are released.
gc.collect()
torch.cuda.empty_cache()

In [None]:
import json
import os
from pprint import pprint

import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

# Restrict CUDA for this process to the device with index 0.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [None]:
# Interactive Hugging Face Hub login (huggingface_hub.notebook_login).
# NOTE(review): later cells pass os.getenv("HUGGINGFACE_TOKEN") explicitly; when that
# variable is unset they presumably rely on the credentials stored here — confirm.
notebook_login()

In [None]:
%%capture
# Install Unsloth together with a pinned xformers build; %%capture hides the pip output.
# NOTE(review): this runs after transformers/peft were already imported in an earlier
# cell — a kernel restart may be needed before the new packages are picked up; confirm.
!pip install unsloth "xformers==0.0.28.post2"
# Also get the latest nightly Unsloth!
# (removes the release installed above and reinstalls it from the GitHub repository)
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
from unsloth import FastLanguageModel
import torch
import os

# Load the Llama 3.1 8B Instruct checkpoint in 4-bit together with its tokenizer.
# The sequence limit is passed at load time so the model is prepared for the same
# context length that the SFTTrainer cell further down requests (8192).
max_seq_length = 8192
dtype = None          # no explicit dtype is requested; the choice is left to Unsloth
load_in_4bit = True   # load 4-bit quantised weights


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3.1-8b-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # None when the variable is unset.
    token = os.getenv("HUGGINGFACE_TOKEN"),
)

In [None]:
from unsloth import FastLanguageModel
import torch
import os
dtype = None
load_in_4bit = True

# This cell repeats the load performed by the previous cell. Loading the 8B model a
# second time only costs time and GPU memory, so the existing objects are reused and
# the load below happens only when this cell is run without the previous one.
# NOTE: to force a fresh base model (e.g. after LoRA adapters were attached),
# run `del model` before executing this cell.
if "model" not in globals() or "tokenizer" not in globals():
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/llama-3.1-8b-Instruct",
        max_seq_length = 8192,  # same limit the SFTTrainer cell requests
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        token = os.getenv("HUGGINGFACE_TOKEN"),
    )

In [None]:
# Wrap the loaded model with LoRA adapters on the projection modules listed below.
# NOTE: the cell rebinds `model` to the wrapped model, so running it twice would
# wrap an already wrapped model — reload the base model before re-running.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    target_modules = lora_target_modules,
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_rslora = False,
    loftq_config = None,
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

In [None]:
# End-of-sequence token of the loaded tokenizer; it is appended to every formatted
# training example later on. The bare name on the last line displays it.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
EOS_TOKEN

In [None]:
# Display the configuration object of the model for inspection.
model.config

In [None]:
import pandas as pd

# Read the prompt/story table and preview its first rows.
csv_path = '/content/mongodb_prompts_with_stories.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# (rows, columns) of the raw table.
df.shape

In [None]:
# Names of the CSV columns used to build the training pairs.
# Adjust these if the header of the CSV differs.
story_column = "Story"
prompt_column_1, prompt_column_2, prompt_column_3 = (
    "Annotator ID 1",
    "Annotator ID 3",
    "Annotator ID 4",
)

In [None]:
# Turn every (prompt, story) combination into one training record: each row of `df`
# contributes one record per prompt column, all sharing that row's story.
prompt_columns = (prompt_column_1, prompt_column_2, prompt_column_3)

data = [
    {"input_text": record[column], "output_text": record[story_column]}
    for _, record in df.iterrows()
    for column in prompt_columns
]

# One DataFrame row per record, in row-major order (row 0's three prompts first).
train_df = pd.DataFrame(data)

In [None]:
# Drop every record that has a missing value in any column.
train_df=train_df.dropna()

In [None]:
# Preview the first records of the training table.
train_df.head()

In [None]:
# Instruction / Input / Response template for one training example.
# The two `{}` placeholders are filled positionally by formatting_prompts_func:
# first the prompt (under "### Input:"), then the story (under "### Response:").
final_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Act as Intelligent Assistant and an expert to generate coherent stories in urdu, Read this Urdu_Prompt and generate a Urdu Story from this.

### Input:
{}

### Response:
{}"""

# End-of-sequence token of the tokenizer; appended after every formatted example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(row):
    """Build the full training text for one record.

    Args:
        row: Mapping-like record with the keys "input_text" (the prompt) and
            "output_text" (the story).

    Returns:
        The module-level `final_prompt` template filled with the prompt and the
        story, followed by the module-level `EOS_TOKEN`.
    """
    prompt, story = row["input_text"], row["output_text"]
    filled_template = final_prompt.format(prompt, story)
    # The EOS token marks where the example ends.
    return filled_template + EOS_TOKEN




In [None]:
# Apply formatting_prompts_func to every row and store the result in a new "text" column.
train_df["text"] = train_df.apply(formatting_prompts_func, axis=1)

In [None]:
from datasets import Dataset

# Build the Hugging Face dataset from the formatted "text" column only.
# `train_df` went through dropna(), so its index can have gaps; preserve_index=False
# keeps that index from being stored as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(train_df[['text']], preserve_index=False)


In [None]:
# Display the dataset object for a quick look.
dataset

In [None]:
# Print one formatted training example as a sanity check.
# Row 100 is shown when it exists; a smaller dataset falls back to its last row
# instead of failing. Indexing the row first avoids loading the whole "text" column.
sample_index = min(100, len(dataset) - 1)
print(dataset[sample_index]['text'])

In [None]:
# Size of the training set measured in tokens.
# The tokenizer is used (rather than len() of the string, which counts characters)
# because the trainer's max_seq_length limit is expressed in tokens.
token_counts = [len(tokenizer(str(text))["input_ids"]) for text in dataset['text']]

total_tokens = sum(token_counts)          # tokens across all examples
max_len = max(token_counts, default=0)    # longest single example; 0 for an empty dataset

total_tokens , max_len


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Use bf16 when is_bfloat16_supported() reports it, otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation settings: batch size 1 with 4 accumulation steps per device.
# NOTE(review): both num_train_epochs and max_steps are set; per the transformers
# documentation a positive max_steps takes precedence — confirm which was intended.
training_args = TrainingArguments(
    output_dir = "outputs",
    report_to = "none",
    seed = 3407,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    num_train_epochs = 1,
    max_steps = 500,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    warmup_steps = 5,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    bf16 = use_bf16,
    fp16 = not use_bf16,
    logging_steps = 10,
)

# Supervised fine-tuning on the "text" column, one example per sequence (no packing).
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 8192,
    dataset_num_proc = 2,
    packing = False,
    args = training_args,
)

In [None]:
import gc
import torch

# Free as much GPU memory as possible right before training starts.
# The garbage collector runs first so unreachable tensors are destroyed; only then
# can emptying the CUDA cache release the blocks those tensors occupied.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Run the fine-tuning and keep the value returned by train() for later inspection.
trainer_stats = trainer.train()

In [None]:
# Upload the fine-tuned model (saved with the "merged_16bit" method) and its tokenizer.
hub_repo_id = "sarmadsiddiqui29/Llama-3.1-8B-Instruct-Urdu-Story"
hf_token = os.getenv("HUGGINGFACE_TOKEN")  # None when the variable is unset

model.push_to_hub_merged(hub_repo_id, tokenizer, save_method = "merged_16bit", token = hf_token)
# push_to_hub expects the repo id followed by keyword options. The tokenizer must not
# be passed again as a positional argument: that slot is `use_temp_dir`.
tokenizer.push_to_hub(hub_repo_id, token = hf_token)
