In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
# Report whether PyTorch can see a CUDA device in this session.
if torch.cuda.is_available():
    print("GPU is available")
else:
    print("GPU is not available")


In [None]:
!pip install transformers accelerate bitsandbytes
!pip install accelerate>=0.26.0
!pip install peft
!pip install trl
!pip install chardet

In [None]:
import chardet
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [None]:
# Hugging Face access token, read from the environment instead of being
# committed to the notebook. Set HF_TOKEN (e.g. via the platform's secret
# store) before running; when it is unset, `token` is None and Hub requests
# are made anonymously.
# NOTE(review): a literal token was previously stored on this line; treat it
# as compromised and revoke it in the Hugging Face account settings.
token = os.environ.get("HF_TOKEN")

# Base checkpoint to fine-tune.
model_name = "HuggingFaceH4/zephyr-7b-alpha"
# Plain-text training corpus (loaded with the `text` dataset builder).
dataset_name = "/kaggle/input/data-endoscopy/data_final.txt"
# Directory passed to TrainingArguments as output_dir.
output_dir = "./results"
# Directory where the fine-tuned model and tokenizer are saved.
new_model_dir = "./Endoscopy-Zephyr"

In [None]:
# Map the whole model (the "" key) onto a single device: CUDA when available,
# otherwise CPU.
device_map = {"": "cuda" if torch.cuda.is_available() else "cpu"}
print(f"Using device: {device_map['']}")

# Sniff the text encoding of the corpus from its raw bytes so the dataset
# loader decodes it with the detected encoding.
file_path = dataset_name
with open(file_path, 'rb') as f:
    raw_data = f.read()
encoding = chardet.detect(raw_data)['encoding']
print(f"Detected encoding: {encoding}")

# Load the file with the `text` builder. A failure here is logged and then
# re-raised: swallowing it would leave `dataset` undefined and defer the error
# to a confusing NameError in a later cell.
try:
    dataset = load_dataset('text', data_files=file_path, encoding=encoding)
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")
    raise
print("Dataset loaded successfully.")

# Quantization settings handed to from_pretrained when loading the model.
use_8bit = True

bnb_config = BitsAndBytesConfig(
    load_in_8bit=use_8bit
)


In [None]:
# --- LoRA adapter settings (passed to LoraConfig) ---
lora_r, lora_alpha, lora_dropout = 64, 16, 0.1

# --- Quantization settings ---
# NOTE(review): apart from `use_8bit`, these names are not referenced by any
# cell visible in this notebook — confirm whether they are still needed.
use_8bit = True
use_nested_quant = False
bnb_8bit_quant_type = "nf4"
bnb_8bit_compute_dtype = "float16"

# --- Training schedule (passed to TrainingArguments) ---
num_train_epochs = 2
per_device_train_batch_size = 6
gradient_accumulation_steps = 1

# --- Optimizer and learning-rate schedule ---
optim = "paged_adamw_32bit"
learning_rate = 2e-4
lr_scheduler_type = "cosine"
warmup_ratio = 0.03
weight_decay = 0.001
max_grad_norm = 0.3

# --- Logging, checkpointing and batching ---
logging_steps = 25
save_steps = 0  # presumably disables intermediate checkpoints — TODO confirm
group_by_length = True

# --- Mixed precision: both half-precision modes are switched off ---
fp16 = bf16 = False

In [None]:
from transformers import AutoConfig
# Fetch the checkpoint's configuration from the Hub; kept in `config`.
config = AutoConfig.from_pretrained(
    model_name,
    token=token,
    trust_remote_code=True,
)

# Load the causal-LM weights with the quantization settings in `bnb_config`,
# placed on the device chosen in `device_map`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=token,
    trust_remote_code=True,
    device_map=device_map,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
)

# Switch off `use_cache` on the model config ahead of training.
model.config.use_cache = False

In [None]:
# Tokenizer for the base checkpoint: pad on the right and reuse the EOS token
# as the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=token,
    trust_remote_code=True,
)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# LoRA adapter definition for causal-LM fine-tuning. Adapters are attached to
# the modules named in `target_modules`; no bias terms are trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)

In [None]:
# Bundle the hyperparameters defined earlier into the Trainer configuration.
training_arguments = TrainingArguments(
    # Where outputs go and how progress is reported
    output_dir=output_dir,
    report_to="tensorboard",
    logging_steps=logging_steps,
    save_steps=save_steps,
    # Epochs and batching
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=group_by_length,
    # Optimizer and learning-rate schedule
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    # Mixed-precision switches
    fp16=fp16,
    bf16=bf16,
)

In [None]:
# Supervised fine-tuning: train the LoRA adapter described by `peft_config`
# on the "text" column of the training split.
# NOTE(review): passing dataset_text_field / max_seq_length / tokenizer /
# packing directly to SFTTrainer depends on the installed trl version —
# confirm against the trl release actually installed.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False
)

trainer.train()

# Save the trained model and the tokenizer into the same directory.
trainer.save_model(new_model_dir)
tokenizer.save_pretrained(new_model_dir)

# Reload from disk to check that the saved artifacts are usable. The same
# quantization config as for training is passed so the reloaded weights are
# not materialised unquantized on the device.
# NOTE(review): `trainer` still holds a reference to the training-time model,
# so both copies stay in memory — confirm this fits on the target GPU.
model = AutoModelForCausalLM.from_pretrained(
    new_model_dir,
    quantization_config=bnb_config,
    device_map=device_map,
)
tokenizer = AutoTokenizer.from_pretrained(new_model_dir)

# Smoke-test generation. The full tokenizer output is forwarded so the
# attention mask reaches `generate`, and the pad token id is given explicitly
# because the pad token was set equal to the EOS token.
input_text = "Explain about endoscopy"
inputs = tokenizer(input_text, return_tensors="pt").to(device_map[""])
outputs = model.generate(
    **inputs,
    max_length=50,
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
