#Install dependencies & model setup

In [None]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
# !pip install datsets transformers[sentencepiece]
# !pip install sentencepiece
!pip install git+https://github.com/huggingface/accelerate.git
!pip install -q datasets bitsandbytes einops wandb

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

def preprocess_function(examples):
    return tokenizer(examples["intent"], examples["response"],  examples["instruction"],  examples["category"],  truncation=True)

def get_preprocessed_dataset(path_to_data, split_rate):
  data = pd.read_csv('/content/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv')
  idx_split = int(len(data)*split_rate)
  data[:idx_split].to_json('json_train.json', orient='records')
  data[idx_split:].to_json('json_test.json', orient='records')
  data_files = {"train": "json_train.json", "test": "json_test.json"}
  dataset_name = '/content'
  dataset = load_dataset(dataset_name, data_files=data_files)
  tokenized_datasets = dataset.map(preprocess_function, batched=True)
  return tokenized_datasets

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
import transformers
import pandas as pd
from datasets import load_dataset, Dataset

model_name = "ybelkada/falcon-7b-sharded-bf16"


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
    )

model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [4]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ]
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

In [None]:
# load up dataset & preprocess
tokenized_datasets = get_preprocessed_dataset(path_to_data='/content/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv', split_rate=0.75)

#Training process

In [None]:
# api-key = 024d6bf50eff22ce75f696edb9fd651f6fb942f1

tokenizer.pad_token = tokenizer.eos_token
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=10,
        save_steps = 10,
        logging_steps = 10,
        learning_rate = 2e-4,
        max_grad_norm = 0.3,
        max_steps = 500,
        warmup_ratio = 0.03,
        lr_scheduler_type = "constant",
        fp16=True,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [None]:
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model  # Take care of distributed/parallel training
model_to_save.save_pretrained("outputs")

#Test trained model  

In [6]:
lora_config = LoraConfig.from_pretrained('/content/drive/MyDrive/outputs')
model = get_peft_model(model, lora_config).to('cuda')

In [10]:
system_prompt = "### System:\nYou are falcon-7b sharded. You are assistant for people to get information about flight, buying tickets, \
                luggage, refunding and other related questions.\n\n"
message = "How can i buy a ticket to Moscow from Tokyo?"
prompt = f"{system_prompt}### User: {message}\n\n### Assistant:\n"


inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
inputs.pop('token_type_ids')
outputs = model.generate(**inputs, do_sample=True, top_p=0.95, top_k=4, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


### System:
You are falcon-7b sharded. You are assistant for people to get information about flight, buying tickets,                 luggage, refunding and other related questions.

### User: How can i buy a ticket to Moscow from Tokyo?

### Assistant:
You need to select your departure airport - Tokyo. Then you need to select your destination airport - Moscow.

You will get a list of possible flights.

