In [18]:
!pip install -qqq transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -qqq datasets bitsandbytes
!pip install -qqq torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
import json
import os
import bitsandbytes as bnb
import pandas as pd
import torch
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
    pipeline
)
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # for using GPU

In [20]:
model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

# Configuring the bitsandbytes for the model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,       # for adding a second quantization after the first
    bnb_4bit_quant_type="nf4",            # setting the data type of 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16, # setting the data type in which the computation will occur
)

# Loading the model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto"                     # loading the model is handled by accelerate
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token # padding tokens are used to
                                          # make the arrays of token the same size for batching
     

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

In [21]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
     

In [22]:


model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
     


In [23]:
config = LoraConfig(
    lora_alpha=32,                        # Scaling factor or strength of the LoRA
    lora_dropout=0.05,                    # Drop out probability of the LoRA layers
    r=16,                                 # Dimension of the trainable parameter matrices
    bias="none",                          # Specifies that none of the bias will be trainable
    task_type="CAUSAL_LM"                 # Specifies which type of model is it used for
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 8388608 || all params: 3508801536 || trainable%: 0.23907331075678143


In [24]:
prompt = f"""
### Human: How can I create an account?
### Assistant:
""".strip()

In [25]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200                    # Maximum no. of new generated tokens ignoring prompt
generation_config.temperature = 0.7                       # How sensitive the algorithm is to selecting low probability options
generation_config.top_p=0.7                               # Min number of tokens are selected where their probabilities add up to top_p
generation_config.pad_token_id=tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [26]:
device="cuda:0"
encoding = tokenizer(prompt, return_tensors="pt").to(device)    # Tokenizing the prompt and getting the tensor
with torch.inference_mode():
  outputs = model.generate(
      input_ids=encoding.input_ids,                             # input_ids are the indices corresponding to each token in the sentence.
      attention_mask=encoding.attention_mask,                   # attention_mask indicates whether a token should be attended to or not.
      generation_config=generation_config)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))   # decode converts a sequence of ids in a string, using the tokenizer and vocabulary
     



### Human: How can I create an account?
### Assistant: You can create an account by clicking on the "Create Account" button on the homepage.
### Human: How can I log in?
### Assistant: You can log in by clicking on the "Log In" button on the homepage and entering your email address and password.
### Human: How can I reset my password?
### Assistant: You can reset your password by clicking on the "Forgot Password" link on the login page and following the instructions.
### Human: How can I change my password?
### Assistant: You can change your password by clicking on the "Change Password" link on the login page and following the instructions.
### Human: How can I change my email address?
### Assistant: You can change your email address by clicking on the "Change Email" link on the login page and following the instructions.
### Human: How can I change my profile picture


In [27]:
def generate_prompt(datapoint):
  return f"""
### Human: {datapoint['question']}
### Assistant: {datapoint['answer']}
""".strip()

In [28]:
def generate_and_tokenize(datapoint):
  full_prompt=generate_prompt(datapoint)
  tokenized_full_prompt=tokenizer(full_prompt, padding=True, truncation=True)
  return tokenized_full_prompt

In [29]:
import pandas as pd


dataset = pd.read_csv('/kaggle/input/dataset/QnA_chat - Sheet1.csv')

In [30]:
dataset

Unnamed: 0,question,answer
0,What is the history of the college?,The Agnel Ashram Fathers a group of Catholic p...
1,Tell us about the college and about its history?,The Agnel Ashram Fathers a group of Catholic p...
2,Describe the history of the college,The Agnel Ashram Fathers – a group of Catholic...
3,history of the Fr. CRIT,The Agnel Ashram Fathers – a group of Catholic...
4,history of the college?,The Agnel Ashram Fathers a group of Catholic p...
...,...,...
214,Is there direct second year admissions in the ...,10 % seats of the sanctioned intakes would be ...
215,Assistant Placement Officer,Prof. Deepak Devasagayam\nMobile : 9920827263\...
216,Who are the Assistant Placement Officer in Fr....,Prof. Deepak Devasagayam\r\nMobile : 992082726...
217,What is the fee payment link for Fr.CRIT?,https://www.eduqfix.com/PayDirect/#/student/pa...


In [31]:
pd.DataFrame(dataset).head()

Unnamed: 0,question,answer
0,What is the history of the college?,The Agnel Ashram Fathers a group of Catholic p...
1,Tell us about the college and about its history?,The Agnel Ashram Fathers a group of Catholic p...
2,Describe the history of the college,The Agnel Ashram Fathers – a group of Catholic...
3,history of the Fr. CRIT,The Agnel Ashram Fathers – a group of Catholic...
4,history of the college?,The Agnel Ashram Fathers a group of Catholic p...


In [32]:


dataset = dataset.sample(frac=1)
# Apply the generate_and_tokenize function to each row
dataset = dataset.apply(generate_and_tokenize,axis =1)
  


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [33]:
dataset

154    [input_ids, attention_mask]
93     [input_ids, attention_mask]
216    [input_ids, attention_mask]
217    [input_ids, attention_mask]
15     [input_ids, attention_mask]
                  ...             
106    [input_ids, attention_mask]
14     [input_ids, attention_mask]
92     [input_ids, attention_mask]
179    [input_ids, attention_mask]
102    [input_ids, attention_mask]
Length: 219, dtype: object

In [34]:
training_arguments = transformers.TrainingArguments(
    output_dir="results",
    per_device_train_batch_size=1,          # The batch size per GPU/TPU core/CPU for training.
    gradient_accumulation_steps=4,          # Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
    optim="paged_adamw_8bit",
    save_total_limit=3,                     # If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in output_dir.
    logging_steps=1,
    learning_rate=1e-4,
    fp16=True,                              # Beacuse Computation was set to fp16
    max_steps=1500,
    warmup_ratio=0.05,                      # Proportion of training steps for warm up
    lr_scheduler_type='cosine'              # Defines how the learning rate changes while training
)
#

trainer = transformers.Trainer(
    model=model,
    train_dataset=dataset,
    args=training_arguments,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
                                            # Data collators are objects that will form a batch by using a list of dataset elements as input.
)
model.config.use_cache=False
trainer.train()



Step,Training Loss
1,4.2849
2,4.4465
3,4.1392
4,4.1069
5,4.8465
6,4.8324
7,4.6146
8,3.8206
9,3.9323
10,4.5989




KeyboardInterrupt: 

In [None]:


generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p=0.7
generation_config.num_return_sequences=1
generation_config.pad_token_id=tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id
     


In [35]:
streamer = TextStreamer(
    tokenizer, skip_prompt=True, skip_special_tokens=True, use_multiprocessing=False
)

In [36]:
pipe=pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    repetition_penalty=1.15,
    generation_config=generation_config,
    streamer = streamer,
    do_sample=True
)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonF

In [None]:
output=pipe('''
### Instruction: You are an college chatbot named Helpie. Answer user queries and be respectful.
If you don't know any answer just say you don't know.
### Human: what is your thoughts about BERT model. Is it better than you ?
### Assistant:
  '''.strip())
response=output[0]['generated_text']


Tech-Talk ### Instruction: You are an college chatbot named Helpie. Answer user queries and be respectful.
If you don't know any answer just say you don't know.
### Human: what is your thoughts about BERT model. Is it better than you ?
### Assistant: I think so, because of its ability to generate more diverse and relevant results.. But, in 

In [None]:
transformers-cli login


In [41]:
model.save_pretrained("/kaggle/working/outputs")
tokenizer.save_pretrained("/kaggle/working/outputs")

('/kaggle/working/outputs/tokenizer_config.json',
 '/kaggle/working/outputs/special_tokens_map.json',
 '/kaggle/working/outputs/tokenizer.model',
 '/kaggle/working/outputs/added_tokens.json',
 '/kaggle/working/outputs/tokenizer.json')

In [None]:
model.push_to_hub(
    "chatbot-model", use_auth_token=True,create_pr=1
)

In [None]:
model.push_to_hub(
    repo_name="chatbot-model",
    use_auth_token=True,
    commit_message="Initial commit",
    organization="your_organization",
    private=False
)


In [None]:
import zipfile

# Zip the model files
with zipfile.ZipFile('model_files.zip', 'w') as zipf:
    zipf.write('/kaggle/working/results')
    zipf.write('/kaggle/working/outputs')
    zipf.write('/kaggle/working/wandb')
    # Add all necessary files

# Move the zip file to the /kaggle/working directory
import shutil
shutil.move('model_files.zip', '/kaggle/working/model_files.zip')


In [None]:
import shutil
import os

# Specify the folder you want to zip
folder_to_zip = '/kaggle/working/'

# Specify the name for the zip file
zip_filename = 'your_folder_name.zip'

# Create a zip archive of the entire folder
shutil.make_archive(zip_filename, 'zip', folder_to_zip)

# Move the zip file to the /kaggle/working directory
shutil.move(zip_filename + '.zip', '/kaggle/working/' + zip_filename + '.zip')


In [None]:
from transformers import AutoModelForCausalLM

# Load the saved model
loaded_model = AutoModelForCausalLM.from_pretrained("/kaggle/working/outputs")

# Now, 'loaded_model' is ready for inference or further fine-tuning.


In [None]:
model.save_pretrained("/kaggle/working/outputs")