In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m77.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproje

In [None]:
from datasets import load_dataset

# First 10% of the finance-alpaca training split.
# NOTE(review): `data` is reassigned by the lamini_docs load in the next code
# cell, so this download is not used by the rest of the notebook.
data = load_dataset(
    "gbharti/finance-alpaca",
    split="train[:10%]",
)

Dataset({
    features: ['instruction', 'output', 'input', 'text'],
    num_rows: 6891
})

In [None]:
# Load all splits of the Lamini documentation Q&A dataset from the Hub.
lamini_repo = "lamini/lamini_docs"
data = load_dataset(lamini_repo)

In [None]:
# Show the available splits with their columns and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})

In [None]:
# Import the necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hub identifier of the base checkpoint that will be fine-tuned
model_id = "mistralai/Mistral-7B-v0.1"

# bitsandbytes options used when the model is loaded in a later cell
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation on top of the 4-bit weights
    bnb_4bit_quant_type="nf4",              # nf4 quantization method
    bnb_4bit_use_double_quant=True,         # double quantization of the 4-bit weights
    load_in_4bit=True,                      # load the weights with 4-bit quantization
)

# The quantized model itself is loaded further down, right before LoRA is applied:
# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

# Tokenizer of the same checkpoint; add_eos_token=True asks it to add an
# end-of-sequence token to the text it encodes
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

In [None]:
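# NOTE: legacy prompt builder for datasets with 'instruction'/'input'/'output' columns (e.g. finance-alpaca); kept commented out for reference only.
# # Define a function to generate a prompt text based on a data point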
# def generate_prompt(data_point):
#     """
#     Generate input text based on a prompt, task instruction, (context info.), and answer

#     :param data_point: dict: Data point
#     :return: str: prompt text (not tokenized)
#     """
#     # Check if the data point has additional context information
#     if data_point['input']:
#         # Create a text with instruction, input, and response
#         text = 'Below is an instruction that describes a task, paired with an input that provides' \
#                ' further context. Write a response that appropriately completes the request.\n\n'
#         text += f'### Instruction:\n{data_point["instruction"]}\n\n'
#         text += f'### Input:\n{data_point["input"]}\n\n'
#         text += f'### Response:\n{data_point["output"]}'

#     # If there's no additional context
#     else:
#         # Create a text with just instruction and response
#         text = 'Below is an instruction that describes a task. Write a response that ' \
#                'appropriately completes the request.\n\n'
#         text += f'### Instruction:\n{data_point["instruction"]}\n\n'
#         text += f'### Response:\n{data_point["output"]}'
#     return text

In [None]:
# Define a function to generate a prompt text based on a data point
def generate_prompt(data_point):
    """
    Build the Alpaca-style training text for one question/answer record.

    :param data_point: dict: record providing the "question" and "answer" keys
    :return: str: plain (untokenized) prompt made of a fixed preamble, an
        "### Instruction:" section holding the question and a "### Response:"
        section holding the answer
    """
    preamble = (
        'Below is an instruction that describes a task. Write a response that '
        'appropriately completes the request.\n\n'
    )
    instruction_section = f'### Instruction:\n{data_point["question"]}\n\n'
    response_section = f'### Response:\n{data_point["answer"]}'
    return "".join((preamble, instruction_section, response_section))

In [None]:
# Keep only the "train" split: from here on `data` refers to that single split
# instead of the DatasetDict returned by load_dataset.
data = data["train"]

In [None]:
# Check which dataset class `data` currently is.
type(data)

datasets.arrow_dataset.Dataset

In [None]:
# Inspect the current columns and row count.
data

Dataset({
    features: ['question', 'answer'],
    num_rows: 1260
})

In [None]:
# Drop the tokenized columns so that only the raw 'question' and 'answer'
# text remains; the prompts are tokenized again further down.
columns_to_drop = ["input_ids", "attention_mask", "labels"]
data = data.remove_columns(columns_to_drop)


In [None]:
# Inspect the dataset after the column removal.
data

Dataset({
    features: ['question', 'answer'],
    num_rows: 1134
})

In [None]:
# Build the training text of every record and store it in a new "prompt" column
text_column = [generate_prompt(data_point) for data_point in data]
data = data.add_column("prompt", text_column)

# Shuffle the dataset with a fixed seed
data = data.shuffle(seed=1234)

# Tokenize the "prompt" column with the tokenizer, processing the data in batches
data = data.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

# Split 90% / 10% into training and testing subsets. The split is seeded as
# well: train_test_split shuffles on its own, so without a seed the held-out
# examples would differ on every run.
data = data.train_test_split(test_size=0.1, seed=1234)
train_data = data["train"]
test_data = data["test"]

Map:   0%|          | 0/1260 [00:00<?, ? examples/s]

In [None]:
# Show the train/test splits produced by train_test_split.
data

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'prompt', 'input_ids', 'attention_mask'],
        num_rows: 1134
    })
    test: Dataset({
        features: ['question', 'answer', 'prompt', 'input_ids', 'attention_mask'],
        num_rows: 126
    })
})

In [None]:
# Inspect the training subset.
train_data

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels', 'prompt'],
    num_rows: 1134
})

Dataset({
    features: ['instruction', 'output', 'input', 'text', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 6201
})

In [None]:
# Define a function to print the number of trainable parameters in the model
def print_trainable_parameters(model):
    """
    Print the number of trainable parameters, the total number of parameters
    and the percentage of parameters that are trainable.

    :param model: object exposing ``named_parameters()`` that yields
        ``(name, parameter)`` pairs; each parameter must provide ``numel()``
        and ``requires_grad``
    :return: None (the summary is printed to stdout)
    """
    trainable_params = 0
    all_param = 0
    # Iterate through model parameters
    for _, param in model.named_parameters():
        num_params = param.numel()
        # bitsandbytes stores 4-bit weights packed two per element, so numel()
        # reports half of the real count for those tensors.
        # NOTE(review): assumes the default 8-bit quant storage - confirm if
        # a different bnb_4bit_quant_storage is ever configured.
        if param.__class__.__name__ == "Params4bit":
            num_params *= 2
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    # Guard against a model without parameters instead of dividing by zero
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )


In [None]:
# Import necessary components from the "peft" library
from peft import LoraConfig, get_peft_model

In [None]:
# Configuration of the LoRA (Low-Rank Adaptation) adapter that will be trained
lora_config = LoraConfig(
    task_type="CAUSAL_LM",                                     # causal language modeling
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],   # module names that receive LoRA layers
    r=8,                                                       # rank of the low-rank update matrices
    lora_alpha=32,                                             # LoRA scaling hyperparameter
    lora_dropout=0.05,                                         # dropout probability inside the LoRA layers
    bias="none",                                               # bias handling: "none"
)

In [None]:
# Import the necessary function from the "peft" library to prepare a model for k-bit training
from peft import prepare_model_for_kbit_training

In [None]:
# Load the base checkpoint using the 4-bit quantization settings defined in
# bnb_config; the {"": 0} device map assigns the whole model to device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Enable gradient checkpointing for the model
model.gradient_checkpointing_enable()

# Prepare the quantized model for k-bit training
model = prepare_model_for_kbit_training(model)

# Get a model with LoRA applied to it using the defined configuration.
# get_peft_model injects the LoRA layers into `model` itself, so `model` and
# `peft_model` share the same adapted modules.
peft_model = get_peft_model(model, lora_config)

# Print the number of trainable parameters in the model after applying LoRA
print_trainable_parameters(peft_model)

# NOTE: the former `model.add_adapter(lora_config, adapter_name="adapter")`
# call was removed. It attached a second, redundant LoRA adapter on top of the
# one created by get_peft_model above (and SFTTrainer also receives
# `peft_config`), which made it unclear which adapter was actually trained.

trainable params: 6815744 || all params: 3758886912 || trainable%: 0.18132346515244138


In [None]:
from huggingface_hub import notebook_login
# Show the interactive Hugging Face login widget to authenticate this session
# with the Hub (the token is typed into the widget, never stored in the notebook).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install -q trl

In [None]:
# Supervised fine-tuning of the LoRA adapter with TRL's SFTTrainer
import transformers

from trl import SFTTrainer

# Reuse the end-of-sequence token for padding
tokenizer.pad_token = tokenizer.eos_token
# Release cached GPU memory left over from earlier cells before training
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    dataset_text_field="prompt",
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,   # 1 x 4 = 4 samples per optimizer step
        # 0.03 is a fraction of the training steps, i.e. a warmup *ratio*;
        # `warmup_steps` expects an absolute number of steps.
        warmup_ratio=0.03,
        max_steps=100,
        learning_rate=2e-4,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        # NOTE(review): with max_steps=100 the run stops well before one full
        # epoch, so an "epoch" save strategy writes no intermediate checkpoint.
        save_strategy="epoch",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
# save_dir = f'{output_dir}/final'

# trainer.save_model(save_dir)
# print("Saved model to:", save_dir)

In [None]:
# Turn the generation cache off for training, then run the fine-tuning loop.
# NOTE(review): use_cache is not switched back on before the generation cells
# further down - confirm whether that is intended.
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Step,Training Loss
1,2.4176
2,2.298
3,2.2789
4,2.0385
5,1.3791
6,1.47
7,1.2087
8,0.7525
9,0.9597
10,1.0478


TrainOutput(global_step=100, training_loss=1.0292343890666962, metrics={'train_runtime': 177.2982, 'train_samples_per_second': 2.256, 'train_steps_per_second': 0.564, 'total_flos': 1970743650385920.0, 'train_loss': 1.0292343890666962, 'epoch': 0.35})

In [None]:
from IPython.display import HTML, display

def set_css():
    """Display a <style> rule that makes <pre> blocks wrap long lines."""
    wrap_rule = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
    display(HTML(wrap_rule))

# Run set_css before every cell execution so that the wrapping rule is applied
# to each cell's output. Re-running this cell registers the callback again.
get_ipython().events.register('pre_run_cell', set_css)


In [None]:
def get_completion(query: str, model, tokenizer) -> str:
    """
    Generate a completion for `query` with the fine-tuned model.

    The prompt uses the same template as the training prompts built by
    generate_prompt ("### Instruction:" / "### Response:"), without the answer,
    so the model is queried in the format it was trained on.

    :param query: str: question or instruction to answer
    :param model: causal language model exposing ``generate`` (and normally a
        ``device`` attribute)
    :param tokenizer: tokenizer matching `model`
    :return: str: decoded text of the first generated sequence, i.e. the prompt
        followed by the completion, special tokens included
    """
    prompt = (
        "Below is an instruction that describes a task. Write a response that "
        "appropriately completes the request.\n\n"
        f"### Instruction:\n{query}\n\n"
        "### Response:\n"
    )

    encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    # The tokenizer was created with add_eos_token=True, which appends an
    # end-of-sequence token to the prompt. Generating after an EOS token makes
    # the model continue from an already "finished" sequence (and triggers the
    # right-padding warning because pad == eos), so drop it before generation.
    if input_ids.shape[-1] > 1 and input_ids[0, -1].item() == tokenizer.eos_token_id:
        input_ids = input_ids[:, :-1]
        attention_mask = attention_mask[:, :-1]

    # Run on the device the model lives on; fall back to the first GPU
    device = getattr(model, "device", "cuda:0")

    generated_ids = model.generate(
        input_ids=input_ids.to(device),
        attention_mask=attention_mask.to(device),
        max_new_tokens=1000,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    decoded = tokenizer.batch_decode(generated_ids)
    return decoded[0]

In [None]:
# Try a finance question (a topic outside the Lamini Q&A data used for training)
finance_query = "Will capital gains affect my tax bracket?"
result = get_completion(query=finance_query, model=model, tokenizer=tokenizer)
print(result)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


<s> 
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  ### Question:
  Will capital gains affect my tax bracket?

  ### Answer:
  </s>т:
  Capital gains can have different effects on your tax rate depending on which bracket they fall into when considering short-term capital gains and long-term capital gains. Short-term capital gains are taxed at the same rate as ordinary income, while long-term capital gains are taxed at a lower rate. This can impact your tax bracket if you have a high number of capital gains in a given year.

  To determine the number of long- and short-term gains you have is to check with your tax advisor. They will be able to go over your taxes and determine if any long- and short-term gains you have are going to qualify you for a higher tax bracket.

  The specific tax impact of capital gains on your tax bracket depends on the specific amount of capital gains you have, the structure of your overall taxable

In [None]:
# Take the first held-out record and show its question and reference answer
first_test_example = test_data[0]
test_question = first_test_example['question']
test_answer = first_test_example['answer']
print("Question input (test):", test_question)
print("Target answer output (test):", test_answer)



Question input (test): How does Lamini handle the generation of coherent and contextually appropriate responses in conversational settings?
Target answer output (test): Lamini uses a combination of natural language processing and machine learning techniques to analyze the context of a conversation and generate responses that are both coherent and appropriate. It also allows for the addition of new data to improve its performance over time.


In [None]:
# Ask the fine-tuned model the held-out question and print its full output
result = get_completion(test_question, model, tokenizer)
print(result)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


<s> 
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  ### Question:
  How does Lamini handle the generation of coherent and contextually appropriate responses in conversational settings?

  ### Answer:
  </s> Lamini efficiently handles the generation of coherent and structurally relevant responses in varied conversational scenarios, owing to its capability of acquiring and processing contextual information. The LLM (Large Language Model) engine employed by Lamini enables it to identify relevant patterns and language cues, facilitating the generation of precise and contextually relevant responses. This allows Lamini to adaptively address the specific requirements of diverse users, resulting in seamless interactions and enhanced user satisfaction. Through its efficient approach to conversational engagement, Lamini emerges as a versatile tool in providing personalized and impactful communication experiences with greater ease and