In [None]:
# Installs all packages, including Unsloth, Xformers, etc
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" perf acceleratate bitsandbytes

## Background
What if you could train an AI on someone's text messages and then talk with it? Is it possible to fine-tune a pre-existing model without relying on any online LLM? Fine-tuning big models (anything >7B) requires a minimum of 32 GB of VRAM. In this project we will use WhatsApp chats as the training data.

We assume that all exported chats (WhatsApp, Telegram, Messenger) are meaningful conversations and add value to the data.

## Data Filtering
Exported data contains many irregularities such as `<Media omitted>`, `This message was deleted`, etc. It also contains timestamps. I removed them and converted the chat history format into `Prompt: Response`. To extract the messages, I used `regex`. Additionally, for privacy reasons, I dropped every message that contains a link or a Gmail address.

Additionally, I included a list of filler words that the fine-tuned model should not learn from, such as `Ok`, `Yup`, `Hmm`, and `K`.

In [None]:
# Short filler replies that are meant to be excluded from the training pairs.
# Edit this list so that it matches the way you and your contacts actually chat.
filler_words = [
    "Ok",
    "Okay",
    "Yup",
    "Hmm",
]

# Folder holding the exported chats; the notebook's own directory by default.
chat_dir = "./"

In [None]:
import csv
import os
import re
import shutil
from typing import List, Tuple, Dict

class WhatsAppChatProcessor:
    """
    Loads, filters, and pairs up messages from a WhatsApp chat export.

    Intended call order for one exported ``.txt`` file::

        raw = processor.open_chat_file(directory, filename)
        messages = processor.msg_filter_basic(raw)
        merged = processor.merge_consecutive_messages(messages)
        pairs = processor.pair_conversations(merged, filler_words)

    Two attributes are filled in as a side effect of that flow:
    ``contact_name`` (by ``open_chat_file``) and ``my_name``
    (by ``merge_consecutive_messages``).
    """

    # Timestamp that precedes every exported message, e.g. "12/31/23, 9:41 PM".
    DATE_PATTERN = r'\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}\s*(?:AM|PM|am|pm)?'

    # A message containing any of these substrings is dropped as a whole:
    # WhatsApp system notices, media placeholders, links, Gmail addresses,
    # deletion/edit markers and attachment notes.
    EXCLUDED_PHRASES = (
        "Tap to learn more.",
        "<Media omitted>",
        "http://",
        "https://",
        "@gmail.com",
        "This message was deleted",
        "You deleted this message",
        "<This message was edited>",
        "(file attached)",
    )

    def __init__(self):
        # Name of the other party, derived from the export's filename.
        self.contact_name: str = ""
        # Name of the notebook user, inferred from the chat participants.
        self.my_name: str = ""

    def open_chat_file(self, directory: str, filename: str) -> str:
        """
        Read a chat export as UTF-8 text and remember the contact's name.

        The contact name is the filename with the ``"WhatsApp Chat with "``
        prefix and the ``".txt"`` suffix removed; it is stored on
        ``self.contact_name``.

        Args:
            directory: Folder that contains the export.
            filename: Name of the export file inside ``directory``.

        Returns:
            The full, unmodified text of the file.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        self.contact_name = filename.replace("WhatsApp Chat with ", "").replace(".txt", "")
        full_path = os.path.join(directory, filename)
        with open(full_path, encoding='utf-8') as f:
            return f.read()

    def msg_filter_basic(self, raw_text: str) -> List[Tuple[str, str]]:
        """
        Split the raw export into ``(sender, message)`` tuples.

        Messages that contain any entry of ``EXCLUDED_PHRASES`` are discarded.
        The remaining message texts are stripped of surrounding whitespace;
        multi-line messages keep their inner newlines.

        Args:
            raw_text: Text returned by ``open_chat_file``.

        Returns:
            The surviving ``(sender, message)`` tuples in chat order.
        """
        # " - <sender>: <message>" where the message runs (lazily, across
        # newlines) up to the next "<timestamp> - " or the end of the text.
        # Lines without a "<sender>: " part (system notices) never match.
        pattern = (
            rf' - ([^:]+): (.*?)'
            rf'(?={self.DATE_PATTERN} - |$)'
        )

        filtered: List[Tuple[str, str]] = []
        for sender, msg in re.findall(pattern, raw_text, re.DOTALL):
            if any(phrase in msg for phrase in self.EXCLUDED_PHRASES):
                continue
            filtered.append((sender, msg.strip()))
        return filtered

    def merge_consecutive_messages(self, messages: List[Tuple[str, str]]) -> List[Dict[str, List[str]]]:
        """
        Collapse runs of messages from the same sender into single entries.

        Also sets ``self.my_name`` to the first sender (in chat order) whose
        name differs from ``self.contact_name``. If no such sender exists,
        ``self.my_name`` is left untouched.

        Args:
            messages: ``(sender, message)`` tuples in chat order.

        Returns:
            A list of single-key dicts ``{sender: [message, ...]}``, one per
            uninterrupted run of messages by that sender.
        """
        merged: List[Dict[str, List[str]]] = []
        current_sender = None
        buffer: List[str] = []

        for sender, text in messages:
            if sender == current_sender:
                buffer.append(text)
                continue
            # Sender changed: store the finished run and start a new one.
            if current_sender is not None:
                merged.append({current_sender: buffer})
            current_sender = sender
            buffer = [text]

        # Store the last run, if there was any message at all.
        if current_sender is not None:
            merged.append({current_sender: buffer})

        # Pick the first non-contact sender in order of appearance. Scanning
        # the ordered list (instead of popping from a set) keeps the choice
        # deterministic when more than two people take part in the chat.
        for entry in merged:
            sender = next(iter(entry))
            if sender != self.contact_name:
                self.my_name = sender
                break

        return merged

    def pair_conversations(self, merged: List[Dict[str, List[str]]], filler_words: List[str]) -> List[List[str]]:
        """
        Build ``[contact_message, my_response]`` training pairs.

        Contact entries are collected until an entry from ``self.my_name``
        arrives; the collected text (joined with newlines) is then paired
        with that response. A contact entry is ignored when it consists of a
        single message that is either listed in ``filler_words`` (exact,
        case-sensitive match) or exactly one character long. Entries from
        ``self.my_name`` with no pending contact text, and entries from any
        other sender, are ignored.

        Args:
            merged: Output of ``merge_consecutive_messages``.
            filler_words: Replies that should not start a training pair.

        Returns:
            The list of ``[contact_message, my_response]`` pairs in chat order.
        """
        pairs: List[List[str]] = []
        # Contact text waiting for a response; non-empty means "expecting one".
        contact_buffer: List[str] = []

        for entry in merged:
            sender, lines = next(iter(entry.items()))
            text = "\n".join(lines).strip()

            if sender == self.contact_name:
                is_filler = len(lines) == 1 and (text in filler_words or len(text) == 1)
                if not is_filler:
                    contact_buffer.append(text)
            elif sender == self.my_name and contact_buffer:
                pairs.append(["\n".join(contact_buffer), text])
                contact_buffer = []

        return pairs


In [None]:
# Start a fresh output CSV that contains only the header row.
# UTF-8 is set explicitly so non-ASCII chat text (emoji, non-Latin scripts)
# does not depend on the platform's default encoding; newline="" is what the
# csv module expects from files it writes to.
with open("all_chat_data.csv", "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerow(["Prompt", "Response"])

# Unpack every zipped export found in chat_dir so that the .txt files inside
# become available next to the archives.
for file in os.listdir(chat_dir):
    if file.endswith('.zip'):
        full_path = os.path.join(chat_dir, file)
        shutil.unpack_archive(full_path, chat_dir)

In [None]:
# Convert every exported .txt chat in chat_dir into Prompt/Response rows and
# append them to all_chat_data.csv.
# Files are visited in sorted order so the resulting CSV is reproducible.
for file in sorted(os.listdir(chat_dir)):
    if not file.endswith('.txt'):
        continue
    print("Processing: ",file)
    # A new processor per file: contact_name / my_name are per-chat state.
    processor = WhatsAppChatProcessor()
    raw_text = processor.open_chat_file(chat_dir, file)
    messages = processor.msg_filter_basic(raw_text)
    merged_messages = processor.merge_consecutive_messages(messages)
    chat_pairs = processor.pair_conversations(merged_messages, filler_words)
    # csv.writer quotes commas, quotes and newlines inside messages;
    # newline="" and UTF-8 keep the file portable across platforms.
    with open("all_chat_data.csv", "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(chat_pairs)
print("Successfully Processed all the chats... Generated CSV File of chats is saved in Current directory with the name 'all_chat_data.csv'")

## Fine-tuning with constraints

Fine-tuning a 7B parameter model with little VRAM isn't exactly possible. We will use [Quantization](https://huggingface.co/docs/optimum/en/concept_guides/quantization). Specifically, we will use 4-bit quantization. For the rest of the fine-tuning, I will be using [Unsloth](https://github.com/unslothai/unsloth), which uses 80% less VRAM. 

For fine-tuning, I am using `Llama3` 8B Instruct as my base model. You can also try Mixtral or Gemma.


In [None]:
from unsloth import FastLanguageModel
import torch
# Loader settings. max_seq_length is reused later when the trainer is built.
max_seq_length = 2048
# None lets the library pick the dtype (float16 on Tesla T4 / V100,
# bfloat16 on Ampere and newer, per the original note).
dtype = None
# Load the weights 4-bit quantized to lower memory use; may be set to False.
load_in_4bit = True

# Reference list of pre-quantized 4-bit checkpoints published by Unsloth.
# It is informational only: nothing in this cell reads it.
# More models: https://huggingface.co/unsloth
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
]

# Load the Llama-3 8B Instruct checkpoint together with its tokenizer.
# For gated models, additionally pass token="hf_...".
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

Add LoRA adapters so that we only need to update 1–10% of all parameters.

In [None]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with LoRA adapters.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # LoRA rank; any value > 0 (8-128 suggested)
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,                        # 0 is the optimized setting
    bias="none",                           # "none" is the optimized setting
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,                      # rank-stabilized LoRA is off
    loftq_config=None,                     # LoftQ is off
)

Preparing a dataset from the WhatsApp chat data

In [None]:
import pandas as pd
from datasets import Dataset, load_dataset
from unsloth.chat_templates import get_chat_template

# Attach the Llama-3 chat template to the tokenizer. The mapping tells the
# template which keys and role labels the conversation dicts use:
# "from"/"value" instead of "role"/"content", "human"/"gpt" for the speakers.
sharegpt_mapping = {
    "role": "from",
    "content": "value",
    "user": "human",
    "assistant": "gpt",
}
tokenizer = get_chat_template(tokenizer, chat_template="llama-3", mapping=sharegpt_mapping)

def formatting_prompts_func(examples):
    """
    Render each conversation of a batch into one training string.

    Args:
        examples: Batch mapping whose "conversations" entry is a list of
            conversations (each a list of message dicts).

    Returns:
        A dict ``{"text": [...]}`` holding one chat-template-formatted string
        per conversation, in the same order. The strings are not tokenized
        and carry no trailing generation prompt.
    """
    texts = []
    for convo in examples["conversations"]:
        rendered = tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False,
        )
        texts.append(rendered)
    return {"text": texts}

# Load the Prompt/Response pairs produced by the chat-processing step.
df = pd.read_csv("all_chat_data.csv")

# pandas reads empty cells as NaN, which str() would turn into the literal
# text "nan"; drop such incomplete rows instead of training on them.
chat_pairs = df.dropna(subset=["Prompt", "Response"])

# One two-turn conversation per row. The speaker labels must be the ones
# declared in the tokenizer's chat-template mapping: "human" for the user
# turn and "gpt" for the assistant turn.
conversations = [
    [
        {"from": "human", "value": str(prompt)},
        {"from": "gpt", "value": str(response)},
    ]
    for prompt, response in zip(chat_pairs["Prompt"], chat_pairs["Response"])
]

# Wrap the conversations in a Dataset and add the rendered "text" column.
dataset = Dataset.from_dict({"conversations": conversations})
dataset = dataset.map(formatting_prompts_func, batched=True)

Quick sanity check: inspect one of the prepared conversations.

In [None]:
# Display the conversation stored at index 5 to check the turn structure
# before training (needs at least six rows in the dataset).
dataset[5]["conversations"]

I train the model using Hugging Face TRL's `SFTTrainer`. I set the number of epochs to 1 for testing purposes, but it is generally recommended to set it to 2 or 3. This is worth experimenting with. Generally, do not do more than 3 epochs if the `training loss` isn't decreasing with each epoch.

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Use bfloat16 when the hardware supports it, float16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation settings for the fine-tuning run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning on the rendered "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

In [None]:
# Run the fine-tuning job; the value returned by trainer.train() is kept in
# trainer_stats for later inspection.
trainer_stats = trainer.train()

## Inference

We have trained the model. Let's do some inference on it!

In [None]:
from unsloth.chat_templates import get_chat_template
from transformers import TextStreamer

# Apply the Llama-3 chat template with ShareGPT-style keys ("from"/"value",
# "human"/"gpt") so this cell also works when run on its own.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3",
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
)

# Stream generated text to the cell output as it is produced.
text_streamer = TextStreamer(tokenizer)
# Switch the model to Unsloth's inference mode.
FastLanguageModel.for_inference(model)

# A single user turn to try the fine-tuned model on.
messages = [{"from": "human", "value": "Pagal ho gya hai kya"}]

# Tokenize the prompt; add_generation_prompt=True is required for generation.
prompt_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
inputs = prompt_ids.to("cuda")

output = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
)

## Saving model

We save the fine-tuned model as LoRA adapters. We can also save it as GGUF (see below) or in 16-bit.

In [None]:
# Write the fine-tuned model to the local "lora_model" directory.
# Only `model` is saved here; the tokenizer is not written by this call.
model.save_pretrained("lora_model") # Local saving

We save it as GGUF so that we can use it with Ollama. Unsloth allows all methods such as `q4_k_m`. 

In [None]:
# GGUF export switches: flip them to choose what this cell does.
SAVE_GGUF_LOCALLY = True
PUSH_GGUF_TO_HUB = False

# Local GGUF export into "model" (8-bit Q8_0 according to the original note).
if SAVE_GGUF_LOCALLY:
    model.save_pretrained_gguf("model", tokenizer)

# Optional upload to the Hugging Face Hub; fill in the repo name and token first.
if PUSH_GGUF_TO_HUB:
    model.push_to_hub_gguf("hf/model", tokenizer, token = "")

And that should be it with our model!