# 💬 Digital Leader Persona Chatbot Training
This notebook walks through the full pipeline:
1. Parsing WhatsApp exports
2. Building instruction-tuning data
3. Loading and preparing the model with LoRA
4. Fine-tuning the model
5. Inference example


In [2]:
import re
import json
from pathlib import Path

def extract_user_messages(txt_path: str, sender_name: str) -> list[str]:
    pattern = re.compile(r'\d+/\d+/\d+, \d+:\d+ (?:AM|PM) - (.*?): (.*)')
    msgs = []
    with open(txt_path, 'r', encoding='utf-8') as f:
        for line in f:
            m = pattern.match(line)
            if m and m.group(1) == sender_name:
                msgs.append(m.group(2).strip())
    return msgs

# Example usage
txt_file = 'parsed_conversations_fixed.txt'
sender = 'Leader'
messages = extract_user_messages(txt_file, sender)
assert len(messages) > 1, 'Not enough messages to train on'  # Ensure data exists


In [2]:
def build_jsonl(messages: list[str], out_path: str):
    persona_desc = 'Respond like a wise, bold, visionary leader.'
    data = []
    for i in range(len(messages)-1):
        prompt = f"{persona_desc}\nQ: {messages[i]}\n"
        response = f"A: {messages[i+1]}"
        data.append({
            'instruction': prompt,
            'input': '',
            'output': response
        })
    with open(out_path, 'w', encoding='utf-8') as f:
        for rec in data:
            f.write(json.dumps(rec) + '\n')

jsonl_path = 'leader_persona_train.jsonl'
build_jsonl(messages, jsonl_path)
print(f"✅ Training data written to {jsonl_path}")


✅ Training data written to leader_persona_train.jsonl


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType

model_id = 'mistralai/Mistral-7B-Instruct-v0.2'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map='auto'
)
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=8, lora_alpha=16, target_modules=['q_proj','v_proj'],
    lora_dropout=0.1, bias='none', task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)
print('🔄 Model and tokenizer ready for LoRA')


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.


🔄 Model and tokenizer ready for LoRA


In [2]:
# from datasets import load_dataset
# from transformers import TrainingArguments, Trainer

# dataset = load_dataset('json', data_files=jsonl_path)
# training_args = TrainingArguments(
#     output_dir='./leader_bot_model', per_device_train_batch_size=1,
#     gradient_accumulation_steps=4, learning_rate=2e-4,
#     num_train_epochs=3, logging_dir='./logs',
#     save_steps=100, save_total_limit=1, fp16=True
# )
# trainer = Trainer(
#     model=model, args=training_args,
#     train_dataset=dataset['train'], tokenizer=tokenizer
# )
# print('🏋️‍♂️ Starting training...')
# trainer.train()
# print('✅ Training complete. Model saved.')

# from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
# from datasets import load_dataset

# # === Load your dataset ===
# dataset = load_dataset("json", data_files={"train": "leader_persona_train.jsonl"})

# # === Data collator for Causal LM (no masking) ===
# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=False
# )

# # === Define training arguments ===
# training_args = TrainingArguments(
#     output_dir="./leader_bot_model",
#     per_device_train_batch_size=1,
#     gradient_accumulation_steps=4,
#     learning_rate=2e-4,
#     num_train_epochs=3,
#     logging_dir="./logs",
#     save_steps=100,
#     save_total_limit=1,
#     fp16=True,  # You are using float16 and LoRA
#     report_to="none"
# )

# # === Use the PEFT-prepared LoRA model you already set up ===
# trainer = Trainer(
#     model=model,  # ← this is your LoRA-wrapped model
#     args=training_args,
#     train_dataset=dataset["train"],
#     data_collator=data_collator
# )

# # === Start training ===
# print("🏋️‍♂️ Starting training...")
# trainer.train()
# print("✅ Training complete. Model saved to:", training_args.output_dir)

from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    Trainer, TrainingArguments, DataCollatorForLanguageModeling
)
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
import torch

# === Config ===
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
jsonl_path = "leader_persona_train.jsonl"

# === Load tokenizer ===
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # Needed for padding

# === Load full model (no bitsandbytes, no quantization) ===
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,  # Best for CPU or basic GPU
    device_map="auto"
)

# === Apply LoRA ===
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)

# === Load dataset ===
dataset = load_dataset("json", data_files={"train": jsonl_path})

# === Format into causal LM prompt format ===
def format_for_causal_lm(example):
    return {
        "text": f"{example['instruction']}\n{example['input']}\n{example['output']}"
    }

dataset = dataset.map(format_for_causal_lm)
dataset = dataset.remove_columns(["instruction", "input", "output"])

# === Tokenize the text ===
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

dataset = dataset.map(tokenize, batched=True)

# === Data collator for causal LM ===
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# === Training arguments ===
training_args = TrainingArguments(
    output_dir="./leader_bot_model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_dir="./logs",
    save_steps=100,
    save_total_limit=1,
    fp16=False,  # No GPU = no fp16
    report_to="none",
    remove_unused_columns=False
)

# === Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    data_collator=data_collator
)

# === Train! ===
print("🏋️‍♂️ Starting training...")
trainer.train()
print("✅ Training complete. Model saved to:", training_args.output_dir)





Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk and cpu.


NotImplementedError: Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.

In [None]:
def chat_with_leader(prompt: str, max_new_tokens: int = 100) -> str:
    persona_desc = 'Respond like a wise, bold, visionary leader.'
    input_text = f"{persona_desc}\nQ: {prompt}\nA:"
    inputs = tokenizer(input_text, return_tensors='pt').to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example Chat
print('🗣️ You: What if people laugh at my dreams?')
print('🤖 Leader AI:', chat_with_leader('What if people laugh at my dreams?'))
