## Generating SFT training dataset


In [1]:
!uv pip install unsloth

[2mUsing Python 3.10.10 environment at: /home/zeus/miniconda3/envs/cloudspace[0m
[2mAudited [1m1 package[0m [2min 63ms[0m[0m


In [None]:
from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.3",
    dtype = None,
    max_seq_length = 4096,
    load_in_4bit = True,  
)

FastLanguageModel.for_inference(model)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.9.6: Fast Mistral patching. Transformers: 4.55.4.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.278 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096, padding_idx=770)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): M

In [None]:
SYSTEM_PROMPT = "" # Left blank for confidentiality

In [4]:

messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": "Cash flow management: salon with seasonal revenue - peak months ₦5,963,776, slow months ₦1,192,755. Fixed costs ₦373,723, variable costs 33% of revenue. How do I manage cash flow and should I get a working capital facility from Wema Bank?"},
]
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=1024)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

<Reasoning> I understand the user is asking about cash flow management for a salon with seasonal revenue. Let me think through this systematically...

Situation Analysis: The salon has a seasonal revenue pattern, with peak months generating ₦5,963,776 and slow months generating ₦1,192,755. Fixed costs amount to ₦373,723, and variable costs are 33% of the total revenue.

Nigerian Context: The Nigerian economy has a seasonal pattern, with some industries experiencing peaks and troughs. The salon industry is one of them, as people tend to spend more on personal grooming during festive seasons.

Options Available: To manage cash flow effectively, the user can consider the following options:
1. Building an emergency fund to cover slow months.
2. Implementing cost-cutting measures during slow months.
3. Exploring working capital facilities from banks like Wema Bank to bridge the cash flow gap.

Risk Assessment: The main risks to consider are:
1. Inadequate emergency fund leading to financial

### Open file to save training data

In [None]:
import csv

def save_training_data(result, file_path: str = "training_data.csv"):
  """Saves the training data to a csv file."""
  with open(file_path, 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(
        [
            result['developer'],
            result['user'],
            result['analysis'],
            result['final'],
            result['messages']
        ]
    )

# Insert csv header
with open('training_data.csv', 'w', newline='') as file:
  writer = csv.writer(file)
  writer.writerow(['developer', 'user', 'analysis', 'final', 'messages'])

### Define Output Cleaner class

In [None]:
import markdown
from bs4 import BeautifulSoup

class OutputCleaner:
  def __init__(self):
    pass
  def extract_reasoning(self, output: str) -> str:
    """Extracts the reasoning from the model output."""
    start_tag = "<Reasoning>"
    end_tag = "</Reasoning>"
    start_index = output.find(start_tag)
    end_index = output.find(end_tag)

    if start_index != -1 and end_index != -1:
      return output[start_index + len(start_tag):end_index].strip()
    else:
      return ""

  def extract_answer(self, output: str) -> str:
    """Extracts the answer from the model output."""
    start_tag = "<Answer>"
    end_tag = "</Answer>"
    start_index = output.find(start_tag)
    end_index = output.find(end_tag)

    if start_index != -1 and end_index != -1:
      return output[start_index + len(start_tag):end_index].strip()
    else:
      return ""

  def markdown_to_plain_text(self, output: str) -> str:
    html = markdown.markdown(output)
    soup = BeautifulSoup(html, 'html.parser')
    return soup.get_text()

### Load in financial qa dataset

In [7]:
import json
from typing import List
import random

def load_fin_questions(file_path: str) -> List:
  """Loads in financial qa dataset."""
  try:
    with open(file_path, "r") as file:
      return [q['question'] for q in json.load(file)["questions"]]
  except (FileNotFoundError, KeyError, json.JSONDecodeError) as e:
    print(f"Error opening {file_path}: {e}")
    return []

all_questions = []
all_questions.extend(load_fin_questions("personalized_nigerian_finance_questions.json"))
all_questions.extend(load_fin_questions("personalized_nigerian_finance_questions_1k.json"))

# Shuffle the questions
random.seed(42)
random.shuffle(all_questions)

### Run training dataset creation

In [None]:
from typing import List, Dict

output_cleaner = OutputCleaner()

def generate_training_dataset() -> List[Dict]:
	results = []
	for question in all_questions:
		messages = [
				{"role": "system", "content": SYSTEM_PROMPT},
				{"role": "user", "content": question},
		]
		inputs = tokenizer.apply_chat_template(
			messages,
			add_generation_prompt=True,
			tokenize=True,
			return_dict=True,
			return_tensors="pt",
		).to(model.device)

		outputs = model.generate(**inputs, max_new_tokens=1024)
		output = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:])
		reasoning = output_cleaner.markdown_to_plain_text(output_cleaner.extract_reasoning(output))
		answer = output_cleaner.extract_answer(output)
		if reasoning and  answer:
			result = {
						"developer": "You are FinBuddy, a financial advisory chatbot for Nigerians by Nigerians",
						"user": question,
						"analysis": reasoning,
						"final": answer,
						"messages": [
							{ "content": "You are FinBuddy, a financial advisory chatbot for Nigerians by Nigerians", "role": "system", "thinking": None }, 
							{ "content": question, "role": "user", "thinking": None },
							{ "content": answer,  "role": "assistant", "thinking": reasoning}
						],
			}
		else:
			print("One or both variable values are missing")
			print(f"Reasoning: {reasoning}\n Answer: {answer}")
			pass
		save_training_data(result)
		results.append(result)

	return results

result = print(generate_training_dataset())

KeyboardInterrupt: 