In [None]:
!pip install -U accelerate peft trl bitsandbytes transformers datasets



Collecting peft
  Downloading peft-0.17.1-py3-none-any.whl.metadata (14 kB)
Collecting trl
  Downloading trl-0.21.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting transformers
  Downloading transformers-4.55.3-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Downloading peft-0.17.1-py3-none-any.whl (504 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m504.9/504.9 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.21.0-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.9/511.9 kB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m15.7 MB/s[0m 

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# 4-bit NF4 quantization: the base weights are stored in 4 bits so the model
# uses far less GPU memory; matrix multiplies are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_name = "Qwen/Qwen2.5-3B"

# The tokenizer converts human text into the token ids the model consumes.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    # Batching needs a padding token; fall back to EOS if the checkpoint has none.
    tokenizer.pad_token = tokenizer.eos_token

# trust_remote_code is intentionally NOT enabled: the Qwen2 architecture ships
# with transformers, so there is no need to execute code from the Hub repo.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# The generation key/value cache is only useful at inference time.
model.config.use_cache = False


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/683 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [None]:
import json
import pandas as pd
import math

# Gross annual rent / sale price above which a property is labelled "Yes".
RENTAL_YIELD_THRESHOLD = 0.05


def build_prompt(row):
    """Render one HouseTS row as the property-description prompt text."""
    return f"""Property details:
City: {row['city']}
Zipcode: {row['zipcode']}
Median Sale Price: {row['median_sale_price']}
Median Rent: {row['Median Rent']}
Nearby: {row['school']} schools, {row['hospital']} hospitals, {row['restaurant']} restaurants
Agent Analysis:
- Median DOM: {row['median_dom']}
- Price per sqft: {row['median_ppsf']}
- Inventory: {row['inventory']}
Question: Is this a good investment?

Answer:"""


def label_investment(sale_price, rent):
    """Return the target completion for a (sale price, monthly rent) pair.

    The label is driven by the gross rental yield, i.e. the fraction of the
    purchase price earned back in rent each year. Missing inputs get an
    explicit "insufficient data" answer; previously a missing rent produced a
    NaN yield, which compared False against the threshold and was silently
    labelled as "yield too low".
    """
    if pd.isna(sale_price) or sale_price <= 0:
        return " No. Insufficient sale price data to determine investment potential."
    if pd.isna(rent):
        return " No. Insufficient rent data to determine investment potential."

    rental_yield = (rent * 12) / sale_price
    if rental_yield > RENTAL_YIELD_THRESHOLD:
        return " Yes. The rental yield is healthy and market conditions suggest stability."
    return " No. The rental yield is too low compared to the property value."


df = pd.read_csv("HouseTS.csv")

# One {"prompt", "completion"} record per CSV row.
records = [
    {
        "prompt": build_prompt(row),
        "completion": label_investment(row["median_sale_price"], row["Median Rent"]),
    }
    for _, row in df.iterrows()
]

# JSON Lines: one JSON object per line, which is what load_dataset("json") reads.
with open("investment_training.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(record) + "\n" for record in records)


In [None]:
from datasets import load_dataset

SAMPLE_SIZE = 1500  # maximum number of examples used for fine-tuning
SPLIT_SEED = 42     # fixes both the subsample and the train/test split

dataset = load_dataset("json", data_files="investment_training.jsonl")

all_examples = dataset["train"]
# Shuffle before selecting so the subset is drawn from the whole file rather
# than its first rows, and cap the size so a smaller file does not raise.
sampled_examples = all_examples.shuffle(seed=SPLIT_SEED).select(
    range(min(SAMPLE_SIZE, len(all_examples)))
)

# 90% train / 10% test, reproducible across kernel restarts.
dataset_sample = sampled_examples.train_test_split(test_size=0.1, seed=SPLIT_SEED)


Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
def format_example(example):
    """Join one record's prompt and completion into a single training string.

    Args:
        example: mapping with string values under "prompt" and "completion".

    Returns:
        The prompt followed by the completion, separated by exactly one space.
        A separator is only inserted when the completion does not already
        begin with whitespace, so a completion such as " Yes." no longer
        yields a doubled space after the prompt.
    """
    prompt, completion = example["prompt"], example["completion"]
    separator = "" if completion[:1].isspace() else " "
    return f"{prompt}{separator}{completion}"

def tokenize(example):
    """Tokenize one formatted example, truncating it to at most 512 tokens.

    Padding is intentionally not applied here. The language-modeling data
    collator pads each batch to the length of its longest member, so padding
    every short prompt up to 512 tokens would only add wasted compute.

    Args:
        example: mapping with "prompt" and "completion" strings.

    Returns:
        The tokenizer output (input ids and attention mask) for the example.
    """
    text = format_example(example)
    return tokenizer(text, truncation=True, max_length=512)

# Tokenize the train and test splits one example at a time (map's default mode).
tokenized = dataset_sample.map(tokenize)


Map:   0%|          | 0/1350 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForLanguageModeling

# Helper that takes the prepared property-investment examples and organizes
# them into neat batches the model can learn from.
# Qwen is a causal LM, so masked language modeling is switched off (mlm=False).
# With MLM some words are masked (hidden) and predicted from their context;
# that is the objective BERT-style models use.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:
from transformers import TrainingArguments

# First-pass training configuration.
training_args = TrainingArguments(
    # Where outputs go and what gets reported
    output_dir="output",
    report_to="none",
    logging_steps=10,
    # Batching: 1 example per device, gradients accumulated over 2 steps
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # Optimization schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    warmup_steps=10,
    num_train_epochs=1,
)



In [None]:
from transformers import Trainer

# Demonstration: building a Trainer around the purely quantized model is
# rejected with a ValueError, because 4-bit weights cannot be fine-tuned until
# trainable adapters are attached. The error is caught and shown so that
# "Restart & Run All" can continue past this cell.
try:
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        # `tokenizer=` is deprecated for Trainer; `processing_class=` replaces it.
        processing_class=tokenizer,
        data_collator=data_collator,
    )
except ValueError as err:
    print(f"Trainer rejected the model: {err}")


  trainer = Trainer(


ValueError: You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft for more details

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adds small "adapter layers" on top of the frozen quantized model.
# The big quantized model stays compressed and unchanged; only the tiny LoRA
# layers get trained, which saves memory and time.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
from peft import PeftModel, prepare_model_for_kbit_training

# Guarded so that re-running this cell does not wrap an already-wrapped model
# in a second layer of adapters.
if not isinstance(model, PeftModel):
    # Recommended preparation step before training on top of k-bit weights.
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)

# Report how many parameters the adapters actually train.
model.print_trainable_parameters()

trainable params: 3,686,400 || all params: 3,089,625,088 || trainable%: 0.1193


In [None]:
from transformers import TrainingArguments

# Configuration for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./qwen-finetuned-investment",
    # Effective batch size per optimizer step: 2 examples x 4 accumulation steps.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=10,
    # Evaluate on the held-out split once per epoch; without an eval strategy
    # the eval_dataset handed to the Trainer is never used.
    eval_strategy="epoch",
    per_device_eval_batch_size=2,
    save_strategy="epoch",
    report_to="none",
    fp16=True,  # Half precision (16-bit) mixed-precision training
)


In [None]:
from transformers import Trainer

# Trainer for the adapter-equipped model, using the tokenized train/test splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    # `tokenizer=` is deprecated for Trainer; `processing_class=` replaces it.
    processing_class=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(


In [None]:
# Training run on the first dataset sample.
train_output = trainer.train()
train_output

Step,Training Loss
10,2.0282
20,1.2854
30,0.7218
40,0.6228
50,0.5979
60,0.5526
70,0.531
80,0.5293
90,0.5267
100,0.5287


TrainOutput(global_step=507, training_loss=0.5485446119214421, metrics={'train_runtime': 2492.9415, 'train_samples_per_second': 1.625, 'train_steps_per_second': 0.203, 'total_flos': 3.4568489926656e+16, 'train_loss': 0.5485446119214421, 'epoch': 3.0})

In [None]:
import shutil

from google.colab import files

# Pack the training output directory into qwen-finetuned-investment.zip.
shutil.make_archive(
    base_name='qwen-finetuned-investment',
    format='zip',
    root_dir='./qwen-finetuned-investment',
)

# Hand the archive to the browser as a download from the Colab runtime.
files.download('qwen-finetuned-investment.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>