In [None]:
!pip install -U transformers datasets peft bitsandbytes accelerate

Collecting transformers
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.17.0-py3-none-any.whl.metadata (14 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudn

# **DATA LOADING**

In [None]:
import json
from datasets import Dataset


def load_jsonl(path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Location of a UTF-8 encoded text file holding one JSON document
            per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Whitespace-only lines (most often a trailing newline at end of file)
        # are not JSON documents and would make json.loads raise, so skip them.
        return [json.loads(line) for line in f if line.strip()]

# Parse the raw conversation records for the training and validation splits.
TRAIN_JSONL = "/content/notes.jsonl"
VAL_JSONL = "/content/notes_validation.jsonl"

train_data = load_jsonl(TRAIN_JSONL)
val_data = load_jsonl(VAL_JSONL)

def format_multiturn(example):
    """Flatten one conversation record into a single training string.

    ``example['message']`` is either a sequence of turns or a JSON string that
    decodes to one. Each turn is indexed by ``'role'`` and ``'content'`` and is
    rendered as ``### <Role>: <content>``; turns are joined with newlines.

    Returns:
        A dict with the single key ``'text'`` holding the rendered conversation.
    """
    if isinstance(example['message'], str):
        turns = json.loads(example['message'])
    else:
        turns = example['message']

    rendered = []
    for turn in turns:
        speaker = turn['role'].capitalize()
        rendered.append(f"### {speaker}: {turn['content']}")
    return {'text': '\n'.join(rendered)}

# Render every record to its flat {'text': ...} form, then wrap each split in a
# Hugging Face Dataset.
train_rows = list(map(format_multiturn, train_data))
train_dataset = Dataset.from_list(train_rows)

val_rows = list(map(format_multiturn, val_data))
val_dataset = Dataset.from_list(val_rows)


In [None]:
import os
from getpass import getpass

from huggingface_hub import HfFolder

# SECURITY: never embed an access token in a notebook. The token that used to be
# hard-coded in this cell has been exposed to everyone who can read the file and
# should be revoked at https://huggingface.co/settings/tokens.
# The token is taken from the HF_TOKEN environment variable when it is set;
# otherwise it is requested interactively without being echoed or stored in the
# notebook.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
HfFolder.save_token(hf_token)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad with the EOS token.
tokenizer.pad_token = tokenizer.eos_token

# Load base model in 4-bit.
# Passing `load_in_4bit=True` directly to from_pretrained is deprecated (the
# library warns about it); the quantization settings belong in a
# BitsAndBytesConfig handed over as `quantization_config`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto"
)

# Prepare for k-bit training (4-bit)
model = prepare_model_for_kbit_training(model)

# Add LoRA adapters on the attention query/value projections.
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # works for Mistral
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
def tokenize(example):
    """Tokenize the 'text' field of a batch with the notebook-level tokenizer.

    Sequences are truncated to 1024 tokens and padded up to exactly that length.
    """
    # NOTE(review): padding every sample to max_length makes each training step
    # process 1024 tokens regardless of the real text length; the language-modeling
    # collator used for training could presumably pad per batch instead -- confirm
    # before changing.
    encoding_options = dict(padding="max_length", truncation=True, max_length=1024)
    return tokenizer(example["text"], **encoding_options)


# Tokenize both splits in batched mode, rebinding the dataset names.
map_options = {"batched": True}
train_dataset = train_dataset.map(tokenize, **map_options)
val_dataset = val_dataset.map(tokenize, **map_options)

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

# **TRAINING**

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datetime import datetime

# Hyper-parameters for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./mistral-lora-finetune",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    warmup_steps=10,
    max_steps=300,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=100,
    # `eval_steps` has no effect unless an evaluation strategy is selected; without
    # it the validation set passed to the Trainer is never used (the previous run
    # logged training loss only).
    eval_strategy="steps",
    eval_steps=100,
    # PeftModel hides the base model's forward signature, so the Trainer cannot
    # infer the label column on its own (it warns and falls back to an empty
    # list). Naming it explicitly lets evaluation compute a loss.
    label_names=["labels"],
    save_total_limit=2,
    logging_dir="./logs",
    report_to="none",
    run_name="mistral-lora-" + datetime.now().strftime("%Y%m%d-%H%M")
)

trainer = Trainer(
    model=model,
    # `tokenizer=` is deprecated for Trainer in favour of `processing_class=`.
    processing_class=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # mlm=False selects causal-LM collation.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Turn off the KV cache on the model config before training.
model.config.use_cache = False
trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwar

Step,Training Loss
10,1.5757
20,1.3818
30,1.2611
40,1.2933
50,1.2396
60,1.2268
70,1.1624
80,1.1463
90,1.1055
100,1.1315


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=300, training_loss=0.9806168015797933, metrics={'train_runtime': 2966.9734, 'train_samples_per_second': 0.202, 'train_steps_per_second': 0.101, 'total_flos': 2.609396261584896e+16, 'train_loss': 0.9806168015797933, 'epoch': 5.891089108910891})

# **INFERENCING/TESTING** (CHAT FUNCTION)

In [None]:
import torch

def chat(system_prompt=""):
    """Run an interactive multi-turn chat loop on the console.

    Uses the notebook-level ``model`` and ``tokenizer``. Each round rebuilds the
    prompt from the optional system prompt plus the whole conversation so far,
    formatted as ``### <Role>: <content>`` lines (the same layout produced by
    ``format_multiturn`` for training), and ends it with ``### Assistant:``.

    Args:
        system_prompt: Optional text placed before the conversation. When it is
            empty (or only whitespace) nothing is prepended.

    Returns:
        None. The loop ends when the user types ``exit``.
    """
    history = []
    system_text = system_prompt.strip()

    # The model is left in training mode after trainer.train(), which keeps the
    # LoRA dropout active; switch to eval mode before generating.
    model.eval()
    print("Type 'exit' to quit.\n")

    while True:
        user_input = input("You: ")
        if user_input.strip().lower() == "exit":
            break

        history.append({'role': 'user', 'content': user_input})

        # Construct prompt from history. An empty system prompt contributes no
        # line, so the prompt does not start with a stray newline that the
        # training text never contained.
        prompt_lines = [system_text] if system_text else []
        prompt_lines.extend(
            f"### {turn['role'].capitalize()}: {turn['content']}" for turn in history
        )
        prompt_lines.append("### Assistant:")
        prompt = "\n".join(prompt_lines)

        # Tokenize and generate on whatever device the model lives on, rather
        # than assuming "cuda".
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        output = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

        # Extract only the assistant's new response: decode just the tokens that
        # follow the prompt. Slicing the decoded text by len(prompt) is fragile
        # because decoding is not guaranteed to reproduce the prompt verbatim.
        prompt_token_count = inputs["input_ids"].shape[1]
        generated_text = tokenizer.decode(
            output[0][prompt_token_count:], skip_special_tokens=True
        )
        # Cut at the next "###" marker in case the model starts another turn.
        assistant_reply = generated_text.split("###")[0].strip()

        print("Assistant:", assistant_reply)
        history.append({'role': 'assistant', 'content': assistant_reply})


In [None]:
# Start the interactive session with an empty system prompt -- just run this cell!
chat(system_prompt="")

Type 'exit' to quit.

You: Rome, 44 BC. Julius Caesar, the all-powerful dictator, has just been assassinated. The city is in chaos, factions vying for control. As Marcus, a young senator caught in the middle, you overhear whispers of a conspiracy to restore the Republic, led by Brutus and Cassius.
Assistant: Marcus is torn. His loyalty is to Caesar, but the Republic must be saved. He overhears Brutus's justification for the murder: 'It is necessary.' Do you, as Marcus, publicly support the Republic restoration, or secretly oppose it, fearing the consequences of Caesar's death? (support_republic/oppose_republic)
You: support_republic
Assistant: Marcus publicly declares his support for the Republic restoration. Brutus and Cassius are hailed as heroes. But the price of liberty is soon revealed. Civil war erupts, and Rome is divided. Where does Marcus stand in this new conflict? (stand_civil_war/hide_support)
You: exit


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# **SAVE LORA MODEL**

In [None]:
from peft import PeftModel, PeftConfig

# Write the LoRA adapter and the tokenizer files side by side in one directory.
ADAPTER_DIR = "mistral-lora-anya"

model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)

('mistral-lora-anya/tokenizer_config.json',
 'mistral-lora-anya/special_tokens_map.json',
 'mistral-lora-anya/chat_template.jinja',
 'mistral-lora-anya/tokenizer.model',
 'mistral-lora-anya/added_tokens.json',
 'mistral-lora-anya/tokenizer.json')

# **TO USE ON A DIFFERENT MACHINE!**
Load the base model and the LoRA weights with the cell below.
Afterwards, run the chat function defined above.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

# Load tokenizer and base model
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained("mistral-lora-anya")  # or model_name if tokenizer not modified
# Passing `load_in_4bit=True` directly to from_pretrained is deprecated; the
# quantization settings go in a BitsAndBytesConfig passed as `quantization_config`.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto"
)

# Load LoRA adapter on top of the quantized base model
model = PeftModel.from_pretrained(base_model, "mistral-lora-anya")
# Inference only: switch to eval mode and turn the KV cache back on.
model.eval()
model.config.use_cache = True


# **MERGING LORA + BASE MODEL**

In [None]:
from peft import PeftModel

# Fold the LoRA adapter into the base weights and drop the PEFT wrapper.
# NOTE(review): `model` wraps a base model that was loaded in 4-bit, so the merge
# happens on quantized weights; this presumably introduces rounding error and
# yields a quantized checkpoint -- confirm this is intended.
MERGED_DIR = "mistral-7b-merged"

merged_model = model.merge_and_unload()
merged_model.save_pretrained(MERGED_DIR)
tokenizer.save_pretrained(MERGED_DIR)



('mistral-7b-merged/tokenizer_config.json',
 'mistral-7b-merged/special_tokens_map.json',
 'mistral-7b-merged/chat_template.jinja',
 'mistral-7b-merged/tokenizer.model',
 'mistral-7b-merged/added_tokens.json',
 'mistral-7b-merged/tokenizer.json')

# **UPLOAD TO MY HUGGING FACE MODEL REPO**

In [None]:
import os
from getpass import getpass

from huggingface_hub import HfApi, login

# Login to Hugging Face.
# SECURITY: never paste an access token into the notebook. The token that used to
# be hard-coded in this cell has been exposed and should be revoked at
# https://huggingface.co/settings/tokens. The token is taken from the HF_TOKEN
# environment variable when set, otherwise requested interactively without echo.
login(os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

# Create repo (change name if needed); exist_ok=True makes re-running the cell safe.
repo_id = "rohit5775/mistral-7b-instruct-finetuned-anya"
api = HfApi()
api.create_repo(repo_id=repo_id, private=False, exist_ok=True)

# Upload model + tokenizer
merged_model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

print(f"🚀 Model uploaded: https://huggingface.co/{repo_id}")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmpreaj6o2r/model.safetensors    :   0%|          | 16.6MB / 4.98GB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmpjjxnovev/tokenizer.model      : 100%|##########|  493kB /  493kB            

🚀 Model uploaded: https://huggingface.co/rohit5775/mistral-7b-instruct-finetuned-anya


# **LOADING MY HUGGINGFACE MODEL**

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Pull the published model and its tokenizer from the same Hub repository.
HUB_REPO_ID = "rohit5775/mistral-7b-instruct-finetuned-anya"

model = AutoModelForCausalLM.from_pretrained(
    HUB_REPO_ID, device_map="auto", torch_dtype="auto"
)

tokenizer = AutoTokenizer.from_pretrained(HUB_REPO_ID)

# **TEST WITH THE SAME CHAT FUNCTION**