In [1]:
# %pip install transformers datasets torch

In [2]:
# from google.colab import files
# uploaded = files.upload()


In [3]:
import json
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from huggingface_hub import notebook_login
import re

# === Step 1: Load dataset ===
def load_dataset(path):
    """Read a JSON file of instruction/response records into a ``Dataset``.

    The file is expected to contain a JSON array of objects, each with an
    ``"input"`` and an ``"output"`` key. They are renamed to ``input_text``
    and ``target_text`` respectively.

    Args:
        path: Location of the JSON file.

    Returns:
        A ``datasets.Dataset`` with columns ``input_text`` and ``target_text``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a record lacks ``"input"`` or ``"output"``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(path, encoding="utf-8") as f:
        records = json.load(f)
    return Dataset.from_list(
        [{"input_text": r["input"], "target_text": r["output"]} for r in records]
    )

def get_dataset(data_dir=r"/home/sysadm/Music/unitime_nlp/data/processed"):
    """Load the train/validation/test splits from ``data_dir``.

    Args:
        data_dir: Directory containing ``train.json``, ``val.json`` and
            ``test.json``. Defaults to the location the notebook was
            originally developed against, so existing calls keep working.

    Returns:
        A ``DatasetDict`` with keys ``"train"``, ``"validation"`` and
        ``"test"``.
    """
    import os

    # Split name -> file name (note: the validation split is stored as val.json).
    split_files = {"train": "train.json", "validation": "val.json", "test": "test.json"}
    splits = {
        split: load_dataset(os.path.join(data_dir, filename))
        for split, filename in split_files.items()
    }

    print("✅ Loaded datasets:")
    print(
        f"Train size: {len(splits['train'])}, "
        f"Val size: {len(splits['validation'])}, "
        f"Test size: {len(splits['test'])}"
    )

    return DatasetDict(splits)

# === Step 2: Format for Mistral ===
def format_for_mistral(example):
    """Wrap one record in the ``[INST] ... [/INST]`` prompt template.

    Args:
        example: Mapping with ``input_text`` (the instruction) and
            ``target_text`` (the expected completion).

    Returns:
        A dict with a single ``"text"`` key holding the full training string:
        the bracketed instruction followed by the completion and ``</s>``.
    """
    instruction = example['input_text']
    completion = example['target_text']
    rendered = f"<s>[INST] Instruction: {instruction} [/INST] {completion}</s>"
    return dict(text=rendered)

# === Step 3: Tokenize ===
def tokenize_dataset(dataset, tokenizer, max_length=1024):
    """Render every example with the prompt template and tokenize it.

    Each example is first passed through ``format_for_mistral`` and the
    resulting ``text`` is tokenized, truncated and padded to ``max_length``.
    A ``labels`` column is added so a causal-LM ``Trainer`` has a target to
    compute the loss against.

    Args:
        dataset: Object exposing ``.map`` (a ``Dataset`` or ``DatasetDict``)
            whose rows have ``input_text`` and ``target_text``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` for a string.
        max_length: Fixed sequence length after truncation/padding.

    Returns:
        The mapped dataset with the original columns plus ``text``, the
        tokenizer outputs and ``labels``.
    """
    def tokenize(example):
        encoded = tokenizer(
            example["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        # Labels mirror the inputs; padding positions are set to -100 so they
        # are ignored by the loss. The attention mask is used (rather than
        # comparing against the pad id) because callers set pad_token to
        # eos_token, and the genuine end-of-sequence token must stay a target.
        encoded["labels"] = [
            token_id if attended else -100
            for token_id, attended in zip(encoded["input_ids"], encoded["attention_mask"])
        ]
        return encoded

    return dataset.map(format_for_mistral).map(tokenize)

# === Step 4: Train with Quantization ===
def _pick_kwarg(target, preferred, legacy):
    """Return ``preferred`` if ``target`` accepts a keyword of that name, else ``legacy``."""
    import inspect

    return preferred if preferred in inspect.signature(target).parameters else legacy


def train_mistral_model(
    tokenized_datasets,
    model_name="mistralai/Mistral-7B-v0.1",
    output_dir="./mistral-finetuned",
):
    """Fine-tune a 4-bit quantized causal LM and save it with its tokenizer.

    Args:
        tokenized_datasets: Mapping with ``'train'`` and ``'validation'``
            entries, as produced by ``tokenize_dataset``.
        model_name: Hub id or local path of the base model and tokenizer.
        output_dir: Directory used both for Trainer checkpoints and for the
            final saved model/tokenizer.

    Returns:
        None. The trained model and tokenizer are written to ``output_dir``.
    """
    from transformers import BitsAndBytesConfig

    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Use load_in_8bit=True for 8-bit instead
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # NOTE(review): no trainable adapters (e.g. PEFT/LoRA) are attached to the
    # quantized model here. Recent transformers releases appear to refuse to
    # train a purely quantized model; confirm and attach adapters if needed.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto"
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # The tokenizer is given a pad token by reusing end-of-sequence.
    tokenizer.pad_token = tokenizer.eos_token

    training_kwargs = dict(
        output_dir=output_dir,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=3,
        logging_steps=50,
        save_steps=200,
        save_total_limit=2,
        fp16=True,
        report_to="none",
    )
    # `evaluation_strategy` was renamed to `eval_strategy`; use whichever
    # spelling the installed transformers version accepts.
    training_kwargs[_pick_kwarg(TrainingArguments, "eval_strategy", "evaluation_strategy")] = "epoch"
    training_args = TrainingArguments(**training_kwargs)

    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
    )
    # Likewise `tokenizer=` was superseded by `processing_class=` on Trainer.
    trainer_kwargs[_pick_kwarg(Trainer, "processing_class", "tokenizer")] = tokenizer
    trainer = Trainer(**trainer_kwargs)

    trainer.train()
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

# === XML Fixer ===
# Either a complete <...> tag (consumed untouched) or a bare "name attrs>"
# that lost its opening "<". Group 1 is set only for the bare form.
_BARE_TAG_RE = re.compile(r'<[^<>]*>|(?<![\w:.\-/<])(\w[\w:.\-]*)(\s[^<>]*?/?>)')

# Either a complete <...> tag (consumed untouched) or a tag that was opened
# but runs into the next "<" / end of text without a closing ">".
# Group 1 is the unclosed tag, group 2 any trailing whitespace.
_UNCLOSED_TAG_RE = re.compile(r'<[^<>]*>|(<[/!?]?[^\W\d][^<>]*?)(\s*)(?=<|\Z)')


def fix_xml(prediction: str) -> str:
    """Heuristically repair missing angle brackets in generated XML.

    Two repairs are applied, in order:

    1. A bare ``name attr="..." >`` sequence with no opening ``<`` gets one
       prepended.
    2. A tag that starts with ``<`` but reaches the next ``<`` (or the end of
       the text) without a ``>`` gets one appended, before any trailing
       whitespace.

    Tags that are already well formed are matched as a whole and left exactly
    as they are, so applying the function to valid XML is a no-op.

    Args:
        prediction: Raw model output.

    Returns:
        The text with the repairs applied.
    """
    def _open_bare(match):
        # Well-formed tag: group 1 is unset, keep the text verbatim.
        if match.group(1) is None:
            return match.group(0)
        return '<' + match.group(0)

    def _close_unclosed(match):
        if match.group(1) is None:
            return match.group(0)
        return match.group(1) + '>' + match.group(2)

    prediction = _BARE_TAG_RE.sub(_open_bare, prediction)
    prediction = _UNCLOSED_TAG_RE.sub(_close_unclosed, prediction)
    return prediction

# === Step 5: Predict and Export ===
def predict_on_test(test_data, model_dir="mistral-finetuned", output_file="predictions.xml"):
    """Generate a completion for every test example and write them to a file.

    For each example the instruction is wrapped in the same ``[INST]``
    template used for training, a completion is generated, passed through
    ``fix_xml``, printed next to the ground truth, and collected. All
    completions are finally written to ``output_file`` separated by blank
    lines.

    Args:
        test_data: Iterable of mappings with ``input_text`` and
            ``target_text``.
        model_dir: Directory holding the fine-tuned model and tokenizer.
        output_file: Path of the UTF-8 text file to write.

    Returns:
        None.
    """
    # device_map="auto" already places the weights, so there is no further
    # model.to(...) call; moving a dispatched or quantized model can fail.
    model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model.eval()

    predictions = []

    for example in test_data:
        prompt = f"<s>[INST] Instruction: {example['input_text']} [/INST]"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=1024)
        # generate() returns prompt + continuation; keep only the newly
        # generated tokens so the instruction text does not end up in the
        # saved predictions.
        completion_ids = outputs[0][prompt_length:]
        decoded = tokenizer.decode(completion_ids, skip_special_tokens=True)
        fixed = fix_xml(decoded)

        predictions.append(fixed)

        print("Input:", example["input_text"])
        print("Prediction:\n", fixed)
        print("Ground Truth:\n", example["target_text"])
        print("=" * 80)

    # Save to XML
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n\n".join(predictions))
    print(f"✅ Predictions saved to {output_file}")

# === Step 6: Main ===
# End-to-end driver: load data -> authenticate -> tokenize -> train -> predict.
# These statements run at cell execution time and leave `dataset`,
# `tokenizer`, `tokenized_dataset` and `test_examples` in the kernel namespace.
print("\nLoading datasets...")
# DatasetDict with "train", "validation" and "test" splits.
dataset = get_dataset()
print("✅ Datasets loaded successfully.")

# Interactive Hugging Face Hub login; runs before anything is fetched from the Hub.
notebook_login()

print("\nInitializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=True)
# Reuse end-of-sequence as the padding token (tokenize_dataset pads to a fixed length).
tokenizer.pad_token = tokenizer.eos_token

print("\nTokenizing datasets...")
# Applies the prompt template and tokenizes every split.
tokenized_dataset = tokenize_dataset(dataset, tokenizer)

print("\nStarting model training...")
# Note: train_mistral_model loads its own model and tokenizer internally.
train_mistral_model(tokenized_dataset)

print("\nPredicting on test set...")
# Prediction uses the raw (untokenized) test split; prompts are built inside predict_on_test.
test_examples = dataset["test"]
predict_on_test(test_examples)



Loading datasets...
✅ Loaded datasets:
Train size: 2400, Val size: 300, Test size: 300
✅ Datasets loaded successfully.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…


Initializing tokenizer...

Tokenizing datasets...


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]


Starting model training...


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 21.0M/4.56G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   2%|1         | 178M/10.1G [00:00<?, ?B/s]

KeyboardInterrupt: 


Loading datasets...
✅ Loaded datasets:
Train size: 2400, Val size: 300, Test size: 300
✅ Datasets loaded successfully.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…


Initializing tokenizer...


tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]


Starting model training...


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

In [None]:
# from transformers import T5Tokenizer

# def tokenize_data(dataset, tokenizer, max_input_len=256, max_target_len=512):
#     def preprocess(example):
#         input_enc = tokenizer(example["input_text"], truncation=True, padding="max_length", max_length=max_input_len)
#         target_enc = tokenizer(example["target_text"], truncation=True, padding="max_length", max_length=max_target_len)
#         return {
#             "input_ids": input_enc["input_ids"],
#             "attention_mask": input_enc["attention_mask"],
#             "labels": target_enc["input_ids"]
#         }
#     tokenized = dataset.map(preprocess, batched=True)
#     print("✅ Tokenization complete")
#     return tokenized
# from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# def fine_tune(tokenized_datasets):
#     model = T5ForConditionalGeneration.from_pretrained("t5-base")

#     training_args = TrainingArguments(
#         output_dir="./results",
#         eval_strategy="steps",
#         eval_steps=100,
#         logging_steps=50,
#         save_steps=200,
#         per_device_train_batch_size=4,
#         per_device_eval_batch_size=4,
#         num_train_epochs=3,
#         save_total_limit=2,
#         fp16=False,
#         logging_dir="./logs",
#         report_to="none"
#     )

#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=tokenized_datasets["train"],
#         eval_dataset=tokenized_datasets["validation"]
#     )

#     print("🚀 Starting training...")
#     trainer.train()
#     print("✅ Training complete")
#     model.save_pretrained("t5-custom-finetuned")
#     return model

# import re
# import torch

# def fix_xml(prediction: str) -> str:
#     # Add < if missing at the start of tags
#     prediction = re.sub(r'(?<!<)(\w+)(\s[^<>]*?/?>)', r'<\1\2', prediction)
#     # Add > if missing at the end
#     prediction = re.sub(r'(/?>)(?!>)', r'\1>', prediction)
#     return prediction

# def predict(model, tokenizer, dataset, max_input_len=256, max_target_len=512):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model.to(device)
#     model.eval()

#     for example in dataset.select(range(3)):
#         inputs = tokenizer(example["input_text"], return_tensors="pt", truncation=True, padding=True, max_length=max_input_len)
#         inputs = {k: v.to(device) for k, v in inputs.items()}
#         output_ids = model.generate(**inputs, max_length=max_target_len)
#         prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#         fixed_prediction = fix_xml(prediction)

#         print("📥 Input:", example["input_text"])
#         print("✅ Raw Prediction:", prediction)
#         print("🛠 Fixed Prediction:", fixed_prediction)
#         print("🎯 Ground Truth:", example["target_text"])
#         print("-" * 50)


In [None]:
import torch