In [1]:
!pip install -q \
  pandas \
  numpy \
  scikit-learn \
  torch \
  transformers \
  peft\
  accelerate \
  bitsandbytes \
  nltk \
  rouge-score \
  tqdm



  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m82.4 MB/s[0m eta [36m0:

In [2]:
import os

from huggingface_hub import login

# SECURITY: never paste an access token into the notebook source. The token
# that used to be hard-coded here has been exposed and must be revoked in the
# Hugging Face account settings.
# Provide the token through the HF_TOKEN environment variable (e.g. populated
# from the platform's secret store). When it is unset, `token=None` makes
# `login` fall back to its interactive prompt instead of failing.
login(token=os.environ.get("HF_TOKEN"))


In [3]:
# fine_tune_llm.py
import os
import json
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset, load_from_disk
from tqdm import tqdm

# ====== CONFIGURATION ======
# Directory that is searched for the conversations_*.json files.
DATASET_PATH = "/kaggle/input/education-dialogue-datasets"
# Where the tokenized dataset is saved to / reloaded from between runs.
CACHE_PATH = "./cached_education_dataset"
# Hugging Face Hub id of the base model to fine-tune.
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
# Destination for trainer checkpoints and the final adapter + tokenizer.
OUTPUT_DIR = "./models/llm/fine_tuned_model"
# NOTE(review): DEVICE is not referenced by the code visible in this cell
# (the model loader pins GPU index 0 directly) — confirm whether it is still needed.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Every example is truncated/padded to this many tokens.
MAX_LENGTH = 512
# Per-device batch size handed to the trainer.
BATCH_SIZE = 2
EPOCHS = 1  # Start with 1 for speed; increase if time allows

# Derive the directory from OUTPUT_DIR instead of repeating part of the path
# as a literal, so the two cannot drift apart. Parent directories
# (including models/llm) are created as well.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def load_education_dialogue_dataset(base_path):
    """Read every known conversation split under ``base_path`` into one list.

    The five training splits and the eval split are looked up by their fixed
    file names. A split that is missing on disk only produces a warning; the
    remaining splits are still loaded.

    Args:
        base_path: Directory expected to contain the ``conversations_*.json`` files.

    Returns:
        A single list with the entries of all files that were found, in
        file order (train1..train5, then eval).
    """
    print("Loading education dialogue dataset...")
    expected_splits = (
        'conversations_train1.json',
        'conversations_train2.json',
        'conversations_train3.json',
        'conversations_train4.json',
        'conversations_train5.json',
        'conversations_eval.json',
    )
    conversations = []
    for split_name in expected_splits:
        split_path = os.path.join(base_path, split_name)
        # Guard clause: report the missing split and move on to the next one.
        if not os.path.exists(split_path):
            print(f"Warning: {split_path} not found")
            continue
        with open(split_path, 'r', encoding='utf-8') as handle:
            conversations += json.load(handle)
    print(f"Loaded {len(conversations)} conversations")
    return conversations

def preprocess_and_cache_data(data, tokenizer, max_length=512, cache_path="cached_dataset"):
    """Build tokenized Student -> Teacher examples and cache them on disk.

    Every pair of consecutive turns whose roles are ``'Student'`` followed by
    ``'Teacher'`` becomes one example: an instruction prompt (background info
    plus the student turn) followed by the teacher's reply as the response.

    If ``cache_path`` already exists, the dataset stored there is loaded and
    returned unchanged; ``data``, ``tokenizer`` and ``max_length`` are NOT
    consulted in that case, so remove the cache directory after changing any
    of them.

    Args:
        data: Iterable of dicts. ``item['conversation']`` is a list of turns
            (dicts with ``'role'`` and ``'text'``); ``item['background_info']``
            is embedded in the prompt as JSON. Both keys are optional.
        tokenizer: Tokenizer called as ``tokenizer(text, max_length=...,
            truncation=True, padding="max_length")``.
        max_length: Length every example is truncated/padded to.
        cache_path: Directory used to save and reload the processed dataset.

    Returns:
        A ``datasets.Dataset`` with ``input_ids`` and ``attention_mask`` columns.

    Raises:
        KeyError: If a turn that is inspected lacks ``'role'`` or ``'text'``.
    """
    if os.path.exists(cache_path):
        print(f"Loading cached dataset from {cache_path}")
        return load_from_disk(cache_path)
    print("Preprocessing and caching dataset...")
    processed_data = []
    for item in tqdm(data, desc="Processing conversations"):
        conv = item.get('conversation', [])
        if len(conv) < 2:
            continue
        # The background is identical for every turn pair of a conversation,
        # so serialise it once instead of once per matching pair.
        background_json = json.dumps(item.get('background_info', {}))
        for student_turn, teacher_turn in zip(conv, conv[1:]):
            if student_turn['role'] != 'Student' or teacher_turn['role'] != 'Teacher':
                continue
            instruction = (
                f"Background: {background_json}\n"
                f"Student: {student_turn['text']}\n"
                f"Generate appropriate teacher feedback:"
            )
            response = teacher_turn['text']
            text = f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
            # No return_tensors: for a single string the tokenizer returns flat
            # lists of ints, which is what Dataset.from_list stores anyway.
            # This avoids allocating a torch tensor per example only to have
            # it converted back during Arrow serialisation.
            tokenized = tokenizer(
                text,
                max_length=max_length,
                truncation=True,
                padding="max_length",
            )
            processed_data.append({
                "input_ids": tokenized["input_ids"],
                "attention_mask": tokenized["attention_mask"],
            })
    dataset = Dataset.from_list(processed_data)
    dataset.save_to_disk(cache_path)
    return dataset

def setup_model_and_tokenizer():
    """Load the base model in 4-bit and wrap it with LoRA adapters.

    The model named by ``MODEL_NAME`` is loaded with an NF4 4-bit
    bitsandbytes quantization config, placed entirely on GPU index 0, passed
    through PEFT's ``prepare_model_for_kbit_training`` and then wrapped with a
    LoRA configuration targeting the modules named ``q_proj`` and ``v_proj``.
    The tokenizer reuses its EOS token as the padding token.

    Returns:
        tuple: ``(model, tokenizer)`` — the PEFT-wrapped model and its tokenizer.
    """
    print("Loading Mistral 7B model...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    # trust_remote_code is intentionally left at its default (False): the
    # configured Mistral checkpoint is loaded through the classes built into
    # transformers, so there is no reason to allow executing code shipped in
    # the model repository. Re-enable it deliberately only if MODEL_NAME is
    # switched to a model that genuinely requires custom code.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map={"": 0},  # Force all tensors to GPU 0
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Padding to a fixed length needs a pad token; the EOS token is reused for it.
    tokenizer.pad_token = tokenizer.eos_token
    model = prepare_model_for_kbit_training(model)
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, lora_config)
    # Prints the trainable vs. total parameter counts as a sanity check.
    model.print_trainable_parameters()
    return model, tokenizer

def main():
    """Run the full fine-tuning pipeline and save the result to ``OUTPUT_DIR``.

    Steps: load the quantized model with LoRA adapters, load and tokenize the
    dialogue dataset (reusing the on-disk cache when present), train with the
    Hugging Face ``Trainer``, then save the model and tokenizer.
    """
    # Setup model and tokenizer
    model, tokenizer = setup_model_and_tokenizer()
    # Load and preprocess data
    raw_data = load_education_dialogue_dataset(DATASET_PATH)
    train_dataset = preprocess_and_cache_data(
        raw_data, tokenizer, max_length=MAX_LENGTH, cache_path=CACHE_PATH
    )
    # Data collator: mlm=False selects causal-LM style batches (labels are
    # produced by the collator, not stored in the dataset).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
    # Cap the run at 100 optimizer steps when executing inside a Kaggle kernel
    # (KAGGLE_KERNEL_RUN_TYPE is set there). Elsewhere pass -1, the documented
    # "no cap" value, so num_train_epochs governs the run length. Passing None
    # is not valid for this integer field.
    running_on_kaggle = bool(os.environ.get("KAGGLE_KERNEL_RUN_TYPE", ""))
    max_steps = 100 if running_on_kaggle else -1
    # Training arguments
    # NOTE(review): fp16=True here while the quantization config in
    # setup_model_and_tokenizer uses a bfloat16 compute dtype — confirm the
    # mix is intended for the target GPU.
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=4,
        num_train_epochs=EPOCHS,
        learning_rate=2e-4,
        fp16=True,
        save_total_limit=1,
        logging_steps=50,
        report_to="none",
        gradient_checkpointing=True,
        optim="paged_adamw_8bit",
        remove_unused_columns=False,
        max_steps=max_steps,
        label_names=["labels"]
    )
    # Trainer
    trainer = Trainer(
        model=model,
        train_dataset=train_dataset,
        data_collator=data_collator,
        args=training_args
    )
    # Fine-tune
    print("Starting fine-tuning...")
    trainer.train()
    # Save the fine-tuned model
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print("Fine-tuning completed and model saved!")

# Entry point: run the pipeline when this code executes as the main module
# (as it does in the notebook cell), but not when it is imported.
if __name__ == "__main__":
    main()


2025-06-26 14:56:35.041258: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750949795.446659      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750949795.558790      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading Mistral 7B model...


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940
Loading education dialogue dataset...
Loaded 47234 conversations
Preprocessing and caching dataset...


Processing conversations: 100%|██████████| 47234/47234 [04:18<00:00, 182.86it/s]


Saving the dataset (0/2 shards):   0%|          | 0/332398 [00:00<?, ? examples/s]

Starting fine-tuning...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
50,0.376
100,0.2186


Fine-tuning completed and model saved!


In [4]:
import os
import shutil
import tempfile

# Define the zip file path and the source directory
zip_file_path = "/kaggle/working/working_dir.zip"
source_dir = "/kaggle/working/"

# The archive lives inside the directory being archived. Drop any archive left
# over from a previous run so it is not packed into the new one.
if os.path.exists(zip_file_path):
    os.remove(zip_file_path)

# Build the zip in a temporary directory outside source_dir so the archive
# cannot include a partially written copy of itself, then move it into place.
with tempfile.TemporaryDirectory() as staging_dir:
    staged_archive = shutil.make_archive(
        os.path.join(staging_dir, "working_dir"), 'zip', source_dir
    )
    shutil.move(staged_archive, zip_file_path)

print("✅ Done! You can now download 'working_dir.zip' from the Output section.")


✅ Done! You can now download 'working_dir.zip' from the Output section.
