In [1]:
import numpy as np
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/expert-data/expert_data.csv


### Information of the related libraries

transformers : The main Hugging Face library. It gives you access to thousands of pre-trained models (AutoModelForCausalLM) and their corresponding tokenizers (AutoTokenizer). It's the foundation for everything.

peft: Stands for Parameter-Efficient Fine-Tuning. This library implements techniques like LoRA and QLoRA. drastically saves memory and compute power.

accelerate: Another Hugging Face library that works in the background. It automatically optimizes your PyTorch code to run efficiently on your specific hardware (single GPU, multiple GPUs, CPU) without you having to manually configure it.

bitsandbytes: This library is key for running large models on smaller GPUs.

trl: Stands for Transformer Reinforcement Learning library.Use case as Supervised Fine-Tuning Trainer.SFTTrainer is a convenient tool that simplifies the entire training script, handling data formatting and the training loop for you.

In [4]:
!pip install -q transformers datasets peft bitsandbytes accelerate
!pip install -q trl
import transformers
import datasets
import peft
import bitsandbytes
import accelerate


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

2025-07-01 09:31:11.605923: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751362271.821508      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751362271.880719      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Transformers version: 4.51.3
Datasets version: 3.6.0
PEFT version: 0.14.0
Bitsandbytes version: 0.46.0
Accelerate version: 1.5.2


## CELL 1: SETUP AND GPU VERIFICATION

### Ensuring reproducibility by setting random seeds.

### Verifying that a GPU is available and checking its specifications, which is absolutely critical for training a large model.

Use of random seed - 
Many parts of model training involve randomness (e.g., initializing model weights, shuffling data). By setting a "seed" (the number 42 is just a convention), you ensure that every time you run this script, the sequence of "random" numbers will be exactly the same. This makes your experiment reproducible, which is crucial for debugging and comparing results fairly.

In [2]:
import os
import gc
import json
import ast
import torch
import pandas as pd
import numpy as np
import random
from datetime import datetime

# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Verify GPU
print("GPU VERIFICATION")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    
    # Current GPU memory usage
    allocated = torch.cuda.memory_allocated(0) / 1024**3
    reserved = torch.cuda.memory_reserved(0) / 1024**3
    print(f"Currently allocated: {allocated:.2f} GB")
    print(f"Currently reserved: {reserved:.2f} GB")
else:
    print("No GPU in use")

# Define paths
KAGGLE_INPUT_PATH = "/kaggle/input"
KAGGLE_WORKING_PATH = "/kaggle/working"


GPU VERIFICATION
PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA version: 12.4
GPU Device: Tesla P100-PCIE-16GB
GPU Memory: 15.89 GB
Currently allocated: 0.00 GB
Currently reserved: 0.00 GB


### CELL 2: CONFIGURATION SETTINGS

* model_id: Which model to fine-tune (Llama-3 8B).
**Note - if the model is gatekeeped then it gives error,need to request access from the hugging face then use access via token**

* max_length: Maximum text length for each training example; balances context vs. memory.

* batch_size: How many examples to process at once (1 to save memory).

* gradient_accumulation_steps: Simulates a bigger batch size (1 * 4 = 4) without using more memory, leading to more stable training.

* lora_r & lora_alpha: Controls the "power" of the LoRA fine-tuning. r sets the capacity of the adapters, and alpha is a scaling factor.

* num_epochs: How many times to train on the entire dataset.

* learning_rate: How fast the model learns. This is a critical setting to tune.

* warmup_steps: Slowly ramps up the learning rate at the start to stabilize training.

* eval_steps & save_steps: How often to check progress and save backups during training.

In [None]:
# Model configuration
CONFIG = {
    "model_id": "meta-llama/Meta-Llama-3-8B",
    "max_length": 1024,
    "batch_size": 1,
    "gradient_accumulation_steps": 4,
    "lora_r": 32,
    "lora_alpha": 64,
    "num_epochs": 5,
    "learning_rate": 3e-4,
    "warmup_steps": 50,
    "eval_steps": 25,
    "save_steps": 50,
}

print("Configuration settings:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")


In [6]:
# Your CSV path from the file
csv_path = "/kaggle/input/expert-data/expert_data.csv"

# Load with latin-1 encoding 
df = pd.read_csv(csv_path, encoding='latin-1')

print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"\nNull values per column:")
print(df.isnull().sum())


Dataset shape: (23, 13)
Columns: ['expert_id', 'expert_name', 'expert_headline', 'expert_bio', 'expert_geographies', 'expert_functions', 'expert_domain_other', 'expert_summary', 'expert_work_summary', 'project_relevent_company', 'project_relevent_designation', 'project_agenda_responses', 'agenda']

Null values per column:
expert_id                       0
expert_name                     0
expert_headline                 0
expert_bio                      0
expert_geographies              0
expert_functions                3
expert_domain_other             3
expert_summary                  0
expert_work_summary             0
project_relevent_company        2
project_relevent_designation    2
project_agenda_responses        0
agenda                          0
dtype: int64


In [7]:
# ============================================
# CELL 5: PARSE JSON COLUMNS
# ============================================

def parse_json_safely(json_str):
    """Parse JSON columns safely"""
    if pd.isna(json_str) or json_str == '':
        return None
    try:
        if isinstance(json_str, str):
            json_str = json_str.strip().replace('\ufeff', '')
            return json.loads(json_str)
    except json.JSONDecodeError:
        try:
            return ast.literal_eval(json_str)
        except:
            return None

# Test parsing on first row
test_row = df.iloc[0]
print("Testing JSON parsing...")

responses = parse_json_safely(test_row['project_agenda_responses'])
agenda = parse_json_safely(test_row['agenda'])

print(f"\nParsed responses: {type(responses)}")
if responses:
    print(f"Number of responses: {len(responses)}")
    print(f"First response: {responses[0]}")

print(f"\nParsed agenda: {type(agenda)}")
if agenda:
    print(f"Agenda keys: {agenda.keys()}")
    if 'questions' in agenda:
        print(f"Number of questions: {len(agenda['questions'])}")


Testing JSON parsing...

Parsed responses: <class 'list'>
Number of responses: 6
First response: {'answer': 'Very Comfortable', 'question': 'Provide insights on risk management strategies adopted by multinational banks post-2020.', 'expert_note': "Having served at Lloyds, HSBC, and ABN AMRO, I've led post-pandemic risk recalibration initiatives focusing on operational resilience and credit stress testing."}

Parsed agenda: <class 'dict'>
Agenda keys: dict_keys(['questions'])
Number of questions: 6


In [8]:
# ============================================
# CELL 6: PROCESS CSV DATA
# ============================================

expert_database = {}
training_examples = []
failed_rows = []

for idx, row in df.iterrows():
    try:
        responses = parse_json_safely(row['project_agenda_responses'])
        agenda = parse_json_safely(row['agenda'])
        
        if not responses or not agenda:
            failed_rows.append(idx)
            continue
        
        questions = agenda.get('questions', []) if isinstance(agenda, dict) else []
        
        # Format agenda
        formatted_agenda = "Agenda:\n"
        for i, question in enumerate(questions, 1):
            formatted_agenda += f"{i}. {question}\n"
        
        # Format responses
        expert_responses_detailed = []
        expert_responses_simple = []
        
        for i, response_item in enumerate(responses, 1):
            comfort = response_item.get('answer', '')
            note = response_item.get('expert_note', '')
            
            expert_responses_simple.append(f"{i}. {comfort}")
            
            if note:
                expert_responses_detailed.append(f"{i}. {comfort}\n   Expert Note: {note}")
            else:
                expert_responses_detailed.append(f"{i}. {comfort}")
        
        # Create outputs
        expert_output_detailed = f"""Expert Name: {row['expert_name']}
Headline: {row['expert_headline']}
Bio: {row['expert_bio']}
Work Summary: {row['expert_work_summary']}

Expert Responses:
{chr(10).join(expert_responses_detailed)}"""

        expert_output_simple = f"""Expert Name: {row['expert_name']}
Headline: {row['expert_headline']}
Bio: {row['expert_bio']}
Work Summary: {row['expert_work_summary']}

Expert Responses:
{chr(10).join(expert_responses_simple)}"""
        
        # Store expert
        expert_database[row['expert_name']] = {
            'id': row['expert_id'],
            'output_detailed': expert_output_detailed,
            'output_simple': expert_output_simple
        }
        
        # Add training examples
        training_examples.append({
            'input': formatted_agenda.strip(),
            'output': expert_output_detailed,
            'expert_name': row['expert_name']
        })
        
        training_examples.append({
            'input': formatted_agenda.strip(),
            'output': expert_output_simple,
            'expert_name': row['expert_name']
        })
        
    except Exception as e:
        print(f"Error processing row {idx}: {e}")
        failed_rows.append(idx)

print(f"Processed {len(expert_database)} experts")
print(f"Created {len(training_examples)} training examples")
print(f"Failed rows: {len(failed_rows)}")

# Show example
print("\nExample training data:")
print(f"Input:\n{training_examples[0]['input'][:200]}...")
print(f"\nOutput:\n{training_examples[0]['output'][:200]}...")


✓ Processed 23 experts
✓ Created 46 training examples
✗ Failed rows: 0

Example training data:
Input:
Agenda:
1. Provide insights on risk management strategies adopted by multinational banks post-2020.
2. What are the best practices for achieving sustainable growth in the packaged food and beverage in...

Output:
Expert Name: Rahul Rao test test test test test test test test test test test test
Headline: A CXO Level expert with 29+ years of experience in Food & Beverages and Financial Services industries with ...


### CREATE AUGMENTED DATASET

This cell performs data augmentation to artificially increase the size and variety of training set.

For each original training example that contains an agenda with 4 or more questions, it does the following:

    Keeps the Original: The complete, original example is always included.

    Creates Shorter Versions: It randomly selects a subset of 3 questions and then a subset of 4 questions from the original agenda.

    Builds New Examples: It creates new training data where the input is this new, shorter, partial agenda, but the output remains the same as the original.
    
    This teaches the model to be more robust and flexible. It learns to generate the correct output even when it receives an incomplete or differently ordered agenda, which helps it generalize better to real-world variations.

In [9]:
augmented_examples = []

for example in training_examples:
    # Add original
    augmented_examples.append(example)
    
    # Parse questions
    agenda_lines = []
    for line in example['input'].split('\n'):
        if line.strip() and len(line) > 0 and line[0].isdigit():
            agenda_lines.append(line.strip())
    
    if len(agenda_lines) >= 4:
        # Create partial agenda (3-4 questions)
        for num_q in [3, 4]:
            if num_q < len(agenda_lines):
                selected = random.sample(agenda_lines, num_q)
                partial_agenda = "Agenda:\n"
                for i, q in enumerate(selected, 1):
                    q_text = q.split('. ', 1)[1] if '. ' in q else q
                    partial_agenda += f"{i}. {q_text}\n"
                
                augmented_examples.append({
                    'input': partial_agenda.strip(),
                    'output': example['output'],
                    'expert_name': example['expert_name']
                })

print(f"Original examples: {len(training_examples)}")
print(f"Augmented examples: {len(augmented_examples)}")
print(f"Augmentation ratio: {len(augmented_examples) / len(training_examples):.2f}x")


Original examples: 46
Augmented examples: 138
Augmentation ratio: 3.00x


In [10]:

#SPLIT DATASET

from datasets import Dataset, DatasetDict

# Shuffle and split
random.shuffle(augmented_examples)
split_idx = int(0.9 * len(augmented_examples))

train_data = augmented_examples[:split_idx]
val_data = augmented_examples[split_idx:]

print(f"Train set: {len(train_data)} examples")
print(f"Validation set: {len(val_data)} examples")

# Create datasets
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

print(f"\nTrain dataset features: {train_dataset.features}")
print(f"Train dataset size: {len(train_dataset)}")


Train set: 124 examples
Validation set: 14 examples

Train dataset features: {'input': Value(dtype='string', id=None), 'output': Value(dtype='string', id=None), 'expert_name': Value(dtype='string', id=None)}
Train dataset size: 124


### SETUP MODEL AND TOKENIZER
load the pre-trained Llama-3-8B model and its corresponding tokenizer into the GPU's memory. Because an 8-billion parameter model is very large, this script uses a critical memory-saving technique called 4-bit quantization (QLoRA). This allows the model, which would normally require over 16 GB of VRAM, to fit onto a more modest GPU

In short,:

    Logs into Hugging Face to get access to the model.

    Clears GPU memory to make room.

    Defines the 4-bit quantization settings.

    Loads the tokenizer and the heavily compressed model.

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType

#  Login to Hugging Face
from huggingface_hub import login

#  Paste your Hugging Face token here (required for gated model access)
login(token="HF_TOKEN")  # REPLACE THIS WITH YOUR HF TOKEN
# Clear GPU memory
gc.collect()
torch.cuda.empty_cache()

# 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_id"])
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
print(f"✓ Tokenizer loaded. Vocab size: {len(tokenizer)}")

# Load model
print("\nLoading model...")
model = AutoModelForCausalLM.from_pretrained(
    CONFIG["model_id"],
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    trust_remote_code=True
)
print("Model loaded")

# GPU memory check
if torch.cuda.is_available():
    allocated = torch.cuda.memory_allocated(0) / 1024**3
    print(f"GPU memory allocated: {allocated:.2f} GB")


Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

✓ Tokenizer loaded. Vocab size: 128256

Loading model...


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

✓ Model loaded
GPU memory allocated: 5.31 GB


In [12]:
# ============================================
# CELL 10: APPLY LORA
# ============================================

# Prepare for training
model = prepare_model_for_kbit_training(model)

# LoRA configuration
lora_config = LoraConfig(
    r=CONFIG["lora_r"],
    lora_alpha=CONFIG["lora_alpha"],
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Apply LoRA
model = get_peft_model(model, lora_config)

print("LoRA configuration applied:")
model.print_trainable_parameters()

# Memory check
if torch.cuda.is_available():
    allocated = torch.cuda.memory_allocated(0) / 1024**3
    print(f"\nGPU memory after LoRA: {allocated:.2f} GB")


LoRA configuration applied:
trainable params: 83,886,080 || all params: 8,114,147,328 || trainable%: 1.0338

GPU memory after LoRA: 7.59 GB


In [13]:
# ============================================
# CREATE TOKENIZATION FUNCTION
# ============================================

def create_prompt(example):
    """Create training prompt"""
    expert_names = list(expert_database.keys())[:5]
    
    system_prompt = f"""You are an Expert Recommendation System with a database of specific experts.

EXPERTS IN DATABASE: {', '.join(expert_names)}... and others

RULES:
1. ONLY recommend experts from your database
2. Use EXACT name, headline, bio, and work summary
3. Include specific comfort levels
4. Do NOT create fictional experts"""

    formatted_prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{example['input']}

Recommend the most suitable expert from the database:<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{example['output']}<|eot_id|>"""
    
    return formatted_prompt

# Test prompt creation
test_prompt = create_prompt(train_data[0])
print("Sample prompt length:", len(test_prompt))
print("\nFirst 500 characters:")
print(test_prompt[:500])


Sample prompt length: 1862

First 500 characters:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an Expert Recommendation System with a database of specific experts.

EXPERTS IN DATABASE: Rahul Rao test test test test test test test test test test test test, Sunil Punjabi, Hariharan PV, Vipul Gupta, Guilherme Oliveira ... and others

RULES:
1. ONLY recommend experts from your database
2. Use EXACT name, headline, bio, and work summary
3. Include specific comfort levels
4. Do NOT create fictional experts<|eot_id|><|start_he


In [14]:
# ============================================
# TOKENIZE DATASETS
# ============================================

def tokenize_function(examples):
    """Tokenize examples"""
    prompts = [create_prompt({
        'input': inp,
        'output': out
    }) for inp, out in zip(examples['input'], examples['output'])]
    
    model_inputs = tokenizer(
        prompts,
        max_length=CONFIG["max_length"],
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    
    labels = model_inputs["input_ids"].clone()
    
    # Mask non-response tokens
    for idx, prompt in enumerate(prompts):
        assistant_marker = "<|start_header_id|>assistant<|end_header_id|>\n\n"
        response_start = prompt.find(assistant_marker)
        if response_start != -1:
            response_start += len(assistant_marker)
            pre_response = prompt[:response_start]
            pre_tokens = tokenizer(pre_response, return_tensors="pt")["input_ids"]
            
            if pre_tokens.shape[1] < labels.shape[1]:
                labels[idx, :pre_tokens.shape[1]] = -100
    
    model_inputs["labels"] = labels
    
    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": labels
    }

# Tokenize datasets
print("Tokenizing train dataset...")
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['input', 'output', 'expert_name']
)

print("\nTokenizing validation dataset...")
tokenized_val = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['input', 'output', 'expert_name']
)

print(f"\n✓ Train dataset tokenized: {len(tokenized_train)} examples")
print(f"✓ Val dataset tokenized: {len(tokenized_val)} examples")


Tokenizing train dataset...


Map:   0%|          | 0/124 [00:00<?, ? examples/s]


Tokenizing validation dataset...


Map:   0%|          | 0/14 [00:00<?, ? examples/s]


✓ Train dataset tokenized: 124 examples
✓ Val dataset tokenized: 14 examples


In [15]:
# ============================================
# SETUP TRAINING ARGUMENTS
# ============================================

from transformers import TrainingArguments

output_dir = os.path.join(KAGGLE_WORKING_PATH, "expert-recommender")

# Check transformers version
import transformers
print(f"Transformers version: {transformers.__version__}")

# Create training args compatible with different versions
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=CONFIG["num_epochs"],
    per_device_train_batch_size=CONFIG["batch_size"],
    per_device_eval_batch_size=CONFIG["batch_size"],
    gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    learning_rate=CONFIG["learning_rate"],
    lr_scheduler_type="cosine",
    warmup_steps=CONFIG["warmup_steps"],
    logging_steps=10,
    eval_strategy="steps",  
    eval_steps=CONFIG["eval_steps"],
    save_strategy="steps",
    save_steps=CONFIG["save_steps"],
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=True,
    seed=42,
    report_to="none",
    dataloader_num_workers=2,
    dataloader_pin_memory=True
)

print("Training configuration:")
print(f"  Output directory: {output_dir}")
print(f"  Total training steps: {len(tokenized_train) // CONFIG['gradient_accumulation_steps'] * CONFIG['num_epochs']}")
print(f"  Effective batch size: {CONFIG['batch_size'] * CONFIG['gradient_accumulation_steps']}")


Transformers version: 4.51.3
Training configuration:
  Output directory: /kaggle/working/expert-recommender
  Total training steps: 155
  Effective batch size: 4


In [16]:
# ============================================
# CELL 14: CREATE TRAINER AND TRAIN
# ============================================

from transformers import Trainer, DataCollatorForLanguageModeling

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)

print("Starting training...")


# Train
start_time = datetime.now()
trainer.train()
end_time = datetime.now()


print(f"Training completed in: {end_time - start_time}")


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
25,1.9037,1.2595
50,0.6655,0.502411
75,0.2967,0.2867
100,0.1888,0.20871
125,0.1644,0.182482
150,0.1444,0.177961


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Training completed in: 1:34:25.551145


In [17]:
# ============================================
# SAVE FINAL MODEL
# ============================================

final_model_path = os.path.join(KAGGLE_WORKING_PATH, "final_expert_model")

print(f"Saving model to: {final_model_path}")
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

# List saved files
print("\nSaved files:")
for file in os.listdir(final_model_path):
    size = os.path.getsize(os.path.join(final_model_path, file)) / (1024*1024)
    print(f"  - {file} ({size:.2f} MB)")


Saving model to: /kaggle/working/final_expert_model

Saved files:
  - tokenizer_config.json (0.05 MB)
  - README.md (0.00 MB)
  - adapter_model.safetensors (320.06 MB)
  - adapter_config.json (0.00 MB)
  - special_tokens_map.json (0.00 MB)
  - tokenizer.json (16.41 MB)
  - training_args.bin (0.01 MB)


In [29]:
# ============================================
# TEST INFERENCE 
# ============================================

# Simple inference test
test_agenda = """Agenda:
1. Experience with Media and Entertainment industry?
2. Worked in companies like sony?"""

test_prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are Expert Recommendation System. Recommend the top 2 most suitable expert from your database.<|eot_id|><|start_header_id|>user<|end_header_id|>

{test_agenda}

Recommend the top 2 most suitable experts from the database:<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

# Tokenize
inputs = tokenizer(test_prompt, return_tensors="pt", truncation=True, max_length=512)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

print("Generating response...")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.1,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id
    )

response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
print("\nModel Response:")
print(response)
# print(response[:38000] + "..." if len(response) > 38000 else response)


Generating response...

Model Response:


Expert Name: Sunil Punjabi
Headline: Expert has experience in Media &amp; Entertainment Industry. He Lead multi-location and large size service organization. understanding of  Entertainment &amp; Media- TV, Sports, Music, and Films-Co-production/Theatrical Distribution
Bio: Expert has 15 years of experience in Media &amp; Entertainment Industry.  Currently working with a leading  Media firm as an EVP &amp; Business Head. His expertise includes Broadcasting, Media Production &amp; Theatrical Distribution and also he has Developed strategies to invest in India through launching more AXN boutique channels like AXN HD, AXN Movies, Sony Bee TV, Sony One, Animax among others. Previously he has worked with Cinemax India Ltd, Sony Entertainment Television, Fox Filmed Entertainment, Star TV etc
Expert Responses:
1. Expert has worked in Media &amp; Entertainment Industry.
2. Expert has 15 years of experience in Media &amp; Entertainment Industry.
3. Expe