# Install

In [None]:
!pip install -U "transformers>=4.42.3" bitsandbytes accelerate peft

In [None]:
!pip install einops

# Imports

In [None]:
import importlib.util
import sys
import os

# Directory that holds the local InternLM2 source files loaded below.
directory = 'modeling-internlm'

# Make the directory importable so that imports performed inside those files can resolve.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if directory not in sys.path:
    sys.path.append(directory)


def _load_module_from_file(module_name, file_path):
    """Load ``file_path`` as a module called ``module_name`` and register it.

    The module object is inserted into ``sys.modules`` *before* its code runs, so any
    ``import module_name`` executed while loading (or by a module loaded later) gets this
    same object instead of triggering a second, independent import via ``sys.path``.

    Args:
        module_name: Name under which the module is registered in ``sys.modules``.
        file_path: Path of the Python source file to execute.

    Returns:
        The loaded module object.

    Raises:
        ImportError: If no import spec/loader can be created for ``file_path``.
    """
    module_spec = importlib.util.spec_from_file_location(module_name, file_path)
    if module_spec is None or module_spec.loader is None:
        raise ImportError(f"Cannot load module {module_name!r} from {file_path!r}")
    module = importlib.util.module_from_spec(module_spec)
    sys.modules[module_name] = module
    try:
        module_spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module registered if execution fails.
        sys.modules.pop(module_name, None)
        raise
    return module


# The configuration module is loaded first so it is already registered when the
# modeling module is executed.
configuration_internlm2 = _load_module_from_file(
    "configuration_internlm2", os.path.join(directory, "configuration_internlm2.py")
)
modeling_internlm2 = _load_module_from_file(
    "modeling_internlm2", os.path.join(directory, "modeling_internlm2.py")
)

# Kept for backward compatibility with code that referenced the spec objects directly.
config_spec = configuration_internlm2.__spec__
spec = modeling_internlm2.__spec__

# Expose the public names of both modules in the notebook namespace
# (later cells use InternLM2ForSequenceClassification from here).
from modeling_internlm2 import *
from configuration_internlm2 import *

In [None]:
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import AutoModel, AutoConfig, AutoTokenizer
from sklearn.model_selection import train_test_split
import torch.nn as nn
import os
import time
import copy
from dataclasses import dataclass
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from scipy.special import softmax
from sklearn.preprocessing import LabelEncoder
from transformers import (
    BitsAndBytesConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType,PeftModel
from sklearn.metrics import log_loss, accuracy_score
from sklearn.utils import resample
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor
from transformers.data.data_collator import pad_without_fast_tokenizer_warning

In [None]:
from sklearn.utils import resample
# ---- Configuration ----
TRAIN_CSV = "data/train.csv"
# Number of leading rows kept for this run (the frame is cut with .head() below).
TRAIN_SUBSET_SIZE = 500

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MAX_LENGTH = 1024
target_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
columns_to_vectorize = ["prompt", "response_a", "response_b"]

# ---- Load and label ----
train = pd.read_csv(TRAIN_CSV)

# For every row, take the name of the target column holding the largest value.
train['label'] = train[target_columns].idxmax(axis=1)

# Fit the encoder on the full list of target columns rather than on the labels that
# happen to be present, so the integer ids stay the same even if a class is missing
# from the data. (LabelEncoder numbers classes in sorted order.)
label_encoder = LabelEncoder()
label_encoder.fit(target_columns)
train['label'] = label_encoder.transform(train['label'])

# Keep only the text columns and the integer label.
train = train[columns_to_vectorize + ['label']]

# Class balance and size of the full frame.
print(train['label'].value_counts())

print(len(train))

# Restrict to the first TRAIN_SUBSET_SIZE rows.
train = train.head(TRAIN_SUBSET_SIZE)

# Class balance of the subset that is actually used.
print(train['label'].value_counts())

In [None]:
# Tokenizer is fetched by name; trust_remote_code lets the checkpoint's own tokenizer code run.
tokenizer_source = "internlm/internlm2-7b-reward"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, trust_remote_code=True)

# Padding setup: pad on the right and reuse the EOS token as the pad token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True

# Write a local copy so inference can load the tokenizer without network access.
tokenizer.save_pretrained('tokenizer')

In [None]:
def preprocess_chat(tokenizer, conversation, reward_token_id=92527, max_length=None):
    """Render a conversation with the chat template and encode it to a fixed length.

    The rendered text is encoded without automatic special tokens, truncated/padded to
    ``max_length - 1`` positions, and ``reward_token_id`` is appended as the final
    position, giving ``max_length`` positions in total.

    Unlike an all-ones mask, the returned attention mask is taken from the tokenizer, so
    padding positions are marked ``False``; the appended final token is marked ``True``.

    Args:
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``__call__``.
        conversation: List of ``{"role": ..., "content": ...}`` messages.
        reward_token_id: Token id appended after the encoded text. Defaults to 92527
            (presumably the reward-model marker token of the checkpoint -- TODO confirm).
        max_length: Total number of positions to produce. Defaults to the module-level
            ``MAX_LENGTH``.

    Returns:
        dict with ``input_ids`` (integer tensor with a leading batch dimension of 1) and
        ``attention_mask`` (bool tensor of the same shape).
    """
    if max_length is None:
        max_length = MAX_LENGTH

    # Apply the chat template but keep the result as text; it is encoded explicitly below.
    conversation_str = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)

    # One position is reserved for the token appended afterwards.
    encoded = tokenizer(
        conversation_str,
        return_tensors="pt",
        add_special_tokens=False,
        padding="max_length",
        truncation=True,
        max_length=max_length - 1,
        return_attention_mask=True,
    )
    body_ids = encoded["input_ids"]
    body_mask = encoded["attention_mask"].to(torch.bool)

    batch_size = body_ids.size(0)
    tail_ids = torch.full((batch_size, 1), reward_token_id, dtype=body_ids.dtype)
    tail_mask = torch.ones((batch_size, 1), dtype=torch.bool)

    input_ids = torch.cat([body_ids, tail_ids], dim=1)
    attention_mask = torch.cat([body_mask, tail_mask], dim=1)

    return {"input_ids": input_ids, "attention_mask": attention_mask}

def get_tokens(tokenizer, prompt, response_a, response_b):
    """Build the three-turn conversation for one comparison and tokenize it.

    The prompt is placed in a ``user`` turn and the two candidate answers in
    ``assistant_a`` and ``assistant_b`` turns, in that order; the result of
    ``preprocess_chat`` on that conversation is returned unchanged.
    """
    turns = (
        ("user", prompt),
        ("assistant_a", response_a),
        ("assistant_b", response_b),
    )
    conversation = [{"role": role, "content": text} for role, text in turns]
    return preprocess_chat(tokenizer, conversation)

def tokenize(example, tokenizer):
    """Convert one dataset row into model inputs.

    Args:
        example: Mapping with ``prompt``, ``response_a``, ``response_b`` and ``label``.
        tokenizer: Tokenizer forwarded to ``get_tokens``.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` as flat Python lists and
        ``labels`` as an int (0 for Response A, 1 for Response B, 2 for Tie).
    """
    tokens = get_tokens(tokenizer, example['prompt'], example['response_a'], example['response_b'])

    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]

    # Safety net for over-long sequences: clip to exactly MAX_LENGTH positions while
    # keeping the final position, which holds the token appended by preprocess_chat.
    # (Previously this cut to MAX_LENGTH - 1 and discarded that final token.)
    if input_ids.size(1) > MAX_LENGTH:
        input_ids = torch.cat([input_ids[:, :MAX_LENGTH - 1], input_ids[:, -1:]], dim=1)
        attention_mask = torch.cat([attention_mask[:, :MAX_LENGTH - 1], attention_mask[:, -1:]], dim=1)

    # The label column is already integer-encoded; int() normalises the scalar type.
    label = int(example['label'])

    return {
        "input_ids": input_ids.squeeze(0).tolist(),  # drop the batch dimension
        "attention_mask": attention_mask.squeeze(0).tolist(),  # drop the batch dimension
        "labels": label
    }

def load_data(df, tokenizer):
    """Turn a pandas frame into a tokenized ``Dataset``.

    Every row is passed through ``tokenize`` (with ``tokenizer`` supplied as a keyword
    argument) and all original columns are dropped from the result.
    """
    source = Dataset.from_pandas(df)
    return source.map(
        tokenize,
        fn_kwargs={'tokenizer': tokenizer},
        remove_columns=source.column_names,
    )

In [None]:
def compute_metrics(eval_preds: EvalPrediction) -> dict:
    """Compute accuracy and multi-class log loss from raw model outputs.

    Args:
        eval_preds: Evaluation output whose ``predictions`` hold the per-class logits
            (or a tuple whose first element does) and whose ``label_ids`` hold the
            integer class labels.

    Returns:
        dict with ``acc`` (accuracy of the argmax prediction) and ``log_loss``
        (log loss of the softmax probabilities).
    """
    logits = eval_preds.predictions
    # Some models return extra outputs alongside the logits; keep only the first entry.
    if isinstance(logits, tuple):
        logits = logits[0]
    labels = eval_preds.label_ids

    probs = torch.from_numpy(logits).float().softmax(-1).numpy()

    # Pass the full set of class ids explicitly: without it log_loss infers the classes
    # from the labels and fails when an evaluation split lacks one of them.
    class_ids = np.arange(probs.shape[-1])
    loss = log_loss(y_true=labels, y_pred=probs, labels=class_ids)

    acc = accuracy_score(y_true=labels, y_pred=logits.argmax(-1))
    return {"acc": acc, "log_loss": loss}

# Split configuration: rows are assigned to folds by index modulo n_splits,
# and fold number fold_idx is the one held out for evaluation.
n_splits = 20
fold_idx = 0
ds = load_data(train, tokenizer)

# folds[k] = (indices used for training, indices held out) for held-out fold k.
num_rows = len(ds)
folds = [
    (
        [row for row in range(num_rows) if row % n_splits != held_out],
        list(range(held_out, num_rows, n_splits)),
    )
    for held_out in range(n_splits)
]
train_idx, eval_idx = folds[fold_idx]

In [None]:
# Sanity check: display the last token id of the first tokenized example.
ds[0]['input_ids'][-1]

In [None]:
# Read the configuration from the local checkpoint directory, requesting three output labels.
config = AutoConfig.from_pretrained(
    "/kaggle/input/internlm_quantified_7b/transformers/default/1",
    num_labels=3,
    trust_remote_code=True,
)

In [None]:
# Build the sequence-classification model from the local checkpoint directory using the
# configuration loaded earlier; device placement is left to device_map='auto'.
# NOTE(review): ignore_mismatched_sizes is not passed, so head shapes in the checkpoint
# are expected to match the configuration -- confirm.
checkpoint_dir = '/kaggle/input/internlm_quantified_7b/transformers/default/1'
model = InternLM2ForSequenceClassification.from_pretrained(
    checkpoint_dir,
    device_map='auto',
    trust_remote_code=True,
    config=config,
)

In [None]:
def count_parameters(model):
    """Return the total element count of the parameters that have ``requires_grad`` set."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Count the parameters of the freshly loaded `model` that currently require gradients.
total_params = count_parameters(model)

# Print the count with thousands separators.
print(f"Total trainable parameters: {total_params:,}")


In [None]:
# Names of the modules that receive LoRA adapters.
lora_target_modules = ['wqkv', 'wo', 'w1', 'w2', 'w3']

# LoRA settings for a sequence-classification task, configured for training
# (inference_mode=False) with no bias terms trained.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
)

In [None]:
# Disable the generation cache on the config before preparing the model for training.
model.config.use_cache = False
# PEFT's preparation step for k-bit training; `model` is rebound to its return value.
model = prepare_model_for_kbit_training(model)
# Wrap the prepared model with the adapters described by `peft_config`.
model = get_peft_model(model, peft_config)
# Show the wrapped module tree, then PEFT's summary of trainable parameters.
print(model)
model.print_trainable_parameters()

In [None]:
# A parameter is trainable exactly when its name contains one of these markers
# (LoRA adapter weights and the v_head); every other parameter is frozen.
trainable_markers = ("lora_", "v_head")
for name, param in model.named_parameters():
    param.requires_grad = any(marker in name for marker in trainable_markers)

# List the parameters that ended up trainable.
trainable_names = [name for name, param in model.named_parameters() if param.requires_grad]
for name in trainable_names:
    print(f"Trainable parameter: {name}")

In [None]:
# Re-print the trainable-parameter summary after the requires_grad flags were set by name.
model.print_trainable_parameters()

In [None]:
# Training configuration.
# `eval_strategy` replaces the deprecated `evaluation_strategy` keyword; it is available
# in every transformers release allowed by the install cell (>=4.42.3).
args = TrainingArguments(
    output_dir='output-h100-1',
    overwrite_output_dir=True,
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="steps",
    save_steps=25,  # Save every 25 steps
    logging_strategy="steps",
    logging_steps=10,  # Log every 10 steps
    warmup_steps=10,  # Warmup steps
    optim="adamw_8bit",
    learning_rate=5e-6,  # Learning rate
    per_device_train_batch_size=16,  # Batch size of 16 sequences
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,  # Effectively no gradient accumulation
    max_grad_norm=0.5,  # Gradient clipping
    num_train_epochs=1,  # Number of epochs
    weight_decay=0.01,
    fp16=True,  # Mixed precision training
    metric_for_best_model="log_loss",  # Key returned by compute_metrics
    greater_is_better=False,  # Lower log loss is better
    report_to="none",
)


In [None]:
# Materialise the two splits of the chosen fold and the padding collator first,
# then hand everything to the Trainer.
train_split = ds.select(train_idx)
eval_split = ds.select(eval_idx)
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    data_collator=padding_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Marker in the output so the start of the training run is easy to find in the logs.
print("## Starting to train ##")

In [None]:
# Run training with the Trainer configured in the earlier cell.
trainer.train()

In [None]:
# Destination directory for the raw PyTorch checkpoint; created if missing.
save_path = 'output/H100-full_model_pytorch-1'
os.makedirs(save_path, exist_ok=True)

# Dump the complete state dict of the (PEFT-wrapped) model, v_head included.
weights_file = os.path.join(save_path, 'pytorch_model.bin')
torch.save(model.state_dict(), weights_file)

# Store the model configuration next to the weights.
model.config.save_pretrained(save_path)

In [None]:
# Fold the adapters into the base model and rebind `model` to the merged result.
# Because `model` is overwritten, this cell is not meant to be run twice in one session.
# NOTE(review): if the base weights are quantized, merging may be lossy -- confirm.
model = model.merge_and_unload()
# Save the merged model in Hugging Face format.
model.save_pretrained('H100-merged-1')