<a href="https://www.kaggle.com/code/minhthonglai/ielts-gptj-tuner?scriptVersionId=242602445" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# GPT-J Fine-tuning for Essay Scoring

## System configurations 

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when create a version using "Save & Run All" 
# Can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import sys
import os

# Check Python version
print(f"Python Version: `{sys.version}`")  # Detailed version info
print(f"Base Python location: `{sys.base_prefix}`")
print(f"Current Environment location: `{os.path.basename(sys.prefix)}`", end='\n\n')

Python Version: `3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]`
Base Python location: `/usr`
Current Environment location: `usr`



In [2]:
import torch
import time  # for CPU timing

# Check if GPU is available and being used
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name()}")
    print(f"Device memory: {torch.cuda.get_device_properties(0).total_memory / 1e9} GB")

# This code will work in both CPU and GPU environments
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
    
# Test with a simple operation - modified to work on both CPU and GPU
x = torch.rand(10000, 10000, device=device)
y = torch.rand(10000, 10000, device=device)

if device.type == "cuda":
    # GPU timing approach
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    z = x @ y  # Matrix multiplication
    end.record()
    torch.cuda.synchronize()
    print(f"Operation time: {start.elapsed_time(end)} ms")
else:
    # CPU timing approach
    start_time = time.time()
    z = x @ y  # Matrix multiplication
    end_time = time.time()
    print(f"Operation time: {(end_time - start_time) * 1000} ms")

CUDA available: False
Using device: cpu
Operation time: 16064.372301101685 ms


## Install required libraries

In [3]:
# %pip install transformers peft datasets accelerate bitsandbytes trl
# %pip install sentencepiece

## Import Libraries and Set Configuration

In [4]:
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import os
from datasets import load_dataset

# Check if GPU is available
if torch.cuda.is_available():
    print("GPU is available. Setting up GPU environment...")
    # Set maximum GPU memory usage to avoid OOM errors
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    torch.cuda.empty_cache()
else:
    print("No GPU detected. Running in CPU-only mode.")
    # Ensure CUDA is disabled
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Enable memory efficient attention - works for both CPU and GPU
os.environ["TRANSFORMERS_OFFLINE"] = "1"  # Work offline if model is downloaded

2025-05-29 18:05:28.113018: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748541928.404030      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748541928.482937      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


No GPU detected. Running in CPU-only mode.


## Prepare the IELTS Dataset

### Sample code to create a dataset for IELTS assessment


Need to prepare the dataset with this structure:

```
Format should be:
{
  "input": "Rate this IELTS essay: [student essay text]",
  "output": "Task Achievement: 7.0 - [detailed feedback]\nCoherence and Cohesion: 6.5 - [detailed feedback]\nLexical Resource: 6.0 - [detailed feedback]\nGrammatical Range and Accuracy: 7.0 - [detailed feedback]\nOverall Band Score: 6.5"
}
```

In [5]:
from datasets import Dataset

# Example data - replace with your actual IELTS essays and scores
data = {
    "input": [
        "Rate this IELTS essay: In some countries, the number of shootings increase because many people have guns at home. To what extent do you agree or disagree?",
        "Rate this IELTS Task 1: The graph below shows the proportion of the population aged 65 and over between 1940 and 2040 in three different countries."
    ],
    "output": [
        "Task Achievement: 7.0 - The essay addresses all parts of the task...\nCoherence and Cohesion: 6.5 - The essay is generally well-organized...\nOverall Band Score: 6.5",
        "Task Achievement: 6.0 - The response covers the requirements of the task...\nCoherence and Cohesion: 6.0 - Information is arranged coherently...\nOverall Band Score: 6.0"
    ]
}

In [6]:
dataset = Dataset.from_dict(data)
train_dataset = dataset.shuffle().select(range(int(len(dataset)*0.9)))  # 90% for training
eval_dataset = dataset.shuffle().select(range(int(len(dataset)*0.9), len(dataset)))  # 10% for evaluation

## Load GPT-J with Quantization for Memory Efficiency

In [7]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "EleutherAI/gpt-j-6B"

# Check if GPU is available
if torch.cuda.is_available():
    print("GPU detected! Using 4-bit quantization...")
    # Define quantization configuration
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )
    
    # Load with quantization for GPU
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        torch_dtype=torch.float16,
        device_map="auto"
    )
else:
    print("No GPU detected! For development, using a smaller model instead...")
    # For CPU development, consider using a much smaller model
    # GPT-J is too large for most CPU environments
    cpu_model_id = "distilgpt2"  # Only 82M parameters vs 6B
    model = AutoModelForCausalLM.from_pretrained(
        cpu_model_id,
        low_cpu_mem_usage=True
    )
    print(f"⚠️ NOTE: Using {cpu_model_id} instead of {model_id} for CPU development!")
    print("⚠️ Remember to switch to GPU when ready for actual fine-tuning")

# Get the tokenizer for whatever model we loaded
tokenizer = AutoTokenizer.from_pretrained(model_id if torch.cuda.is_available() else cpu_model_id)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token


No GPU detected! For development, using a smaller model instead...


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

⚠️ NOTE: Using distilgpt2 instead of EleutherAI/gpt-j-6B for CPU development!
⚠️ Remember to switch to GPU when ready for actual fine-tuning


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## Configure LoRA for Parameter-Efficient Fine-Tuning

In [8]:
# Inspect model architecture to find correct layer names
def find_target_modules(model):
    target_modules = []
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear) and module.weight.requires_grad:
            print(f"Found Linear layer: {name}, Shape: {module.weight.shape}")
            target_modules.append(name.split('.')[-1])  # Get just the final part of the name
    
    # Return unique module types
    return list(set(target_modules))

# Run this to see available modules
potential_targets = find_target_modules(model)
print(f"Potential target modules: {potential_targets}")

Found Linear layer: lm_head, Shape: torch.Size([50257, 768])
Potential target modules: ['lm_head']


In [9]:
# For DistilGPT2/GPT-2 style models:
peft_config = LoraConfig(
    r=8,                     # Rank dimension
    lora_alpha=32,           # Alpha parameter for LoRA scaling
    lora_dropout=0.1,        # Dropout probability for LoRA layers
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["c_attn", "c_proj"],  # Standard GPT-2 attention layers
)

# Print configuration
print(f"Using target modules for GPT-2 style architecture: ['c_attn', 'c_proj']")

# Apply LoRA to model
model = get_peft_model(model, peft_config)

# Print trainable parameters to verify setup
model.print_trainable_parameters()


Using target modules for GPT-2 style architecture: ['c_attn', 'c_proj']
trainable params: 405,504 || all params: 82,318,080 || trainable%: 0.4926




## Define Training Arguments

In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,   # Keep small due to memory constraints
    gradient_accumulation_steps=4,   # Accumulate gradients to simulate larger batch
    save_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True if torch.cuda.is_available() else False, # Use mixed precision
    report_to="none",                # Disable wandb reporting
    optim="adamw_torch",
    max_grad_norm=0.3,               # Gradient clipping
    warmup_ratio=0.03,               # Warmup for learning rate
)

In [11]:
class IELTSDataCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        
    def __call__(self, examples):
        # Extract input and output
        inputs = [example["input"] for example in examples]
        outputs = [example["output"] for example in examples]
        
        # Format the text (input followed by output)
        texts = [f"<|user|>\n{inp}\n<|assistant|>\n{out}</s>" for inp, out in zip(inputs, outputs)]
        
        # Tokenize
        batch = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
        
        # For causal language modeling, use the input_ids as labels too
        batch["labels"] = batch["input_ids"].clone()
        
        return batch

# Create the custom data collator
data_collator = IELTSDataCollator(tokenizer)


In [12]:
def preprocess_function(examples):
    # Format the text
    texts = [
        f"<|user|>\n{examples['input'][i]}\n<|assistant|>\n{examples['output'][i]}</s>"
        for i in range(len(examples['input']))
    ]
    
    # Tokenize
    tokenized = tokenizer(texts, padding="max_length", truncation=True, max_length=512)
    
    # For language modeling
    tokenized["labels"] = tokenized["input_ids"].copy()
    
    return tokenized

# Process the dataset
processed_dataset = train_dataset.map(
    preprocess_function, 
    batched=True,
    remove_columns=train_dataset.column_names  # Remove the original columns
)

# Define the Trainer with processed dataset
trainer = Trainer(
    model=model,
    args=training_args,  # No need for remove_unused_columns=False here
    train_dataset=processed_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

NameError: name 'Trainer' is not defined

## Set Up Trainer and Start Fine-Tuning

In [None]:
from transformers import Trainer, DataCollatorForLanguageModeling

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define custom data formatting function
def formatting_func(examples):
    texts = []
    for i in range(len(examples["input"])):
        text = f"<|user|>\n{examples['input'][i]}\n<|assistant|>\n{examples['output'][i]}</s>"
        texts.append(text)
    return texts

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()

## Save the Fine-Tuned Model

In [None]:
# Save the model (just the LoRA weights, not the full model)
model.save_pretrained("./ielts_gptj_model")
tokenizer.save_pretrained("./ielts_gptj_model")

# Download the model to the local machine
from IPython.display import FileLink
FileLink(r'./ielts_gptj_model')


## Test the Fine-Tuned Model

In [None]:
# Test the model on an example
test_input = "Rate this IELTS essay: [paste a sample essay here]"

inputs = tokenizer(test_input, return_tensors="pt").to("cuda")
output = model.generate(
    inputs.input_ids,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.2
)

print(tokenizer.decode(output[0], skip_special_tokens=True))