In [6]:
%%writefile constants.py
# ---------------------------------------------------------------------------
# Kaggle input locations
# ---------------------------------------------------------------------------
# GPTQ-quantized base model directory.
BASE_MODEL_PATH = "/kaggle/input/qwen2.5/transformers/0.5b-instruct-gptq-int4/1"
# IMPORTANT: change LORA_PATH so it points to the Kaggle dataset that holds
# your saved LoRA adapter.
LORA_PATH = "/kaggle/input/qwen2-5-lora/"
# Competition data directory.
DATA_PATH = "/kaggle/input/jigsaw-agile-community-rules/"

# ---------------------------------------------------------------------------
# Prompt vocabulary
# ---------------------------------------------------------------------------
# The two answer labels and the phrase that precedes an answer in the prompt.
POSITIVE_ANSWER, NEGATIVE_ANSWER = "Yes", "No"
COMPLETE_PHRASE = "Answer:"
# Instruction placed at the top of every prompt (a single line of text).
BASE_PROMPT = (
    "You are given a comment from reddit and a rule. "
    "Your task is to classify whether the comment violates the rule. "
    "Only respond Yes/No."
)

Writing constants.py


In [7]:
%%writefile utils.py
import pandas as pd
from datasets import Dataset
from constants import POSITIVE_ANSWER, NEGATIVE_ANSWER, COMPLETE_PHRASE, BASE_PROMPT
import random, numpy as np
# Seed the global `random` and NumPy RNGs with a fixed value. This runs as a
# side effect of importing this module, so any later use of those global
# generators in the importing process starts from the same state on every run.
random.seed(42)
np.random.seed(42)


def build_prompt(row):
    """
    Constructs the few-shot classification prompt for a single row.

    The prompt contains, in order: the shared instruction (BASE_PROMPT), the
    subreddit and rule, one example followed by the positive answer, one
    example followed by the negative answer, and finally the comment to
    classify followed by COMPLETE_PHRASE with nothing after it.

    Args:
        row: Mapping-like object (for example a pandas row) that provides the
            keys "subreddit", "rule", "positive_example", "negative_example"
            and "body".

    Returns:
        str: The formatted prompt. It begins with a newline and ends directly
        after COMPLETE_PHRASE.
    """
    # The demonstration answers are taken from POSITIVE_ANSWER / NEGATIVE_ANSWER
    # instead of being hard-coded, so the labels shown in the examples cannot
    # drift away from the label constants used elsewhere.
    return f"""
{BASE_PROMPT}

Subreddit: r/{row["subreddit"]}
Rule: {row["rule"]}
Examples:
1) {row["positive_example"]}
{COMPLETE_PHRASE} {POSITIVE_ANSWER}

2) {row["negative_example"]}
{COMPLETE_PHRASE} {NEGATIVE_ANSWER}

---
Comment: {row["body"]}
{COMPLETE_PHRASE}"""


def build_dataset(dataframe):
    """
    Builds a dataset suitable for inference.

    One prompt is generated per row with build_prompt, in row order.

    Args:
        dataframe: pandas DataFrame whose rows provide the fields read by
            build_prompt. The frame is NOT modified by this function.

    Returns:
        datasets.Dataset: Dataset with a "prompt" column holding one prompt
        per input row. The input's index is carried over to the intermediate
        frame handed to Dataset.from_pandas.
    """
    # Build the prompts into a fresh list rather than assigning a new column
    # on the caller's frame: this avoids mutating the argument and also works
    # when the frame has zero rows.
    prompts = [build_prompt(row) for _, row in dataframe.iterrows()]
    prompt_frame = pd.DataFrame({"prompt": prompts}, index=dataframe.index)
    return Dataset.from_pandas(prompt_frame)

Writing utils.py


In [8]:
%%writefile inference.py
import os
os.environ["VLLM_USE_V1"] = "0"

import vllm
import torch
import pandas as pd
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
from vllm.lora.request import LoRARequest
from utils import build_dataset
from constants import BASE_MODEL_PATH, LORA_PATH, DATA_PATH, POSITIVE_ANSWER, NEGATIVE_ANSWER
import random
import multiprocessing as mp


def run_inference_on_device(df_slice):
    """
    Runs vLLM inference on a slice of the data using the pre-trained LoRA adapter.

    A prompt is built for every row, a single token is generated per prompt
    with the output constrained to the two answer labels, and the
    log-probabilities reported for that token are collected.

    Args:
        df_slice: pandas DataFrame with a "row_id" column plus the columns
            required by build_dataset.

    Returns:
        pandas.DataFrame: One row per input row with the columns
        POSITIVE_ANSWER and NEGATIVE_ANSWER (log-probabilities) and "row_id".
        If df_slice has no rows, an empty frame with those columns is
        returned without loading the model.

    Raises:
        KeyError: If one of the two answer labels is missing from the
            returned log-probabilities.
    """
    # Nothing to score: skip the model load and return an empty frame that
    # still has the columns the caller selects.
    if len(df_slice) == 0:
        return pd.DataFrame(columns=[POSITIVE_ANSWER, NEGATIVE_ANSWER, "row_id"])

    llm = vllm.LLM(
        BASE_MODEL_PATH,
        quantization="gptq",
        tensor_parallel_size=1,
        gpu_memory_utilization=0.98,
        trust_remote_code=True,
        dtype="half",
        enforce_eager=True,
        max_model_len=2836,
        disable_log_stats=True,
        enable_prefix_caching=True,
        enable_lora=True,
        max_lora_rank=64,
    )

    tokenizer = llm.get_tokenizer()
    # Constrain the output to be either "Yes" or "No"
    mclp = MultipleChoiceLogitsProcessor(tokenizer, choices=[POSITIVE_ANSWER, NEGATIVE_ANSWER])

    test_dataset = build_dataset(df_slice)
    texts = test_dataset["prompt"]

    # Generate predictions using the specified LoRA adapter
    outputs = llm.generate(
        texts,
        vllm.SamplingParams(
            skip_special_tokens=True,
            max_tokens=1,              # only the answer token is needed
            logits_processors=[mclp],
            logprobs=2,                # ask for log-probabilities of two candidates
        ),
        use_tqdm=True,
        # NOTE(review): newer vLLM releases appear to prefer `lora_path` over
        # `lora_local_path` -- verify against the installed vLLM version.
        lora_request=LoRARequest(
            lora_name="default", # A unique name for the LoRA request
            lora_int_id=1,      # A unique integer ID for the LoRA adapter
            lora_local_path=LORA_PATH # Path to the saved LoRA adapter
        )
    )

    # Extract log probabilities: logprobs[0] belongs to the first (and only)
    # generated token; map each reported candidate's text to its log-prob.
    log_probs = [
        {lp.decoded_token: lp.logprob for lp in out.outputs[0].logprobs[0].values()}
        for out in outputs
    ]
    predictions = pd.DataFrame(log_probs)[[POSITIVE_ANSWER, NEGATIVE_ANSWER]]
    predictions["row_id"] = df_slice["row_id"].values
    return predictions


def worker(device_id, df_slice, return_dict):
    """
    Process entry point that scores one slice of the data.

    Args:
        device_id: Index of the GPU this process should use; also the key
            under which the result is stored.
        df_slice: DataFrame slice handed to run_inference_on_device.
        return_dict: Shared mapping that receives the predictions under
            the key ``device_id``.
    """
    row_count = len(df_slice)

    # Expose only the assigned GPU to this process.
    os.environ["CUDA_VISIBLE_DEVICES"] = f"{device_id}"
    print(f"[Worker {device_id}] Running on GPU {device_id}, data size={row_count}")

    # Run inference and publish the result for the parent process.
    return_dict[device_id] = run_inference_on_device(df_slice)


def main():
    """
    Main function for parallelized inference.

    Steps:
      1. Read test.csv from DATA_PATH.
      2. For every row pick one of the two positive and one of the two
         negative examples at random for the few-shot prompt.
      3. Split the rows into contiguous slices, one per worker process
         (at most one worker per GPU and never more workers than rows).
      4. Run the workers, verify that all of them succeeded, and concatenate
         their predictions in slice order.
      5. Convert the positive-answer log-probability to a rank-based score
         and write submission.csv.

    Raises:
        RuntimeError: If any worker exits with a non-zero code or does not
            deliver a result.
    """
    test_dataframe = pd.read_csv(os.path.join(DATA_PATH, "test.csv"))

    # Randomly select one positive and one negative example for the few-shot prompt
    test_dataframe["positive_example"] = test_dataframe.apply(
        lambda row: random.choice([row["positive_example_1"], row["positive_example_2"]]),
        axis=1
    )
    test_dataframe["negative_example"] = test_dataframe.apply(
        lambda row: random.choice([row["negative_example_1"], row["negative_example_2"]]),
        axis=1
    )
    test_dataframe = test_dataframe.drop(
        columns=["positive_example_1", "positive_example_2", "negative_example_1", "negative_example_2"],
        errors="ignore"
    )

    # Split the data for parallel processing. The worker count is capped by
    # the number of rows so that no worker receives an empty slice; with at
    # least as many rows as GPUs this is the same split as one slice per GPU.
    num_gpus = torch.cuda.device_count()
    num_workers = max(1, min(num_gpus, len(test_dataframe)))
    if num_workers > 1:
        chunk = len(test_dataframe) // num_workers
        dfs = [
            test_dataframe.iloc[i * chunk:(i + 1) * chunk].reset_index(drop=True)
            for i in range(num_workers - 1)
        ]
        # The last slice also takes the remainder rows.
        dfs.append(test_dataframe.iloc[(num_workers - 1) * chunk:].reset_index(drop=True))
    else:
        dfs = [test_dataframe]

    # The context manager shuts the manager process down once results are copied out.
    with mp.Manager() as manager:
        return_dict = manager.dict()

        processes = [mp.Process(target=worker, args=(i, df, return_dict)) for i, df in enumerate(dfs)]
        for p in processes:
            p.start()
        for p in processes:
            p.join()

        # Fail loudly if a worker crashed instead of hitting a KeyError below.
        failed = [
            i for i, p in enumerate(processes)
            if p.exitcode != 0 or i not in return_dict
        ]
        if failed:
            raise RuntimeError(
                f"Inference worker(s) {failed} failed or returned no result; "
                "see the worker output above for details."
            )

        # Copy the results out of the shared dict before the manager stops.
        results = [return_dict[i] for i in range(len(dfs))]

    # Combine the results (slice order matches the original row order)
    predictions = pd.concat(results, ignore_index=True)

    # Create the submission file
    submission = predictions[["row_id", POSITIVE_ANSWER]].rename(columns={POSITIVE_ANSWER: "rule_violation"})

    # Normalize probabilities to ranks for a robust submission format
    submission['rule_violation'] = submission['rule_violation'].rank(method='average') / (len(submission) + 1)

    submission.to_csv("submission.csv", index=False)
    print("✅ Saved submission.csv")


if __name__ == "__main__":
    main()

Writing inference.py


In [9]:
# Run the inference script
!python inference.py

[Worker 0] Running on GPU 0, data size=5
[Worker 1] Running on GPU 1, data size=5
2025-09-30 20:18:25.764893: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-09-30 20:18:25.764893: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759263506.135834     202 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759263506.135972     204 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759263506.244833     202 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS w

In [10]:
# Display the first few rows of the submission file
!head submission.csv

row_id,rule_violation
2029,0.13636363636363635
2030,0.7272727272727273
2031,0.6363636363636364
2032,0.13636363636363635
2033,0.45454545454545453
2034,0.5454545454545454
2035,0.8181818181818182
2036,0.36363636363636365
2037,0.2727272727272727
