### Install the Necessary Libraries Before Running the Notebook

In [14]:
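# pip install numpy
# pip install scikit-learn   (the PyPI package is "scikit-learn"; it is imported as "sklearn")
# pip install "openai<1.0"   (this notebook uses the legacy pre-1.0 SDK, e.g. openai.ChatCompletion)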

### Importing Libraries

In [None]:
import json
import os
import time
import openai
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Setup OpenAI API Key

In [None]:
# Read the key with .get() so a missing variable reaches the explicit check
# below instead of failing earlier with an opaque KeyError.
openai.api_key = os.environ.get('OPENAI_API_KEY')  # set your API key in the environment variables

# set your API key here if you don't want to use environment variables
# openai.api_key = "some OpenAI API key"

if not openai.api_key:
    raise ValueError("No OPENAI_API_KEY found in environment variables.")

# Confirm the key is present without echoing the secret itself: notebook
# outputs are saved together with the file and are easily shared by accident.
print("OpenAI API key loaded.")

### Load the Dataset

In [None]:
# Load the raw paper records. Replace 'iclr2024.json' with the path to your
# own paper data file if needed.
# JSON is read explicitly as UTF-8 so non-ASCII text (names, symbols in
# abstracts) does not depend on the platform's default encoding.
with open('iclr2024.json', 'r', encoding='utf-8') as f:
    papers = json.load(f)

def preprocess_data(papers):
    """
    Extract metadata, reviewer scores, and comments for each paper,
    and store them in a Python list.
    We'll convert them to chat format later in the fine_tune_model function.

    Missing keys and JSON ``null`` values are both treated as empty, so a
    partially filled record does not raise.

    Parameters:
    -----------
    papers : list
        Paper dictionaries. The keys read are 'title', 'abstract', 'authors',
        'decision' and 'reviews'; each review may carry 'score' and 'comment'.

    Returns:
    --------
    list
        One dictionary per paper with the keys 'title', 'abstract', 'authors',
        'scores', 'comments' and 'label' ("Accept" or "Reject").
    """
    dataset = []
    for paper in papers:
        # Basic info ("or ''" also covers keys that are present but null)
        title = paper.get('title') or ''
        abstract = paper.get('abstract') or ''

        authors = paper.get('authors') or []
        if isinstance(authors, str):
            # Already a single string; joining it would split it into characters.
            authors_text = authors
        else:
            authors_text = ', '.join(str(author) for author in authors)

        # Normalize acceptance decision: anything that does not start with
        # "accept" (including a missing decision) is labelled "Reject".
        decision = (paper.get('decision') or '').strip()
        label = "Accept" if decision.lower().startswith('accept') else "Reject"

        # Extract reviewer info
        reviews = paper.get('reviews') or []
        if reviews:
            scores = ", ".join(
                'N/A' if r.get('score') is None else str(r['score'])
                for r in reviews
            )
            comments = " ".join((r.get('comment') or '').strip() for r in reviews)
        else:
            scores = "N/A"
            comments = "N/A"

        dataset.append({
            "title": title,
            "abstract": abstract,
            "authors": authors_text,
            "scores": scores,
            "comments": comments,
            "label": label,
        })
    return dataset

# Flatten the raw records into per-paper entries and report how many there are.
dataset = preprocess_data(papers)
print(f"Number of papers: {len(dataset)}")

### Split the Dataset into Training Set and Test Set

In [None]:
# Hold out 20% of the entries as the test set; random_state pins the shuffle.
train_set, test_set = train_test_split(dataset, random_state=0, test_size=0.2)
print(f"Number of training samples: {len(train_set)}")
print(f"Number of test samples: {len(test_set)}")

### Possible Feature Sets

In [None]:
# Feature set passed to both the fine-tuning call and the final evaluation.
# "with_comments" was the best feature set based on validation results.
best_feature_set = "with_comments"  # one of: "metadata_only", "with_scores", "with_comments"

def build_user_content(entry, feature_type):
    """
    Assemble the text of the user message for one paper.

    The prompt always starts with title, abstract and authors and always ends
    with a "Decision:" line. "metadata_only" adds nothing else, "with_scores"
    adds the reviewer scores, and any other value (i.e. "with_comments") adds
    both the reviewer scores and the reviewer comments.
    """
    sections = [
        f"Title: {entry['title']}",
        f"Abstract: {entry['abstract']}",
        f"Authors: {entry['authors']}",
    ]

    if feature_type != "metadata_only":
        sections.append(f"Reviewer Scores: {entry['scores']}")
        if feature_type != "with_scores":
            # Everything that is not one of the two narrower sets gets comments too.
            sections.append(f"Reviewer Comments: {entry['comments']}")

    sections.append("Decision:")
    return "\n".join(sections)

### Helper Function for Fine-tuning

In [20]:
def fine_tune_model(train_data, feature_type):
    """
    Takes the full training set and a feature type,
    prepares a JSONL file in Chat Format, and creates a fine-tuning job.
    The last message is from the assistant, containing the correct label.

    Parameters:
    -----------
    train_data : list
        Entries as consumed by build_user_content, each also carrying a 'label'.
    feature_type : str
        Forwarded to build_user_content to select the prompt contents
        (e.g., "metadata_only", "with_scores", "with_comments").

    Returns:
    --------
    The "id" field of the created fine-tuning job.

    Side effects: writes "chat_fine_tune_data.jsonl" in the working directory,
    uploads it to OpenAI, and prints the uploaded file ID.
    """
    system_prompt = "You are a helpful AI that predicts whether a paper should be Accept or Reject."

    # One chat example per paper: system prompt, user prompt, and the correct
    # label ("Accept" or "Reject") as the final assistant message.
    data_for_finetuning = [
        {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": build_user_content(entry, feature_type)},
                {"role": "assistant", "content": entry["label"]},
            ]
        }
        for entry in train_data
    ]

    # Save to JSONL (one JSON object per line)
    jsonl_path = "chat_fine_tune_data.jsonl"
    with open(jsonl_path, "w") as f:
        for record in data_for_finetuning:
            json.dump(record, f)
            f.write("\n")

    # Upload file to OpenAI; the context manager closes the handle once the
    # upload call has returned instead of leaking it.
    with open(jsonl_path, "rb") as training_file:
        upload_response = openai.File.create(
            file=training_file,
            purpose="fine-tune"
        )
    file_id = upload_response["id"]
    print(f"Uploaded file ID: {file_id}")

    # Create fine-tune job
    fine_tune_response = openai.FineTuningJob.create(
        training_file=file_id,
        model="gpt-4o-2024-08-06",
        suffix="PaperEval",
    )

    return fine_tune_response["id"]

### Fine-tuning Process

In [None]:
# Start fine-tuning
fine_tune_job_id = fine_tune_model(train_set, best_feature_set)
print(f"Fine-Tuning Job started: {fine_tune_job_id}")

# Poll the fine-tuning job status until it reaches a terminal state
poll_interval = 60  # in seconds
while True:
    job_status = openai.FineTuningJob.retrieve(fine_tune_job_id)
    status = job_status["status"]
    print(f"Current fine-tuning job status: {status}")

    if status == "succeeded":
        final_model_id = job_status["fine_tuned_model"]
        print(f"Fine-tuning succeeded! Model ID: {final_model_id}")
        break
    elif status in ("failed", "cancelled"):
        # "cancelled" is terminal as well; without handling it here the loop
        # would keep polling a job that can never succeed.
        raise RuntimeError(f"Fine-tuning job {status}.")
    else:
        time.sleep(poll_interval)

### Evaluate Performance

In [27]:
def evaluate_model(test_data, model_id, feature_type):
    """
    Score the fine-tuned chat model on the test set.

    One chat completion is requested per test entry. The reply is mapped to
    "Accept" when it starts with "accept" (case-insensitive) and to "Reject"
    otherwise. Accuracy, Precision, Recall, and F1-score (with "Accept" as the
    positive class) are then printed.
    """
    system_turn = {
        "role": "system",
        "content": "You are a helpful AI that predicts whether a paper should be Accept or Reject."
    }

    def classify(user_text):
        # Request at most one token at temperature 0 and turn it into a label.
        reply = openai.ChatCompletion.create(
            model=model_id,
            messages=[system_turn, {"role": "user", "content": user_text}],
            max_tokens=1,
            temperature=0.0
        )
        answer = reply['choices'][0]['message']['content'].strip()
        if answer.lower().startswith("accept"):
            return "Accept"
        return "Reject"

    # Build every prompt and collect the gold labels before any API call is made.
    user_texts = [build_user_content(item, feature_type) for item in test_data]
    gold = [item["label"] for item in test_data]
    predicted = [classify(text) for text in user_texts]

    report = [
        ("Accuracy:  ", accuracy_score(gold, predicted)),
        ("Precision: ", precision_score(gold, predicted, pos_label="Accept")),
        ("Recall:    ", recall_score(gold, predicted, pos_label="Accept")),
        ("F1-score:  ", f1_score(gold, predicted, pos_label="Accept")),
    ]

    print("\nEvaluation on Test Set:")
    for heading, value in report:
        print(f"{heading}{value:.4f}")

### Evaluate the Performance (ChatCompletion Approach)

Use this method if you run into an API key permission error.

In [42]:
def evaluate_model(test_data, model_id, feature_type):
    """
    Uses a fine-tuned chat model to predict labels on the test set,
    then calculates Accuracy, Precision, Recall, and F1-score.

    Parameters:
    -----------
    test_data : list
        A list of dictionaries, each containing paper information and a 'label'.
    model_id : str
        The fine-tuned model ID in the format "ft:..." (e.g., "ft:gpt-4:some_id").
    feature_type : str
        Determines which parts of the paper information to include in the user prompt
        (e.g., "metadata_only", "with_scores", "with_comments").

    Returns:
    --------
    dict
        The computed metrics under the keys "accuracy", "precision", "recall"
        and "f1", so callers can compare or store runs. The same values are
        also printed.
    """

    # Prepare the content of the "user" messages for the test set
    test_prompts = [build_user_content(entry, feature_type) for entry in test_data]

    # Get the ground-truth labels for comparison
    true_labels = [entry["label"] for entry in test_data]

    predictions = []

    # For each paper, call the fine-tuned model
    for prompt in test_prompts:
        response = openai.ChatCompletion.create(
            model=model_id,  # e.g., "ft:gpt-3.5-turbo:xxxx"
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful AI that predicts whether a paper should be Accept or Reject."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            max_tokens=1,
            temperature=0.0
        )
        # Extract the model's response from 'assistant' message
        pred_text = response['choices'][0]['message']['content'].strip()

        # Convert the raw text to a label: anything that does not start with
        # "accept" (case-insensitive) counts as "Reject".
        pred_label = "Accept" if pred_text.lower().startswith("accept") else "Reject"
        predictions.append(pred_label)

    # Calculate evaluation metrics; "Accept" is the positive class
    acc = accuracy_score(true_labels, predictions)
    prec = precision_score(true_labels, predictions, pos_label="Accept")
    rec = recall_score(true_labels, predictions, pos_label="Accept")
    f1 = f1_score(true_labels, predictions, pos_label="Accept")

    # Print the final evaluation results
    print("\nEvaluation on Test Set:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-score:  {f1:.4f}")

    # Hand the numbers back as well; previously they were only printed.
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

### Print the Model Information for Debugging

In [None]:
# print the fine-tuning jobs
jobs = openai.FineTuningJob.list()
for job in jobs.data:
    print(job)

# print the fine-tuning job details for debugging
job_id = "some fine-tuning job ID"  # change this to the job ID to inspect
try:
    job_info = openai.FineTuningJob.retrieve(job_id)
    print("Found job:", job_info)
except openai.error.PermissionError:
    print("No permission to access this Fine-Tuning Job.")

# print the models that can be accessed
models = openai.Model.list()
model_ids = [m["id"] for m in models["data"]]
print("Models you can access:", model_ids)

# print the fine-tuned model details for debugging.
# The model is retrieved only inside the guarded block: an unguarded retrieve
# would abort the whole cell on a permission error before the friendly
# message could be shown.
model_id = "some fine-tuned model ID"  # change this to the model ID to inspect
try:
    model_info = openai.Model.retrieve(model_id)
    print("Found model:", model_info)
except openai.error.PermissionError:
    print("No permission to access this model.")

In [None]:
# Replace the placeholder with the ID of the fine-tuned model to evaluate.
final_model_id = "some fine-tuned model ID"
print(f"Final model ID: {final_model_id}")
evaluate_model(test_set, final_model_id, best_feature_set)