In [None]:
!pip install transformers==4.41.0
!pip install jsonlines
!pip install pandas==2.0.3
!pip install datasets==2.14.5
!pip install torchmetrics==0.11.4
!pip install datasets transformers accelerate

Collecting transformers==4.41.0
  Downloading transformers-4.41.0-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.0)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.41.0-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
   

In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import numpy as np
from transformers import AdamW
import os
from datasets import Dataset
from sklearn.metrics import precision_score, recall_score, f1_score
import json
import jsonlines
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
from torchmetrics import Accuracy, Precision, F1Score, Recall
import random
torch.manual_seed(1234)
np.random.seed(1234)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data_dir = './drive/MyDrive/cat-bench'
folders = ['train_must_why', 'test_must_why', 'val_must_why']

# file types
file_types = ['nondependent_real_before', 'nondependent_real_after',
              'dependent_real_before', 'dependent_real_after']

# dictionary to hold data
data = {folder: {} for folder in folders}

# load data from the folders
for folder in folders:
    folder_path = os.path.join(data_dir, folder)

    for file_type in file_types:
        file_path = os.path.join(folder_path, f'{file_type}.jsonl')
        data[folder][file_type] = []

        # Read the .jsonl file
        with jsonlines.open(file_path) as reader:
            for obj in reader:
                # append labels based on file type, 0 if in nondependent 1 if else
                if file_type.startswith('nondependent'):
                    obj['label'] = 0
                else:
                    obj['label'] = 1

                # store in the folder based off file_type list
                data[folder][file_type].append(obj)

combined_data = {}

for folder in folders:
    combined_data[folder] = []

    # nondependent data being split in two by before and after, since they are the same samples just the questions worded differently.
    nondependent_before = data[folder]['nondependent_real_before']
    nondependent_after = data[folder]['nondependent_real_after']
    half_before = len(nondependent_before) // 2
    half_after = len(nondependent_after) // 2

    # combine first half of before and second half of after
    combined_nondependent = nondependent_before[:half_before] + nondependent_after[half_after:]

    # dependent, samething as above just for dependent
    dependent_before = data[folder]['dependent_real_before']
    dependent_after = data[folder]['dependent_real_after']
    half_before = len(dependent_before) // 2
    half_after = len(dependent_after) // 2

    # combine first half of before and second half of after
    combined_dependent = dependent_before[:half_before] + dependent_after[half_after:]

    # add to combined data
    combined_data[folder].extend(combined_nondependent)
    combined_data[folder].extend(combined_dependent)

example_plan = combined_data['train_must_why'][0]

In [None]:
#return prompt, index removed
def create_prompt(steps, question, q_idx):
    # remove random step here
    valid_indices = [i for i in range(0, q_idx[1]) if i != q_idx[0]]

    if (len(valid_indices) == 0) or (valid_indices == None):
        return None, None

    index_to_remove = random.choice(valid_indices)


    steps_text = " ".join([f"Step {i+1}: {step}" for i, step in enumerate(steps) if i != index_to_remove])
    prompt = f"{steps_text}\nQuestion: {question}"
    return prompt, index_to_remove

# prep training data with labels
training_data = []
removed_data = []
for plan in combined_data['train_must_why']:
    steps = plan['steps']
    question = plan['binary_question']
    label = plan['label']
    q_idx = plan['step_pair_idx_asked_about']

    # if the step_idx is 0, 1, remove that from the list of training items
    # we want to remove a random step before the the last step
    # so if 3, 4, then we remove a random index before 4 but making sure its not 3
    # also, we before or after dataset, we will take half of the before and half of the after

    # make prompt
    prompt, removed_index = create_prompt(steps, question, q_idx)

    if prompt == None:
      continue

    # training set to
    training_data.append({
        'text_input': prompt,
        'output': str(label)
    })

    removed_data.append({
        'question_idx': plan['plan_idx'],
        'removed_idx' : removed_index
    })

training_df = pd.DataFrame(training_data)
training_df.to_csv('./drive/MyDrive/rds_training_data.csv', index=False)

In [None]:
# prep test data with labels
test_data = []
test_removed_data = []
for plan in combined_data['test_must_why']:
    steps = plan['steps']
    question = plan['binary_question']
    label = plan['label']
    q_idx = plan['step_pair_idx_asked_about']

    # make prompt
    prompt, removed_index = create_prompt(steps, question, q_idx)

    if prompt == None:
      continue

    # test set to
    test_data.append({
        'text_input': prompt,
        'output': str(label)
    })

    test_removed_data.append({
        'question_idx': plan['plan_idx'],
        'removed_idx' : removed_index
    })

test_df = pd.DataFrame(test_data)
test_df.to_csv('./drive/MyDrive/rds_test_data.csv', index=False)

# BASE MODEL TESTING


In [None]:
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Path to your CSV dataset
DATA_PATH = './drive/MyDrive/rds_test_data.csv'

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Extract input texts and labels from the dataframe
texts = df['text_input'].tolist()
true_labels = df['output'].tolist()  # assumed to be 0 or 1

# Load a DistilBERT model and tokenizer
# Replace with your specific model if you have one fine-tuned for the task.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

# Tokenize the input
encodings = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")

# Run inference
with torch.no_grad():
    outputs = model(**encodings)

# Get predictions (logits)
logits = outputs.logits
predicted_labels = torch.argmax(logits, dim=1).tolist()

# Compute metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, zero_division=0)
recall = recall_score(true_labels, predicted_labels, zero_division=0)
f1 = f1_score(true_labels, predicted_labels, zero_division=0)

# Print results
print("Evaluation Metrics:")
print(f"  Accuracy:  {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1-Score:  {f1:.4f}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Evaluation Metrics:
  Accuracy:  0.4717
  Precision: 0.2687
  Recall:    0.0258
  F1-Score:  0.0471



# FINE TUNING


In [54]:
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [55]:
train_data = pd.read_csv('./drive/MyDrive/rds_training_data.csv')
test_data = pd.read_csv('./drive/MyDrive/rds_test_data.csv')

In [56]:
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples["text_input"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.rename_column("output", "labels")
test_dataset = test_dataset.rename_column("output", "labels")

Map:   0%|          | 0/6799 [00:00<?, ? examples/s]

Map:   0%|          | 0/1380 [00:00<?, ? examples/s]

In [58]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",          # Output directory for checkpoints and metrics
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=16,  # Batch size per device during training
    per_device_eval_batch_size=64,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # Evaluate at the end of each epoch
    load_best_model_at_end=True,  # Load the best model when finished training
    metric_for_best_model="f1",     # Use f1 score to determine the best model
    save_strategy="epoch"           # Save checkpoints at the end of each epoch to match evaluation strategy
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [60]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

eval_results = trainer.evaluate()
print(eval_results)

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6646,0.666706,0.603623,0.518909,0.671982,0.422636
2,0.6318,0.624986,0.668841,0.684174,0.660881,0.709169
3,0.5416,0.620046,0.676812,0.699461,0.660305,0.743553


{'eval_loss': 0.6200461387634277, 'eval_accuracy': 0.6768115942028986, 'eval_f1': 0.6994609164420486, 'eval_precision': 0.6603053435114504, 'eval_recall': 0.7435530085959885, 'eval_runtime': 13.0228, 'eval_samples_per_second': 105.968, 'eval_steps_per_second': 1.689, 'epoch': 3.0}


# ONE SHOT


In [61]:
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset

# Custom instructions you want to add to each prompt
base_instructions = """
Consider the following example as a precursor to a question:

Gather ingredients: flour, eggs
[Missing Step]
Mix the whisked eggs with the flour.

In this example, for "whisked eggs" to be added to the "flour," the eggs must first be cracked and whisked.
Therefore, these steps are dependent because the state of the "eggs" must change before they can be mixed with the "flour."

For the following plan and question, return ONLY 0 if the pair in question is non-dependent and 1 if the pair is dependent.
Again, your response should only be either a 0 or 1.

"""

# Load and Edit Data
train_data = pd.read_csv('./drive/MyDrive/rds_training_data.csv')
test_data = pd.read_csv('./drive/MyDrive/rds_test_data.csv')
# For example, prepend the base instructions to each prompt
train_data['text_input'] = base_instructions + train_data['text_input']
test_data['text_inputt'] = base_instructions + test_data['text_input']

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Tokenizer and Model Setup
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

def tokenize_function(examples):
    # Tokenize the text only
    return tokenizer(examples["text_input"], truncation=True, padding=False)

train_tokenized = train_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

# Rename 'output' column to 'labels'
train_tokenized = train_tokenized.rename_column("output", "labels")
test_tokenized = test_tokenized.rename_column("output", "labels")

# Remove original text column, keep 'labels'
train_tokenized = train_tokenized.remove_columns(["text_input"])
test_tokenized = test_tokenized.remove_columns(["text_input"])

train_tokenized.set_format("torch")
test_tokenized.set_format("torch")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metrics Function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, zero_division=0)
    recall = recall_score(labels, predictions, zero_division=0)
    f1 = f1_score(labels, predictions, zero_division=0)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Training Arguments and Trainer
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # The fix: Align save_strategy with evaluation_strategy
    save_strategy="epoch"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the Model
trainer.train()

# Evaluate the Model
metrics = trainer.evaluate()
print(metrics)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6799 [00:00<?, ? examples/s]

Map:   0%|          | 0/1380 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6903,0.655592,0.601449,0.597625,0.648997,0.622253
2,0.5805,0.794481,0.584783,0.556155,0.886819,0.6836
3,0.6435,0.833633,0.614493,0.57757,0.885387,0.699095


{'eval_loss': 0.8336326479911804, 'eval_accuracy': 0.6144927536231884, 'eval_precision': 0.577570093457944, 'eval_recall': 0.8853868194842407, 'eval_f1': 0.6990950226244343, 'eval_runtime': 7.8371, 'eval_samples_per_second': 176.085, 'eval_steps_per_second': 22.074, 'epoch': 3.0}


# REVERSE GRAPH PLAN

In [None]:
import re

def reverse_plan(test_str):
    """Reverses the order of steps in a plan string."""
    steps_list = re.findall(r'(Step \d+: .*?)(?= Step \d+:|$)', test_str)
    question_match = re.search(r'Question: (.*)', test_str)
    question = "\nQuestion: " + question_match.group(1) if question_match else "No question found."
    steps_list_reversed = steps_list[::-1]
    return " ".join(steps_list_reversed) + question

base_instructions = """
Given a plan, for each step in the plan generate a node, where each step is a node.
Each node should connect to each other if that node depends on another node.
This could be because an item in the step depends on the state of another item or a change in the state of the item itself.
DO NOT output this graph.

For the following plan and question, return ONLY 0 if the pair in question is non-dependent and 1 if the pair is dependent.
Again, your response should only be either a 0 or 1.
"""

# Load and Edit Data
train_data = pd.read_csv('./drive/MyDrive/rds_training_data.csv')
test_data = pd.read_csv('./drive/MyDrive/rds_test_data.csv')
# prepend the base instructions to each prompt
train_data['text_input'] = base_instructions + train_data['text_input']
test_data['text_input'] = base_instructions + test_data['text_input']
train_data['text_input'] = train_data['text_input'].apply(reverse_plan)

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Tokenizer and Model Setup
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Training Arguments and Trainer
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # The fix: Align save_strategy with evaluation_strategy
    save_strategy="epoch"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the Model
trainer.train()

# Evaluate the Model
metrics = trainer.evaluate()
print(metrics)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6903,0.655592,0.601449,0.597625,0.648997,0.622253
2,0.5805,0.794481,0.584783,0.556155,0.886819,0.6836
3,0.6435,0.833633,0.614493,0.57757,0.885387,0.699095


{'eval_loss': 0.8336326479911804, 'eval_accuracy': 0.6144927536231884, 'eval_precision': 0.577570093457944, 'eval_recall': 0.8853868194842407, 'eval_f1': 0.6990950226244343, 'eval_runtime': 7.8075, 'eval_samples_per_second': 176.754, 'eval_steps_per_second': 22.158, 'epoch': 3.0}
