In [1]:
!pip install -U "transformers==4.40.0" --upgrade
!pip install accelerate bitsandbytes
!pip install datasets

Collecting transformers==4.40.0
  Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.2
    Uninstalling transformers-4.41.2:
      Successfully uninstalled transformers-4.41.2
Successfully installed transformers-4.40.0
Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cac

In [3]:
from google.colab import drive
# Mount Google Drive: every result file below is written under /content/gdrive/MyDrive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
import torch
import time
import datasets
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from dataclasses import dataclass

Parts of this code are adapted from an LLM tutorial and from this post: https://medium.com/@geronimo7/data-labelling-with-llama-3-5906edd3a5c1

In [5]:
@dataclass
class Config:
  """Run-wide settings for the notebook.

  In the cells shown here only `model` and `seed` are read; the remaining
  fields are defined but not used by any visible cell.
  """
  # Batch size (not read by any visible cell).
  batch_size: int
  # Model identifier passed to `from_pretrained` for both model and tokenizer.
  model: str
  # Seed used for the RNGs and for shuffling the training split.
  seed: int
  # Learning rate (not read by any visible cell).
  lr: float
  # Number of epochs (not read by any visible cell).
  epochs: int
  # LoRA rank (not read by any visible cell).
  lora_r: int
  # LoRA alpha (not read by any visible cell).
  lora_alpha: float

# Single configuration instance read as a notebook-level global by the helper
# functions below (they access `config.model` directly).
config = Config(
  batch_size = 64,
  model = "meta-llama/Meta-Llama-3-8B-Instruct",
  seed = 42,
  lr = 3e-5,
  epochs = 3,
  # LoRA settings
  lora_r = 16,
  lora_alpha = 32,  # a common choice is alpha = 2 * rank
)

In [6]:
def set_seed(seed: int) -> None:
  """
  Seed every random number generator the notebook relies on.

  Seeds Python's `random`, NumPy's global RNG and PyTorch (CPU and every
  visible CUDA device), and puts cuDNN into deterministic mode.

  Args:
    seed (int): The seed value to set.

  Returns:
    None
  """

  import os
  import random
  import numpy as np

  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  # manual_seed_all covers every visible GPU rather than only the current one;
  # it is silently ignored when CUDA is unavailable.
  torch.cuda.manual_seed_all(seed)
  # When running on the CuDNN backend, two further options must be set
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  # Only affects Python processes started after this point: the hash seed of
  # the interpreter that is already running was fixed at start-up.
  os.environ["PYTHONHASHSEED"] = str(seed)
  print(f"Random seed set as {seed}")

# Seed all RNGs before any data shuffling or sampling takes place.
set_seed(config.seed)

Random seed set as 42


In [7]:
# The training split is shuffled with the configured seed, so train_data[0:5]
# (used as few-shot demonstrations further down) is the same on every run.
train_data = datasets.load_dataset("art", split="train").shuffle(seed=config.seed)
# The validation split serves as the evaluation set for every experiment below.
test_data = datasets.load_dataset("art", split="validation")

Downloading readme:   0%|          | 0.00/6.94k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/209k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.98M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1532 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/169654 [00:00<?, ? examples/s]

In [8]:
# Validation split cut into consecutive chunks; the self-consistency runs below
# process and save one chunk per cell.
test_0_300_data = datasets.load_dataset('art', split='validation[:300]')
test_300_600_data = datasets.load_dataset('art', split='validation[300:600]')
test_600_900_data = datasets.load_dataset('art', split='validation[600:900]')
test_900_1200_data = datasets.load_dataset('art', split='validation[900:1200]')
# The slice is open-ended, so it runs to the end of the split (the download log
# above reports 1532 validation examples) even though the name says 1530.
test_1200_1530_data = datasets.load_dataset('art', split='validation[1200:]')

In [9]:
# Three-example slice of the validation split; not referenced by any other cell shown in this notebook.
test_test = datasets.load_dataset('art', split='validation[:3]')

In [10]:
# One rationale per few-shot demonstration, paired by index with train_data[0:5].
# train_data is shuffled with config.seed, so changing the seed (or the shuffle)
# would pair these rationales with different examples.
chain_of_thoughts = ["Albert, being a weight loss guru, likely engaged in regular and possibly intense exercise. Dying on the last mile suggests he was pushing himself physically. This fits with hypothesis 1 that he increased his exercise regimen. Stopping exercise doesn’t explain the exertion implied.",
                     "Billy had a cat named Mittens, and cats are known to prefer boxes over beds. Mittens ignoring the bed suggests it was intended for her. Hypothesis 2, that Billy made a special bed for Mittens, fits better than hypothesis 1, which involves Cody, who is not mentioned elsewhere.",
                     "Andrew's dedication and hard work suggest he was very firm and strict in his planning. Becoming less rigid indicates a significant event caused this change. Hypothesis 1 explains that Andrew made his daughter cry by being too firm, leading him to reconsider his rigidity. Hypothesis 2 is not logical and lacks an explanation.",
                     "Amy needed to gather her courage, suggesting she was anxious about the party. If the only person she knew was Ella, it explains her anxiety and need for courage. Both hypotheses are similar, but hypothesis 2 directly states Ella was the only person Amy knew, aligning more clearly with Amy's feelings.",
                     "Jen staying up until 3am suggests she was pressed for time to complete her paper. This indicates she may have procrastinated. Hypothesis 1, that she put off finishing her paper to watch TV, explains why she had to stay up late. Hypothesis 2 implies she prioritized her paper, which doesn’t fit with the need to stay up late."]

In [11]:
def chain_of_thought_prompt_text(examples, chain_of_thoughts):
    """
    Generates a prompt text for chain of thought reasoning.

    Each example is rendered as a question block followed by its rationale and
    the sentence "The answer is hypothesis <label>".

    Args:
        examples : A dataset slice (dict of lists) containing the examples.
            It should have the following keys:
            - "observation_1": A list of strings representing the first observation.
            - "observation_2": A list of strings representing the second observation.
            - "hypothesis_1": A list of strings representing the first hypothesis.
            - "hypothesis_2": A list of strings representing the second hypothesis.
            - "label": A list of integers representing the label for each example.

        chain_of_thoughts (list): A list of strings holding one rationale per
            example, paired by index. Extra trailing rationales are ignored.

    Returns:
        str: The generated prompt text.

    Raises:
        ValueError: If fewer rationales than examples are supplied.

    """

    num_examples = len(examples["observation_1"])
    if len(chain_of_thoughts) < num_examples:
        raise ValueError(
            f"Expected at least {num_examples} chain-of-thought rationales, "
            f"got {len(chain_of_thoughts)}"
        )

    few_shot_prompt = []

    for i in range(num_examples):
        few_shot_prompt.append("\nQuestion: \nGiven these two observations: " + examples["observation_1"][i] + " "
                               + examples["observation_2"][i])
        few_shot_prompt.append("What is the most plausible explanation ?")
        few_shot_prompt.append("Hypothesis 1: " + examples["hypothesis_1"][i])
        few_shot_prompt.append("Hypothesis 2: " + examples["hypothesis_2"][i])
        few_shot_prompt.append("Answer:")
        # Separate the rationale from the verdict with exactly one space; the
        # two sentences were previously glued together ("...implied.The answer").
        few_shot_prompt.append(chain_of_thoughts[i].rstrip() + " The answer is hypothesis " + str(examples["label"][i]))

    return "\n".join(few_shot_prompt).strip()

# Preview the five-shot chain-of-thought prompt built from the first five shuffled training examples.
print(chain_of_thought_prompt_text(train_data[0:5], chain_of_thoughts))

Question: 
Given these two observations: Albert was a weight loss guru. He died of a heart attack on the last mile.
What is the most plausible explanation ?
Hypothesis 1: Albert increased his exercise regimen.
Hypothesis 2: Albert stopped his exercise regimen.
Answer:
Albert, being a weight loss guru, likely engaged in regular and possibly intense exercise. Dying on the last mile suggests he was pushing himself physically. This fits with hypothesis 1 that he increased his exercise regimen. Stopping exercise doesn’t explain the exertion implied.The answer is hypothesis 1

Question: 
Given these two observations: Billy had a cat named Mittens. Mittens ignored the bed and slept in the box.
What is the most plausible explanation ?
Hypothesis 1: Billy made a special bed for Cody to sleep in.
Hypothesis 2: Billy made a special bed for Mittens to sleep in.
Answer:
Billy had a cat named Mittens, and cats are known to prefer boxes over beds. Mittens ignoring the bed suggests it was intended for

In [12]:
def few_shot_prompt_text(examples):
    """
    Build the few-shot demonstration text: one solved question per example.

    Args:
        examples: A dataset slice (dict of equally long lists) with the keys:
            - observation_1: first observation of each example (str).
            - observation_2: second observation of each example (str).
            - hypothesis_1: first candidate hypothesis (str).
            - hypothesis_2: second candidate hypothesis (str).
            - label: index of the correct hypothesis (int).

    Returns:
        str: All demonstrations joined by newlines, with surrounding
        whitespace stripped; consecutive demonstrations are separated by a
        blank line.

    """
    prompt_lines = []
    example_count = len(examples["observation_1"])

    for idx in range(example_count):
        observations = examples["observation_1"][idx] + " " + examples["observation_2"][idx]
        verdict = "The most plausible explanation is hypothesis " + str(examples["label"][idx])
        prompt_lines.extend([
            # The leading newline yields the blank line between demonstrations.
            "\nQuestion: \nGiven these two observations: " + observations,
            "What is the most plausible explanation ?",
            "Hypothesis 1: " + examples["hypothesis_1"][idx],
            "Hypothesis 2: " + examples["hypothesis_2"][idx],
            "Answer:",
            verdict,
        ])

    joined = "\n".join(prompt_lines)
    return joined.strip()

# Preview a three-shot prompt built from the first three shuffled training examples.
print(few_shot_prompt_text(train_data[0:3]))

Question: 
Given these two observations: Albert was a weight loss guru. He died of a heart attack on the last mile.
What is the most plausible explanation ?
Hypothesis 1: Albert increased his exercise regimen.
Hypothesis 2: Albert stopped his exercise regimen.
Answer:
The most plausible explanation is hypothesis 1

Question: 
Given these two observations: Billy had a cat named Mittens. Mittens ignored the bed and slept in the box.
What is the most plausible explanation ?
Hypothesis 1: Billy made a special bed for Cody to sleep in.
Hypothesis 2: Billy made a special bed for Mittens to sleep in.
Answer:
The most plausible explanation is hypothesis 2

Question: 
Given these two observations: Andrew was very dedicated and hardworking. Andrew became less rigid about his planning after that.
What is the most plausible explanation ?
Hypothesis 1: Andrew accidentally made his daughter cry when he was firm.
Hypothesis 2: doesnt make sense.
Answer:
The most plausible explanation is hypothesis 1


In [13]:
def entry_to_prompt_text(entry):
  """
  Render a single example as an unanswered question prompt.

  Args:
    entry (dict): One example with the keys:
      - observation_1 (str): The first observation.
      - observation_2 (str): The second observation.
      - hypothesis_1 (str): The first hypothesis.
      - hypothesis_2 (str): The second hypothesis.

  Returns:
    str: The question block ending in "Answer:", with surrounding
    whitespace stripped.
  """

  observations = entry["observation_1"] + " " + entry["observation_2"]
  lines = (
      "\nQuestion: \nGiven these two observations: " + observations,
      "What is the most plausible explanation ?",
      "Hypothesis 1: " + entry["hypothesis_1"],
      "Hypothesis 2: " + entry["hypothesis_2"],
      "Answer:",
  )
  text = "\n".join(lines)
  return text.strip()

# Preview the unanswered prompt for the first shuffled training example.
print(entry_to_prompt_text(train_data[0]))

Question: 
Given these two observations: Albert was a weight loss guru. He died of a heart attack on the last mile.
What is the most plausible explanation ?
Hypothesis 1: Albert increased his exercise regimen.
Hypothesis 2: Albert stopped his exercise regimen.
Answer:


In [16]:
def load_blank_model(config):
  """
  Load a fresh copy of the pretrained model named in the configuration.

  Any notebook-level `model` / `optimizer` left over from an earlier run is
  dropped first so that its GPU memory can be reclaimed before loading.

  Args:
    config (Config): The configuration object; only `config.model` is read.

  Returns:
    model: The freshly loaded model (also stored in the notebook-level
    global `model`).
  """

  import gc

  # The assignment at the bottom intentionally rebinds the notebook-level name.
  global model

  # Delete references to a previously loaded model / optimizer, if any.
  for stale_name in ("optimizer", "model"):
    globals().pop(stale_name, None)

  # Collect unreachable objects first: empty_cache() can only hand back GPU
  # blocks whose tensors have actually been destroyed.
  gc.collect()
  torch.cuda.empty_cache()

  model = AutoModelForCausalLM.from_pretrained(config.model, torch_dtype=torch.bfloat16, device_map="auto")

  return model

In [17]:
def generate(prompt, model, tokenizer, terminators, max_new_tokens = 256,
             do_sample = True, temperature = 0.6, top_p = 0.9):
  """
  Generate a continuation for a prompt and return only the newly generated text.

  Args:
    prompt (str): The prompt text.
    model: The model to generate text from.
    tokenizer: The tokenizer used to tokenize the prompt.
    terminators (list): A list of token IDs that indicate the end of the generated text.
    max_new_tokens (int): The maximum number of tokens to generate.
    do_sample (bool): Sample from the distribution (True) or decode greedily (False).
    temperature (float): Sampling temperature; only used when `do_sample` is True.
    top_p (float): Nucleus-sampling threshold; only used when `do_sample` is True.

  Returns:
    str: The decoded continuation, without the prompt. Special tokens are
    kept because `decode` is called without skipping them.
  """

  # Place the inputs on the model's own device instead of assuming "cuda".
  prompt_tokenized = tokenizer(prompt, return_tensors="pt").to(model.device)
  prompt_length = prompt_tokenized["input_ids"].shape[-1]

  # Sampling knobs are only forwarded when sampling is enabled.
  sampling_kwargs = {"do_sample": do_sample}
  if do_sample:
    sampling_kwargs["temperature"] = temperature
    sampling_kwargs["top_p"] = top_p

  output_tokenized = model.generate(
      **prompt_tokenized,
      max_new_tokens = max_new_tokens,
      eos_token_id = terminators,
      pad_token_id = tokenizer.eos_token_id,
      **sampling_kwargs)[0]

  # The returned sequence starts with the prompt tokens; keep the remainder.
  return tokenizer.decode(output_tokenized[prompt_length:])

In [18]:
def save_answers(answers, path):
  """
  Write the raw model answers to a JSON file.

  The file holds the model name (read from the notebook-level `config`), the
  fixed dataset tag 'art_validation' and the answers themselves.

  Args:
    answers (list): A list containing the answers.
    path (str): Destination file; it is overwritten if it already exists.

  Returns:
    None
  """

  payload = dict(model=config.model, dataset='art_validation', answers=answers)

  with open(path, 'w') as out_file:
    out_file.write(json.dumps(payload, indent=4))

In [19]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

def classification_analysis(predictions, true_labels, incorrect_indices, unknown_indices, report_path):
  """
  Compute classification metrics, print them and save them to a JSON file.

  Metrics are computed only over examples whose prediction is not None, so
  unparseable ("unknown") answers are excluded from accuracy and F1 rather
  than counted as errors. When no prediction is parseable the metrics are
  stored as null instead of being computed on empty input.

  Args:
    predictions (list): A list of predicted labels (None for unknown).
    true_labels (list): A list of true labels.
    incorrect_indices (list): A list of indices where the predictions are incorrect.
    unknown_indices (list): A list of indices where the predictions are unknown.
    report_path (str): The path to save the classification report.

  Returns:
    None
  """

  # Filter labels and predictions together so the two lists stay aligned.
  scored_pairs = [(label, pred) for label, pred in zip(true_labels, predictions) if pred is not None]
  print(f"Scored {len(scored_pairs)} of {len(predictions)} examples (unknown answers are excluded)")

  if scored_pairs:
    filtered_true_labels = [label for label, _ in scored_pairs]
    filtered_predictions = [pred for _, pred in scored_pairs]

    accuracy = accuracy_score(filtered_true_labels, filtered_predictions)
    # zero_division=0 gives the same scores as the default setting but without
    # an undefined-metric warning for every label that is never predicted.
    f1 = f1_score(filtered_true_labels, filtered_predictions, average='weighted', zero_division=0)
    report = classification_report(filtered_true_labels, filtered_predictions, zero_division=0)
  else:
    accuracy = None
    f1 = None
    report = "No parseable predictions; metrics are undefined."

  print(f"Accuracy: {accuracy}")
  print(f"F1 Score: {f1}")
  print(f"Classification Report:\n{report}")

  results = {
      'model': config.model,
      'dataset': 'art_validation',
      'true_labels': true_labels,
      'predictions': predictions,
      'incorrect_indices': incorrect_indices,
      'unknown_indices': unknown_indices,
      'accuracy': accuracy,
      'f1_score': f1,
      'classification_report': report
  }

  with open(report_path, 'w') as f:
    json.dump(results, f, indent=4)

In [20]:
def save_chunck(predictions, true_labels, incorrect_indices, unknown_indices, answers, chunk, report_path):
  """
  Dump the evaluation results of one data chunk to a JSON file.

  Args:
    predictions (list): A list of predicted labels.
    true_labels (list): A list of true labels.
    incorrect_indices (list): A list of indices where the predictions are incorrect.
    unknown_indices (list): A list of indices where the predictions are unknown.
    answers (list): A list containing the text answers.
    chunk (str): The name of the chunk; stored under the 'dataset' key.
    report_path (str): Destination file; it is overwritten if it already exists.

  Returns:
    None
  """

  payload = dict(
      model=config.model,
      dataset=chunk,
      true_labels=true_labels,
      predictions=predictions,
      incorrect_indices=incorrect_indices,
      unknown_indices=unknown_indices,
      answers=answers,
  )

  with open(report_path, 'w') as out_file:
    out_file.write(json.dumps(payload, indent=4))

In [21]:
import re
def evaluate_zero_cot(test_data):
  """
  Run zero-shot chain-of-thought prompting over the test data.

  Each prompt is the question followed by a "think step by step" trigger. The
  predicted label is the number in the last "hypothesis <n>" mention of the
  answer (case-insensitive); answers without such a mention count as unknown.
  Uses the notebook-level `model`, `tokenizer` and `terminators`.

  Args:
    test_data (Dataset): The test data.

  Returns:
    predictions (list): A list of predicted labels (None for unknown).
    true_labels (list): A list of true labels.
    incorrect_indices (list): A list of indices where the predictions are incorrect.
    unknown_indices (list): A list of indices where the predictions are unknown.
    answers (list): A list containing the text answers.
  """

  label_regex = re.compile(r"hypothesis (\d+)", re.IGNORECASE)
  cot_trigger = " Let’s think step by step and determine which one is the most plausible hypothesis.\n"

  predictions, true_labels, answers = [], [], []
  incorrect_indices, unknown_indices = [], []
  n_correct = 0

  for idx, example in enumerate(test_data):
    n_seen = idx + 1

    raw_answer = generate(entry_to_prompt_text(example) + cot_trigger, model, tokenizer, terminators)
    # Drop anything after a follow-up "Question:" the model may have started;
    # split() returns the whole string when the marker is absent.
    answer = raw_answer.split("Question:")[0]
    answers.append(answer)

    mentions = label_regex.findall(answer)
    true_labels.append(example['label'])

    if not mentions:
      print(answer)
      predictions.append(None)
      unknown_indices.append(idx)
    else:
      guess = int(mentions[-1])
      predictions.append(guess)
      if guess == example['label']:
        n_correct += 1
      else:
        incorrect_indices.append(idx)

    print(" *","Stats: ", f"{round(n_correct/n_seen*100,2)}% correct, ({n_seen} total, {n_correct} correct, {len(unknown_indices)} unknown)")

  return predictions, true_labels, incorrect_indices, unknown_indices, answers

In [22]:
def evaluate(test_data, few_shot_examples):
  """
  Run zero-shot or few-shot prompting over the test data.

  With a non-empty `few_shot_examples` the demonstrations are prepended to
  every question; with an empty value only the question is sent. The
  predicted label is the number in the last "hypothesis <n>" mention of the
  answer (case-insensitive); answers without such a mention count as unknown.
  Uses the notebook-level `model`, `tokenizer` and `terminators`.

  Args:
    test_data (Dataset): The test data.
    few_shot_examples (str): The few-shot examples ("" for zero-shot).

  Returns:
    predictions (list): A list of predicted labels (None for unknown).
    true_labels (list): A list of true labels.
    incorrect_indices (list): A list of indices where the predictions are incorrect.
    unknown_indices (list): A list of indices where the predictions are unknown.
    answers (list): A list containing the text answers.
  """

  label_regex = re.compile(r"hypothesis (\d+)", re.IGNORECASE)

  predictions, true_labels, answers = [], [], []
  incorrect_indices, unknown_indices = [], []
  n_correct = 0

  for idx, example in enumerate(test_data):
    n_seen = idx + 1

    question = entry_to_prompt_text(example) + "\n"
    prompt = few_shot_examples + "\n\n" + question if few_shot_examples else question

    raw_answer = generate(prompt, model, tokenizer, terminators)
    # Drop anything after a follow-up "Question:" the model may have started;
    # split() returns the whole string when the marker is absent.
    answer = raw_answer.split("Question:")[0]
    answers.append(answer)

    mentions = label_regex.findall(answer)
    true_labels.append(example['label'])

    if not mentions:
      print(answer)
      predictions.append(None)
      unknown_indices.append(idx)
    else:
      guess = int(mentions[-1])
      predictions.append(guess)
      if guess == example['label']:
        n_correct += 1
      else:
        incorrect_indices.append(idx)

    print(" *","Stats: ", f"{round(n_correct/n_seen*100,2)}% correct, ({n_seen} total, {n_correct} correct, {len(unknown_indices)} unknown)")

  return predictions, true_labels, incorrect_indices, unknown_indices, answers

In [23]:
def evaluate_self_consistency(test_data, few_shot_examples, k):
  """
  Evaluate with self-consistency: sample k answers per example and take a
  majority vote over the hypotheses they name.

  Only samples from which a "hypothesis <n>" mention could be parsed take part
  in the vote; an example is unknown only when none of its k samples is
  parseable. Ties are broken in favour of the hypothesis that was sampled
  first. Uses the notebook-level `model`, `tokenizer` and `terminators`.

  Args:
    test_data (Dataset): The test data.
    few_shot_examples (str): The few-shot examples ("" for none).
    k (int): The number of samples to generate for each example (at least 1).

  Returns:
    predictions (list): A list of predicted labels (None for unknown).
    true_labels (list): A list of true labels.
    incorrect_indices (list): A list of indices where the predictions are incorrect.
    unknown_indices (list): A list of indices where the predictions are unknown.
    answers (list): For each example, the list of its k text answers.

  Raises:
    ValueError: If k is smaller than 1.
  """

  from collections import Counter

  if k < 1:
    raise ValueError(f"k must be at least 1, got {k}")

  pattern = r"hypothesis (\d+)"

  true_labels = []
  predictions = []
  incorrect_indices = []
  unknown_indices = []
  answers = []
  total, correct, unknown = 0, 0, 0

  for i, example in enumerate(test_data):

    if few_shot_examples:
      prompt = few_shot_examples + "\n\n" + entry_to_prompt_text(example) + "\n"
    else:
      prompt = entry_to_prompt_text(example) + "\n"

    total += 1
    votes = []
    k_answers = []
    for _ in range(k):
      answer = generate(prompt, model, tokenizer, terminators)
      # Drop anything after a follow-up "Question:" the model may have started.
      answer = answer.split("Question:")[0]
      k_answers.append(answer)
      matches = re.findall(pattern, answer, re.IGNORECASE)
      if matches:
        # Unparseable samples simply cast no vote.
        votes.append(int(matches[-1]))

    # Counter keeps insertion order, so ties resolve deterministically to the
    # hypothesis that was sampled first.
    final_pred = Counter(votes).most_common(1)[0][0] if votes else None

    true_label = example['label']
    predictions.append(final_pred)
    true_labels.append(true_label)
    answers.append(k_answers)
    if final_pred is None:
      unknown += 1
      unknown_indices.append(i)
    elif final_pred == true_label:
      correct += 1
    else:
      incorrect_indices.append(i)

    print(" *","Stats: ", f"{round(correct/total*100,2)}% correct, ({total} total, {correct} correct, {unknown} unknown)")

  return predictions, true_labels, incorrect_indices, unknown_indices, answers

In [24]:
# Notebook-level globals read by generate() callers: the evaluation functions
# use `model`, `tokenizer` and `terminators` directly.
model = load_blank_model(config)
# device_map is a model-loading option; a tokenizer has no device placement,
# so the argument is dropped here.
tokenizer = AutoTokenizer.from_pretrained(config.model)
# Generation stops at the tokenizer's EOS token or at the "<|eot_id|>" token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]



config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Zero shot prompting

In [None]:
# Zero-shot: an empty few-shot string makes evaluate() send only the question prompt.
predictions, true_labels, incorrect_indices, unknown_indices, answers = evaluate(test_data, "")
classification_analysis(predictions, true_labels, incorrect_indices, unknown_indices,
                        '/content/gdrive/MyDrive/CS4NLP/llama3_zero_shot_prompt.json')

### 5 shot prompting

In [None]:
# Five-shot: the demonstrations are the first five shuffled training examples.
examples = few_shot_prompt_text(train_data[0:5])
predictions, true_labels, incorrect_indices, unknown_indices, answers = evaluate(test_data, examples)
classification_analysis(predictions, true_labels, incorrect_indices, unknown_indices,
                        '/content/gdrive/MyDrive/CS4NLP/llama3_5shot_prompt.json')

### Zero shot chain of thought prompting

In [None]:
# Zero-shot chain of thought: the "think step by step" trigger is appended inside evaluate_zero_cot().
predictions, true_labels, incorrect_indices, unknown_indices, answers = evaluate_zero_cot(test_data)
classification_analysis(predictions, true_labels, incorrect_indices, unknown_indices,
                        '/content/gdrive/MyDrive/CS4NLP/llama3_zero_shot_CoT_prompt.json')

 * Stats:  0.0% correct, (1 total, 0 correct, 0 unknown)
 * Stats:  0.0% correct, (2 total, 0 correct, 0 unknown)
 * Stats:  33.33% correct, (3 total, 1 correct, 0 unknown)
 * Stats:  50.0% correct, (4 total, 2 correct, 0 unknown)
 * Stats:  40.0% correct, (5 total, 2 correct, 0 unknown)
 * Stats:  50.0% correct, (6 total, 3 correct, 0 unknown)
 * Stats:  57.14% correct, (7 total, 4 correct, 0 unknown)
 * Stats:  50.0% correct, (8 total, 4 correct, 0 unknown)
 * Stats:  55.56% correct, (9 total, 5 correct, 0 unknown)
 * Stats:  60.0% correct, (10 total, 6 correct, 0 unknown)
 * Stats:  63.64% correct, (11 total, 7 correct, 0 unknown)
 * Stats:  66.67% correct, (12 total, 8 correct, 0 unknown)
 * Stats:  69.23% correct, (13 total, 9 correct, 0 unknown)
 * Stats:  64.29% correct, (14 total, 9 correct, 0 unknown)
 * Stats:  60.0% correct, (15 total, 9 correct, 0 unknown)
 * Stats:  62.5% correct, (16 total, 10 correct, 0 unknown)
 * Stats:  58.82% correct, (17 total, 10 correct, 0 unknown

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Saves whatever `answers` currently holds: run this right after the zero-shot
# CoT cell, because every evaluation cell reassigns `answers`.
save_answers(answers, '/content/gdrive/MyDrive/CS4NLP/llama3_zero_shot_CoT_prompt_answers.json')

### 5 shot chain of thought prompting

In [None]:
# Five-shot chain of thought: each of the first five shuffled training examples
# is paired by index with its rationale from `chain_of_thoughts`.
examples = chain_of_thought_prompt_text(train_data[0:5], chain_of_thoughts)
predictions, true_labels, incorrect_indices, unknown_indices, answers = evaluate(test_data, examples)
classification_analysis(predictions, true_labels, incorrect_indices, unknown_indices,
                        '/content/gdrive/MyDrive/CS4NLP/llama3_5_shot_CoT_prompt.json')

 * Stats:  100.0% correct, (1 total, 1 correct, 0 unknown)
 * Stats:  100.0% correct, (2 total, 2 correct, 0 unknown)
 * Stats:  100.0% correct, (3 total, 3 correct, 0 unknown)
 * Stats:  100.0% correct, (4 total, 4 correct, 0 unknown)
 * Stats:  100.0% correct, (5 total, 5 correct, 0 unknown)
 * Stats:  100.0% correct, (6 total, 6 correct, 0 unknown)
 * Stats:  100.0% correct, (7 total, 7 correct, 0 unknown)
 * Stats:  100.0% correct, (8 total, 8 correct, 0 unknown)
 * Stats:  88.89% correct, (9 total, 8 correct, 0 unknown)
 * Stats:  80.0% correct, (10 total, 8 correct, 0 unknown)
 * Stats:  81.82% correct, (11 total, 9 correct, 0 unknown)
 * Stats:  83.33% correct, (12 total, 10 correct, 0 unknown)
 * Stats:  84.62% correct, (13 total, 11 correct, 0 unknown)
 * Stats:  85.71% correct, (14 total, 12 correct, 0 unknown)
 * Stats:  86.67% correct, (15 total, 13 correct, 0 unknown)
 * Stats:  87.5% correct, (16 total, 14 correct, 0 unknown)
 * Stats:  88.24% correct, (17 total, 15 corre

### 5 shot chain of thought prompting with self-consistency

In [25]:
# Few-shot chain-of-thought demonstrations shared by all self-consistency cells below.
examples = chain_of_thought_prompt_text(train_data[0:5], chain_of_thoughts)

In [None]:
# Self-consistency with 5 samples per example on validation[:300]; results are saved per chunk.
predictions, true_labels, incorrect_indices, unknown_indices, answers = evaluate_self_consistency(test_0_300_data, examples, 5)
save_chunck(predictions, true_labels, incorrect_indices, unknown_indices, answers, "art_validation_0_300",
            '/content/gdrive/MyDrive/CS4NLP/llama3_5_shot_CoT_self_cons_0_300.json')

In [None]:
# Self-consistency with 5 samples per example on validation[300:600].
predictions, true_labels, incorrect_indices, unknown_indices, answers = evaluate_self_consistency(test_300_600_data, examples, 5)
save_chunck(predictions, true_labels, incorrect_indices, unknown_indices, answers, "art_validation_300_600",
            '/content/gdrive/MyDrive/CS4NLP/llama3_5_shot_CoT_self_cons_300_600.json')

In [None]:
# Self-consistency with 5 samples per example on validation[600:900].
predictions, true_labels, incorrect_indices, unknown_indices, answers = evaluate_self_consistency(test_600_900_data, examples, 5)
save_chunck(predictions, true_labels, incorrect_indices, unknown_indices, answers, "art_validation_600_900",
            '/content/gdrive/MyDrive/CS4NLP/llama3_5_shot_CoT_self_cons_600_900.json')

In [None]:
# Self-consistency with 5 samples per example on validation[900:1200].
predictions, true_labels, incorrect_indices, unknown_indices, answers = evaluate_self_consistency(test_900_1200_data, examples, 5)
save_chunck(predictions, true_labels, incorrect_indices, unknown_indices, answers, "art_validation_900_1200",
            '/content/gdrive/MyDrive/CS4NLP/llama3_5_shot_CoT_self_cons_900_1200.json')

In [None]:
# Self-consistency with 5 samples per example on validation[1200:]; the chunk
# runs to the end of the split even though its tag says 1530.
predictions, true_labels, incorrect_indices, unknown_indices, answers = evaluate_self_consistency(test_1200_1530_data, examples, 5)
save_chunck(predictions, true_labels, incorrect_indices, unknown_indices, answers, "art_validation_1200_1530",
            '/content/gdrive/MyDrive/CS4NLP/llama3_5_shot_CoT_self_cons_1200_1530.json')

In [None]:
# consolidated_results.json is not written by any cell shown in this notebook —
# presumably the per-chunk files merged by hand; TODO confirm which run (k) it holds.
# NOTE(review): the directory name 'CoT_self_consistency ' ends with a space;
# verify that this matches the folder on Drive.
with open('/content/gdrive/MyDrive/CS4NLP/CoT_self_consistency /consolidated_results.json', 'r') as file:
  data = json.load(file)
  classification_analysis(data['predictions'], data['true_labels'], data['incorrect_indices'], data['unknown_indices'],
                          '/content/gdrive/MyDrive/CS4NLP/CoT_self_consistency /self_cons_report.json')

Accuracy: 0.77088772845953
F1 Score: 0.7707204556181076
Classification Report:
              precision    recall  f1-score   support

           1       0.77      0.79      0.78       781
           2       0.78      0.75      0.76       751

    accuracy                           0.77      1532
   macro avg       0.77      0.77      0.77      1532
weighted avg       0.77      0.77      0.77      1532



In [26]:
# Self-consistency with 10 samples per example on validation[:300]; same chunk
# tag as the 5-sample run, but written to a different file.
predictions, true_labels, incorrect_indices, unknown_indices, answers = evaluate_self_consistency(test_0_300_data, examples, 10)
save_chunck(predictions, true_labels, incorrect_indices, unknown_indices, answers, "art_validation_0_300",
            '/content/gdrive/MyDrive/CS4NLP/llama3_10_self_cons_0_300.json')

 * Stats:  100.0% correct, (1 total, 1 correct, 0 unknown)
 * Stats:  100.0% correct, (2 total, 2 correct, 0 unknown)
 * Stats:  100.0% correct, (3 total, 3 correct, 0 unknown)
 * Stats:  100.0% correct, (4 total, 4 correct, 0 unknown)
 * Stats:  80.0% correct, (5 total, 4 correct, 0 unknown)
 * Stats:  83.33% correct, (6 total, 5 correct, 0 unknown)
 * Stats:  85.71% correct, (7 total, 6 correct, 0 unknown)
 * Stats:  87.5% correct, (8 total, 7 correct, 0 unknown)
 * Stats:  88.89% correct, (9 total, 8 correct, 0 unknown)
 * Stats:  90.0% correct, (10 total, 9 correct, 0 unknown)
 * Stats:  90.91% correct, (11 total, 10 correct, 0 unknown)
 * Stats:  91.67% correct, (12 total, 11 correct, 0 unknown)
 * Stats:  92.31% correct, (13 total, 12 correct, 0 unknown)
 * Stats:  92.86% correct, (14 total, 13 correct, 0 unknown)
 * Stats:  93.33% correct, (15 total, 14 correct, 0 unknown)
 * Stats:  93.75% correct, (16 total, 15 correct, 0 unknown)
 * Stats:  94.12% correct, (17 total, 16 corre

In [None]:
# Self-consistency with 10 samples per example on validation[300:600].
predictions, true_labels, incorrect_indices, unknown_indices, answers = evaluate_self_consistency(test_300_600_data, examples, 10)
save_chunck(predictions, true_labels, incorrect_indices, unknown_indices, answers, "art_validation_300_600",
            '/content/gdrive/MyDrive/CS4NLP/llama3_10_self_cons_300_600.json')

 * Stats:  100.0% correct, (1 total, 1 correct, 0 unknown)
 * Stats:  100.0% correct, (2 total, 2 correct, 0 unknown)
 * Stats:  100.0% correct, (3 total, 3 correct, 0 unknown)
 * Stats:  100.0% correct, (4 total, 4 correct, 0 unknown)
 * Stats:  80.0% correct, (5 total, 4 correct, 0 unknown)
 * Stats:  83.33% correct, (6 total, 5 correct, 0 unknown)
 * Stats:  71.43% correct, (7 total, 5 correct, 0 unknown)
 * Stats:  75.0% correct, (8 total, 6 correct, 0 unknown)
 * Stats:  77.78% correct, (9 total, 7 correct, 0 unknown)
 * Stats:  80.0% correct, (10 total, 8 correct, 0 unknown)
 * Stats:  81.82% correct, (11 total, 9 correct, 0 unknown)
 * Stats:  83.33% correct, (12 total, 10 correct, 0 unknown)
 * Stats:  84.62% correct, (13 total, 11 correct, 0 unknown)
 * Stats:  85.71% correct, (14 total, 12 correct, 0 unknown)
 * Stats:  80.0% correct, (15 total, 12 correct, 0 unknown)
 * Stats:  81.25% correct, (16 total, 13 correct, 0 unknown)
 * Stats:  82.35% correct, (17 total, 14 correct

In [None]:
# Self-consistency with 10 samples per example on validation[600:900].
predictions, true_labels, incorrect_indices, unknown_indices, answers = evaluate_self_consistency(test_600_900_data, examples, 10)
save_chunck(predictions, true_labels, incorrect_indices, unknown_indices, answers, "art_validation_600_900",
            '/content/gdrive/MyDrive/CS4NLP/llama3_10_self_cons_600_900.json')

In [None]:
# Self-consistency with 10 samples per example on validation[900:1200].
predictions, true_labels, incorrect_indices, unknown_indices, answers = evaluate_self_consistency(test_900_1200_data, examples, 10)
save_chunck(predictions, true_labels, incorrect_indices, unknown_indices, answers, "art_validation_900_1200",
            '/content/gdrive/MyDrive/CS4NLP/llama3_10_self_cons_900_1200.json')

In [None]:
# Self-consistency with 10 samples per example on validation[1200:]; the chunk
# runs to the end of the split even though its tag says 1530.
predictions, true_labels, incorrect_indices, unknown_indices, answers = evaluate_self_consistency(test_1200_1530_data, examples, 10)
save_chunck(predictions, true_labels, incorrect_indices, unknown_indices, answers, "art_validation_1200_1530",
            '/content/gdrive/MyDrive/CS4NLP/llama3_10_self_cons_1200_1530.json')