In [1]:
import os , sys
# set following environment variables
os.environ['HF_HOME'] = "/mnt/Data1/akann1warw1ck/AlanTuring/.cache"
os.environ['TRANSFORMERS_CACHE'] = "/mnt/Data1/akann1warw1ck/.cache/transformers"
# os.environ['CUDA_VISIBLE_DEVICES'] = "0"
sys.path.append('/home/akann1warw1ck/AlanTuring')
from prompt_engineering.langchain.utils import load_llm
from prompt_engineering.langchain.utils import PredictionGenerator
import warnings
import pandas as pd
from transformers import AutoTokenizer
# Filter out the warning message
warnings.filterwarnings('ignore', category=UserWarning, module='transformers')

#Testing loading large model on 2 GPUS
from transformers import BitsAndBytesConfig
import torch
from langchain import HuggingFacePipeline

  from .autonotebook import tqdm as notebook_tqdm


## practicing loading on multiple GPUS

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from accelerate import infer_auto_device_map

# model_id = 'upstage/llama-30b-instruct-2048'
model_id = 'stabilityai/StableBeluga2'
# model_id = 'stabilityai/StableBeluga-7B'
quant_config = BitsAndBytesConfig(
    load_in_8bit=False,
    llm_int8_has_fp16_weights=False,
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4" ,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16 if True else None
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# device_map = infer_auto_device_map(my_model, max_memory={0: "10GiB", 1: "10GiB", "cpu": "30GiB"})

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="balanced",
    max_memory = {0:'22GiB', 1:'22GiB'},
    quantization_config=quant_config,
    trust_remote_code=True,
    # rope_scaling={"type": "dynamic", "factor": 2} # allows handling of longer inputs
)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 29/29 [26:31<00:00, 54.87s/it]


In [3]:
prompt = "### User:\nThomas is healthy. What could be the reasons?\n\n### Assistant:\n"
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs,  max_new_tokens=3)
output

tensor([[    1,   835,  4911, 29901,    13,  1349, 18902,   338,  9045, 29891,
         29889,  1724,  1033,   367,   278,  9590, 29973,    13,    13,  2277,
         29937,  4007, 22137, 29901,    13,  1670,  1033,   367]])

## Break

In [4]:
# llm_name = 'TheBloke/wizard-vicuna-13B-HF'
llm_name = 'TheBloke/Wizard-Vicuna-13B-Uncensored-HF'
llm = load_llm(llm_name, False, 'local', 0)
tokenizer = AutoTokenizer.from_pretrained(llm_name, use_fast=True)

Loading checkpoint shards: 100%|██████████| 3/3 [00:17<00:00,  5.87s/it]
Device has 2 GPUs available. Provide device={deviceId} to `from_model_id` to use availableGPUs for execution. deviceId is -1 (default) for CPU and can be a positive integer associated with CUDA device id.


# Experimenting with Stable Predictions for categorical prompt_style


In [6]:
prompt_style = 'categorise'
ensemble_size = 1
edge_value = 'distribution'
parse_style = 'categories_perplexity'
relationship='budgetitem_to_indicator'
effect_type = 'directly'

prediction_generator = PredictionGenerator(None,
                                            llm_name,
                                            prompt_style,
                                            ensemble_size,
                                            edge_value,
                                            parse_style,
                                            relationship=relationship,
                                            local_or_remote='local',
                                            effect_type=effect_type)

map_category_answer_b2i = { '1':'Local government spending on "{budget_item}" does {effect_type} affect "{indicator}"', '2':'Local government spending on "{budget_item}" does not {effect_type} affect "{indicator}"'}
map_category_label_b2i = { '1':'Yes', '2':'No'}


In [5]:
#Prompts to test

li_b2i = [
    ('Children 5-19 public health programmes', 'Low birth weight of term babies','Yes'),
    ('Children 5-19 public health programmes', 'Pupil absence','Yes'),

    ('Education services', '5 or more A*-C grades at GCSE (inc english and maths)','Yes'),
    ('Education services', 'Progression by 2 levels in maths between KS1 and KS2','Yes'),

    ('Environmental and regulatory services', 'Municipal waste landfilled','Yes'),
    ('Environmental and regulatory services', 'Waste collected per head','Yes'),

    ('Children 5-19 public health programmes', 'Municipal waste landfilled','No'),
    ('Children 5-19 public health programmes', 'Killed and seriously injured (KSI) casualties on England\'s roads (historic data)','No'),

    ('Education services', 'Municipal waste landfilled','No'),
    ('Education services', 'Killed and seriously injured (KSI) casualties on England\'s roads (historic data)','No'),

    ('Environmental and regulatory services', '5 or more A*-C grades at GCSE (inc english and maths)','No'),
    ('Environmental and regulatory services', 'Progression by 2 levels in maths between KS1 and KS2','No'),
]

li_budget_item = [ _tuple[0] for _tuple in li_b2i ]
li_indicator = [ _tuple[1] for _tuple in li_b2i ]
li_answer = [ _tuple[2] for _tuple in li_b2i ]


li_test_prompts = [
    
    {'normal': f'Categories:\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}\nWrite the category number that best answers whether local government spending on "{{budget_item}}" {{effect_type}} affects "{{indicator}}"?',
    'reversed': f'Categories:\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}\nWrite the category number that best answers whether local government spending on "{{budget_item}}" {{effect_type}} affects "{{indicator}}"?'
    },

    {'normal': f'Categories:\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}\nWrite the number of the category that best answers whether local government spending on "{{budget_item}}" {{effect_type}} affects "{{indicator}}"?',
    'reversed': f'Categories:\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}\nWrite the number of the category that best answers whether local government spending on "{{budget_item}}" {{effect_type}} affects "{{indicator}}"?'
    },

    {'normal': f'Categories:\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}\nWhat is the number of the category that best answers whether local government spending on "{{budget_item}}" {{effect_type}} affects "{{indicator}}"?',
    'reversed': f'Categories:\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}\nWhat is the number of the category that best answers whether local government spending on "{{budget_item}}" {{effect_type}} affects "{{indicator}}"?'
    },

    {'normal': f'Categories:\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}\nWhat is the number of the category that answers the following question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?',
    'reversed': f'Categories:\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}\nWhat is the number of the category that answers the following question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?'
    },

    {'normal': f'Which of the following categories best answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Which of the following categories best answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },

    {'normal': f'Which of the following categories best answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nCategories:\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Which of the following categories best answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nCategories:\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["2"]}'
    },

    {'normal': f'Which of the following categories best answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nCategory 1) {map_category_answer_b2i["1"]}\nCategory 2) {map_category_answer_b2i["2"]}',
    'reversed': f'Which of the following categories best answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nCategory 1) {map_category_answer_b2i["2"]}\nCategory 2) {map_category_answer_b2i["1"]}'
    },

    {'normal': f'Category 1) {map_category_answer_b2i["1"]}\nCategory 2) {map_category_answer_b2i["2"]}.\n\nQuestion: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"\n\nChoose a category to answer the question.',
    'reversed': f'Category 1) {map_category_answer_b2i["2"]}\nCategory 2) {map_category_answer_b2i["1"]}.\n\nQuestion: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"\n\nChoose a category to answer the question.'
    },

    {'normal': f'Select the category that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Select the category that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },

    {'normal': f'Write the number of the category that fits the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Write the number of the category that fits the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
    },

    {'normal': f'Write the number of the category that fits the question. Local government spending on "{{budget_item}}" {{effect_type}} affects "{{indicator}}"?\n\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Write the number of the category that fits the question. Local government spending on "{{budget_item}}" {{effect_type}} affects "{{indicator}}"?\n\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
    },

    {'normal': f'Write the number of the option that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Write the number of the option that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
    },

    {'normal': f'Write the number of the option that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Write the number of the option that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
    },

    {'normal': f'Write the number of the option that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n\nOptions\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Write the number of the option that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n\nOptions\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
    },
    
    {'normal': f'Select the category number that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Select the category number that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },

    {'normal': f'Choose the category that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Choose the category that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },

    {'normal': f'Select the answer for the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Select the answer for the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },

    {'normal': f'Select the answer for the question.\nQuestion: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nAnswers:\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Select the answer for the question.\nQuestion: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nAnswers:\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },

    {'normal': f'Select the answer number that answers question.\nQuestion: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nAnswers:\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Select the answer number that answers question.\nQuestion: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nAnswers:\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },

    {'normal': f'Select the answer number that answers question.\nQuestion: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Select the answer number that answers question.\nQuestion: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },

    {'normal': f'Select the answer number that answers question.\nQuestion: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nAnswer 1) {map_category_answer_b2i["1"]}\n Answer 2) {map_category_answer_b2i["2"]}',
    'reversed': f'Select the answer number that answers question.\nQuestion: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nAnswer 1) {map_category_answer_b2i["2"]}\n Answer 2) {map_category_answer_b2i["1"]}'
     },

    {'normal': f'Q: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nAnswers:\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}\nSelect the correct answer for the question.',
     'reversed': f'Q: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nAnswers:\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}\nSelect the correct answer for the question.'
     },

    {'normal': f'Select the correct answer to the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Select the correct answer to the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },

    {'normal': f'Which answer is the correct answer to the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Which answer is the correct answer to the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },

    {'normal': f'Select the correct answer to the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n\nAnswers:\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Select the correct answer to the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n\nAnswers:\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },

    {'normal': f'Which answer is the correct answer to the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n\nAnswers:\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Which answer is the correct answer to the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n\nAnswers:\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },


    {'normal': f'Select the answer that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Select the answer that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },

    {'normal': f'Select the answer that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nAnswers:\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Select the answer that answers the question. Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nAnswers:\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },

    {'normal': f'Is Answer 1 or Answer 2 the correct answer to the following Question: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nAnswers:\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
    'reversed': f'Is Answer 1 or Answer 2 the correct answer to the following Question: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nAnswers:\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'
     },

     {'normal': f'Is Answer 1 or Answer 2 the correct answer to the following Question: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nAnswer 1) {map_category_answer_b2i["1"]}\nAnswer 2) {map_category_answer_b2i["2"]}',
    'reversed': f'Is Answer 1 or Answer 2 the correct answer to the following Question: Does local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?\nAnswer 1) {map_category_answer_b2i["2"]}\nAnswer 2) {map_category_answer_b2i["1"]}'
     },

     {'normal': f'Answer 1) {map_category_answer_b2i["1"]}\nAnswer 2) {map_category_answer_b2i["2"]}\n\nDoes local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?',
    'reversed': f'Answer 1) {map_category_answer_b2i["2"]}\nAnswer 2) {map_category_answer_b2i["1"]}\n\nDoes local government spending on "{{budget_item}}" {{effect_type}} affect "{{indicator}}"?'
    },
    
    {'normal':f'Statement 1) {map_category_answer_b2i["1"]}\nStatement 2) {map_category_answer_b2i["2"]}\nWhich statement is True?',
        'reversed':f'Statement 1) {map_category_answer_b2i["2"]}\nStatement 2) {map_category_answer_b2i["1"]}\nWhich statement is True?'
    },

    {'normal':f'Statement 1) {map_category_answer_b2i["1"]}\nStatement 2) {map_category_answer_b2i["2"]}\nI want you to select the True statement.',
        'reversed':f'Statement 1) {map_category_answer_b2i["2"]}\nStatement 2) {map_category_answer_b2i["1"]}\nI want you to select the True statement.'
    },

    {'normal':f'Statement 1) {map_category_answer_b2i["1"]}\nStatement 2) {map_category_answer_b2i["2"]}\nIs Statement 1 or Statement 2 True?',
        'reversed':f'Statement 1) {map_category_answer_b2i["2"]}\nStatement 2) {map_category_answer_b2i["1"]}\nIs Statement 1 or Statement 2 True?'
    },

    {'normal':f'Statement 1) {map_category_answer_b2i["1"]}\nStatement 2) {map_category_answer_b2i["2"]}\nWrite the number of the correct statement.',
        'reversed':f'Statement 1) {map_category_answer_b2i["2"]}\nStatement 2) {map_category_answer_b2i["1"]}\nWrite the number of the correct statement.'
    }
    ]

#### In the box below, we evaluate the output of all prompts on all budget item to indicator
#### We effectively want to test that the normal and reverse version of the prompt give the same answer and that the outputs are robust to the order the options are presented in

In [7]:
from test_methods_helper import run_command
debug = False

li_format_dict = [ {'budget_item':b2i[0], 'indicator':b2i[1], 'effect_type':effect_type } for b2i in li_b2i ]
li_related = [ b2i[2] for b2i in li_b2i ] 

if debug:
    li_li_filledtemplate = [ [ test_prompt['normal'].format(**format_dict) ] for test_prompt in li_test_prompts[:2] for format_dict in li_format_dict ] 
    li_li_filledtemplate_reverse = [ [ test_prompt['reversed'].format(**format_dict) ] for test_prompt in li_test_prompts[:2] for format_dict in li_format_dict] 
    li_related_ = li_related*2 

else:
    li_li_filledtemplate = [ [ test_prompt['normal'].format(**format_dict) ] for test_prompt in li_test_prompts for format_dict in li_format_dict ] 
    li_li_filledtemplate_reverse = [ [ test_prompt['reversed'].format(**format_dict) ] for test_prompt in li_test_prompts for format_dict in li_format_dict ]
    li_related_ = li_related*len(li_test_prompts) 


import multiprocessing as mp
mp.set_start_method('spawn', force=True)

# Create a queue to hold the results
manager = mp.Manager()
queue = mp.Queue()

# Create two processes to run the commands
p1 = mp.Process(target=run_command, args=(queue, prediction_generator, li_li_filledtemplate, llm_name, 0, 'normal' ))
p2 = mp.Process(target=run_command, args=(queue, prediction_generator, li_li_filledtemplate_reverse, llm_name, 1, 'reverse' ))

# Start the processes
p1.start()
p2.start()

# Wait for the processes to finish
p1.join()
p2.join()

# Get the results from the queue in the order they were put in
results = []
while not queue.empty():
    results.append(queue.get())

# Extract the results from the sorted list
li_preds = [ d['li_preds'] for d in results if d['name'] == 'normal'  ][0]
li_preds_reverse = [ d['li_preds'] for d in results if d['name'] == 'reverse'][0]

# Evaluating the results
## Goal is two produce two sets of rankings
    ## One ranking is getting models which place the highest probability on the correct answer
    ## One ranking is getting models which produce similar results for the normal and reversed prompts

def get_prob_correctness(pred:dict, correct_answer:str) -> float:
    if correct_answer == 'Yes':
        return pred['Yes']
    elif correct_answer == 'No':
        return pred['No']
    else:
        return 0.0

def get_normal_reverse_diff( pred:str, pred_reverse:str ):
    yes_diff = pred['Yes'] - pred_reverse['Yes']
    return yes_diff

li_prob_correct_normal = [ get_prob_correctness(pred, correct_answer) for pred, correct_answer in zip(li_preds, li_related_) ]
li_prob_correct_reverse = [ get_prob_correctness(pred, correct_answer) for pred, correct_answer in zip(li_preds_reverse, li_related_) ]
li_diffs = [ get_normal_reverse_diff(pred, pred_reverse) for pred, pred_reverse in zip(li_preds, li_preds_reverse) ]

# Now we need to group the results by the prompt used and aggregate the reesults
stride = len(li_format_dict)
grouped_li_prob_correct_normal = [ li_prob_correct_normal[i:i+stride] for i in range(0, len(li_prob_correct_normal), stride ) ]
grouped_li_prob_correct_reverse = [ li_prob_correct_reverse[i:i+stride] for i in range(0, len(li_prob_correct_reverse), stride ) ]
grouped_li_diffs = [ li_diffs[i:i+stride] for i in range(0, len(li_diffs), stride ) ]

avg_li_prob_correct_normal = [ sum(li)/len(li) for li in grouped_li_prob_correct_normal ]
avg_li_prob_correct_reverse = [ sum(li)/len(li) for li in grouped_li_prob_correct_reverse ]
avg_li_diffs = [ sum(li)/len(li) for li in grouped_li_diffs ]

idx_top10_prob_correct_normal = sorted(range(len(avg_li_prob_correct_normal)), key=lambda i: avg_li_prob_correct_normal[i],reverse=True )[:10]
idx_top10_prob_correct_reverse = sorted(range(len(avg_li_prob_correct_reverse)), key=lambda i: avg_li_prob_correct_reverse[i], reverse=True )[:10]
idx_top10_diffs = sorted(range(len(avg_li_diffs)), key=lambda i: avg_li_diffs[i])[-10:]

# Print a dataframe of top 10 normal with the prob correct answer as the first column
df_top10_prob_correct_normal = pd.DataFrame( [ (idx, avg_li_prob_correct_normal[idx]) for idx in idx_top10_prob_correct_normal ], columns=['index', 'prob_correct'] )
print("\nTop 10 Normal")
print(df_top10_prob_correct_normal)

# Print a dataframe of top 10 reverse with the prob correct answer as the first column
df_top10_prob_correct_reverse = pd.DataFrame( [ (idx, avg_li_prob_correct_reverse[idx]) for idx in idx_top10_prob_correct_reverse ], columns=['index', 'prob_correct'] )
print("\nTop 10 Reverse")
print(df_top10_prob_correct_reverse)

# Print a dataframe of top 10 diffs with the diff as the first column
df_top10_diffs = pd.DataFrame( [ (idx, avg_li_diffs[idx]) for idx in idx_top10_diffs ], columns=['index', 'diff'] )
print("\nTop 10 Diffs")
print(df_top10_diffs)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/akann1warw1ck/miniconda3/envs/alanturing/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /home/akann1warw1ck/miniconda3/envs/alanturing/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/akann1warw1ck/miniconda3/envs/alanturing/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/akann1warw1ck/miniconda3/envs/alanturing/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /home/akann1warw1ck/miniconda3/envs/alanturing/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/akann1warw1ck/miniconda3/envs/alanturing/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)
Using pad_token, but it is not set yet.
Using pad_token, but it is not set yet.



Top 10 Normal
   index  prob_correct
0     10      0.566875
1      9      0.558117
2     24      0.547783
3     18      0.546550
4     11      0.534042
5     14      0.532275
6     17      0.532217
7     27      0.528917
8     13      0.528633
9     20      0.526792

Top 10 Reverse
   index  prob_correct
0     20      0.524958
1      0      0.502000
2      1      0.501083
3      5      0.499933
4     30      0.498208
5      3      0.497892
6     34      0.495817
7      4      0.491992
8     18      0.491817
9      2      0.491250

Top 10 Diffs
   index      diff
0      4  0.384308
1     16  0.395683
2     26  0.400392
3      2  0.471092
4     15  0.477458
5     13  0.492908
6      8  0.501433
7     14  0.596367
8      9  0.610233
9     10  0.666617


Results above show all prompts essentially fail since the lowest diff is 0.75 which shows thmat the model is not robust to the order of the prompt

#### Evaluating the continuation sentences output by the model for a given prompt
- through this we can evaluate which prompts are appropriate e.g. the next token output should be 1 number of the category

In [97]:
import langchain
langchain.llm_cache.clear()
from prompt_engineering.utils_prompteng import (map_llmname_input_format, map_relationship_system_prompt)

if 'llm' not in globals():
    llm = load_llm(llm_name, False, 'local')

def show_continuation( llm, prompt_idx=-1, method='normal', include_system_message=True, print_generations=True ):
    
    if method == 'normal':
        li_filledtemplate = li_li_filledtemplate[prompt_idx]
    elif method == 'reverse':
        li_filledtemplate = li_li_filledtemplate_reverse[prompt_idx]
    else:
        raise ValueError('method must be normal or reverse')
        
    # Get the relationship and effect type
    if include_system_message:
        sm = (map_relationship_system_prompt[relationship][effect_type] + ' ' + map_relationship_system_prompt[relationship][prompt_style] ).replace('  ',' ').strip(' ')
    else:
        sm = None

    li_prompt_adapted_to_lm = [ map_llmname_input_format(llm_name,
                                    user_message = '\n'+prompt, 
                                    system_message = sm )
                                for prompt in li_filledtemplate ] #Added some base model formatting

    llm.pipeline._forward_params  = {
        # 'num_beams':3,
        'num_return_sequences':1,
        'early_stopping':True,
        'max_new_tokens': 20,
    }

    outp = llm.predict(li_prompt_adapted_to_lm[0]+' ')

    if print_generations:
        print('\t\t========='+method.upper()+'=========')
        print(li_prompt_adapted_to_lm[0])
        print(outp)
    return outp


In this cell we are searching for the prompts that
- have a continuation that starts with a number for both cases of normal and reverse
- predict a different number for the normal and reverse

In [98]:
li_good_prompts_idx = []
for idx in range(len(li_test_prompts)):
    # print(f"\n\nIDX={idx}")

    idx_ = idx*len(li_format_dict) # For each prompt we only compare the first b2i couple

    pred_str = show_continuation(llm, prompt_idx=idx_, method='normal', print_generations=False)
    pred_str_rev = show_continuation(llm, prompt_idx=idx_, method='reverse', print_generations=False)

    # Perform a checks
    bool_good_prompt = False
    
    if len(pred_str[0])>0 and len(pred_str_rev[0])>0 and pred_str[0].isdigit() and pred_str_rev[0].isdigit() and pred_str[0] != pred_str_rev[0]:
        bool_good_prompt = True
    
    if bool_good_prompt:
        li_good_prompts_idx.append(idx)

print(li_good_prompts_idx)

[0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 22, 23, 26, 28, 34]


In [6]:
li_good_prompts_idx = [0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 22, 23, 26, 28, 34]


In [7]:
len(li_good_prompts_idx)

22

#### Evaluation the performance of the models that satisfy the constraint that the foraward and reverse aggree
- Now li_good_prompts_idx contains the indexes of the prompts that we want to use for performance comparison
- We use the earlier experiment from before to re-evaluate the performance of the model on these prompts

In [8]:
from test_methods_helper import run_command
debug = False

li_format_dict = [ {'budget_item':b2i[0], 'indicator':b2i[1], 'effect_type':effect_type } for b2i in li_b2i ]
li_related = [ b2i[2] for b2i in li_b2i ] 

if debug:
    li_good_prompts_idx_ = li_good_prompts_idx[:2]
    li_li_filledtemplate = [ [ li_test_prompts[idx]['normal'].format(**format_dict) ] for idx in li_good_prompts_idx_ for format_dict in li_format_dict ] 
    li_li_filledtemplate_reverse = [ [ li_test_prompts[idx]['reversed'].format(**format_dict) ] for idx in li_good_prompts_idx_  for format_dict in li_format_dict] 
    li_related_ = li_related*2 

else:
    li_li_filledtemplate = [ [ li_test_prompts[idx]['normal'].format(**format_dict) ] for idx in li_good_prompts_idx for format_dict in li_format_dict ] 
    li_li_filledtemplate_reverse = [ [ li_test_prompts[idx]['reversed'].format(**format_dict) ] for idx in li_good_prompts_idx for format_dict in li_format_dict ]
    li_related_ = li_related*len(li_good_prompts_idx) 


import multiprocessing as mp
mp.set_start_method('spawn', force=True)

# Create a queue to hold the results
manager = mp.Manager()
queue = mp.Queue()

# Create two processes to run the commands
p1 = mp.Process(target=run_command, args=(queue, prediction_generator, li_li_filledtemplate, llm_name, 0, 'normal' ))
p2 = mp.Process(target=run_command, args=(queue, prediction_generator, li_li_filledtemplate_reverse, llm_name, 1, 'reverse' ))

# Start the processes
p1.start()
p2.start()

# Wait for the processes to finish
p1.join()
p2.join()

# Get the results from the queue in the order they were put in
# put sentinel values in queue
for _ in range(2):
    queue.put(None)

results = []
while True:
    data = queue.get()
    if data is None:  # we met sentinel value, break the loop
        break
    results.append(data)


# Extract the results from the sorted list
li_preds = [ d['li_preds'] for d in results if d['name'] == 'normal'  ][0]
li_preds_reverse = [ d['li_preds'] for d in results if d['name'] == 'reverse'][0]

# Evaluating the results
## Goal is two produce two sets of rankings
    ## One ranking is getting models which place the highest probability on the correct answer
    ## One ranking is getting models which produce similar results for the normal and reversed prompts

def get_prob_correctness(pred:dict, correct_answer:str) -> float:
    if correct_answer == 'Yes':
        return pred['Yes']
    elif correct_answer == 'No':
        return pred['No']
    else:
        return 0.0

def get_normal_reverse_diff( pred:str, pred_reverse:str ):
    yes_diff = pred['Yes'] - pred_reverse['Yes']
    return yes_diff

li_prob_correct_normal = [ get_prob_correctness(pred, correct_answer) for pred, correct_answer in zip(li_preds, li_related_) ]
li_prob_correct_reverse = [ get_prob_correctness(pred, correct_answer) for pred, correct_answer in zip(li_preds_reverse, li_related_) ]
li_diffs = [ get_normal_reverse_diff(pred, pred_reverse) for pred, pred_reverse in zip(li_preds, li_preds_reverse) ]

# Now we need to group the results by the prompt used and aggregate the reesults
stride = len(li_format_dict)
grouped_li_prob_correct_normal = [ li_prob_correct_normal[i:i+stride] for i in range(0, len(li_prob_correct_normal), stride ) ]
grouped_li_prob_correct_reverse = [ li_prob_correct_reverse[i:i+stride] for i in range(0, len(li_prob_correct_reverse), stride ) ]
grouped_li_diffs = [ li_diffs[i:i+stride] for i in range(0, len(li_diffs), stride ) ]

avg_li_prob_correct_normal = [ sum(li)/len(li) for li in grouped_li_prob_correct_normal ]
avg_li_prob_correct_reverse = [ sum(li)/len(li) for li in grouped_li_prob_correct_reverse ]
avg_li_diffs = [ sum(li)/len(li) for li in grouped_li_diffs ]

idx_top10_prob_correct_normal = sorted(range(len(avg_li_prob_correct_normal)), key=lambda i: avg_li_prob_correct_normal[i],reverse=True )[:10]
idx_top10_prob_correct_reverse = sorted(range(len(avg_li_prob_correct_reverse)), key=lambda i: avg_li_prob_correct_reverse[i], reverse=True )[:10]
idx_top10_diffs = sorted(range(len(avg_li_diffs)), key=lambda i: avg_li_diffs[i])[-10:]

# Print a dataframe of top 10 normal with the prob correct answer as the first column
df_top10_prob_correct_normal = pd.DataFrame( [ (li_good_prompts_idx[idx], avg_li_prob_correct_normal[idx]) for idx in idx_top10_prob_correct_normal ], columns=['index', 'prob_correct'] )
print("\nTop 10 Normal")
print(df_top10_prob_correct_normal)

# Print a dataframe of top 10 reverse with the prob correct answer as the first column
df_top10_prob_correct_reverse = pd.DataFrame( [ (li_good_prompts_idx[idx], avg_li_prob_correct_reverse[idx]) for idx in idx_top10_prob_correct_reverse ], columns=['index', 'prob_correct'] )
print("\nTop 10 Reverse")
print(df_top10_prob_correct_reverse)

# Print a dataframe of top 10 diffs with the diff as the first column
df_top10_diffs = pd.DataFrame( [ (li_good_prompts_idx[idx], avg_li_diffs[idx]) for idx in idx_top10_diffs ], columns=['index', 'diff'] )
print("\nTop 10 Diffs")
print(df_top10_diffs)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/akann1warw1ck/miniconda3/envs/alanturing/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /home/akann1warw1ck/miniconda3/envs/alanturing/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/akann1warw1ck/miniconda3/envs/alanturing/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...

Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/akann1warw1ck/miniconda3/envs/alanturing/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA r

Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)



Top 10 Normal
   index  prob_correct
0     34      0.599358
1      2      0.574908
2      0      0.563133
3      1      0.548350
4      3      0.537758
5     23      0.522792
6     28      0.518667
7      9      0.511667
8     14      0.509817
9     10      0.509233

Top 10 Reverse
   index  prob_correct
0      3      0.495708
1     13      0.494092
2     12      0.493250
3      2      0.490942
4     10      0.487108
5     11      0.481725
6      1      0.474842
7     34      0.465167
8      0      0.440967
9     14      0.440533

Top 10 Diffs
   index      diff
0      2  0.822283
1     16  0.822900
2      1  0.863475
3      9  0.872758
4      3  0.891800
5     14  0.927733
6     10  0.973975
7     11  0.976033
8     12  0.985600
9     13  0.991917


#### Evaluatating Results 
- The diff results imply that none of the prompts are resistant to the reversal of the prompt.
- However, our continuation results show that the prompts do corrrectly predict 1 or 2 next. Therefore we can assume that the likelihood predictor model has an error somewhere

ERRORS TO CHECK
- When I attach the number to the end, is the number the last token or is a final last token appended
- The calculations used when calculating the perplexity or log-likehood  

In [58]:
# Testing the first point
print(tokenizer.encode('ASSISTANT: 1) Local government spending on "Environmental and regulatory services" does directly affect "Progression by 2 levels in maths between KS1 and KS2" is the correct statement.'))
print( tokenizer.encode("ASSISTANT:") )
print( tokenizer.encode("ASSISTANT: ") )
print( tokenizer.encode("ASSISTANT: 1") )
print( tokenizer.encode("ASSISTANT: 1)") )
# Indeed we see the first point is not the problem

[1, 319, 1799, 9047, 13566, 29901]
[1, 319, 1799, 9047, 13566, 29901, 29871]
[1, 319, 1799, 9047, 13566, 29901, 29871, 29896]
[1, 319, 1799, 9047, 13566, 29901, 29871, 29896, 29897]


- To investigate if the likelihood of final token is being calculated properly we will break down the function

GOING THROUGH THE FUNCTION WE IDENTIFY THE FOLLOWING POSSIBLE CAUSES FOR FAILURE
- padding tokens on wrong side
- add_start_token having un-wanted effects
- the effect of not specifying max_length e.g. does it cut text during tokenization for long texts
- args to tokenizer() e.g. add_special_tokens= False, padding=True, remember setting padding=left has to be done during model initiation


- step by step through the for loop for calculating the perplexity

CHANGES MADE
- ensure padding is always on the left hand side w/ tokenizer.padding_size = 'left'. The default is right hand side
- Change add_special_tokens=False to add_special_tokens=True
- change Truncation to False permanently
- change truncation to left side with tokenizer.truncation_side = 'left'

In [116]:
from prompt_engineering.utils_prompteng import map_llmname_input_format, map_relationship_system_prompt
data_unfmtd = [ li_li_filledtemplate[0][0],
    li_li_filledtemplate_reverse[0][0]]

data_fmtd =  [
                map_llmname_input_format(llm_name,
                                        user_message = prompt, 
                                        system_message = (map_relationship_system_prompt[relationship][effect_type] + ' ' + map_relationship_system_prompt[relationship][prompt_style] ) )
                                    for prompt in data_unfmtd ] 
data_fmtd_w_ans = [ s + ' ' +str(idx+1) for idx, s in enumerate(data_fmtd) ]

In [135]:
data_fmtd_w_ans

['USER: You are a socio-economic researcher tasked with answering a question about whether government spending on a "government budget item" directly affects a "socio-economic/health indicator". In the question the government budget item and socio-economic/health indicator will be presented within quotation marks.  Categories:\n1) Local government spending on "Children 5-19 public health programmes" does directly affect "Low birth weight of term babies"\n2) Local government spending on "Children 5-19 public health programmes" does not directly affect "Low birth weight of term babies"\nWrite the category number that best answers whether local government spending on "Children 5-19 public health programmes" directly affects "Low birth weight of term babies"?\nASSISTANT: 1',
 'USER: You are a socio-economic researcher tasked with answering a question about whether government spending on a "government budget item" directly affects a "socio-economic/health indicator". In the question the gov

In [136]:
# llm = load_llm(llm_name, local_or_remote='local')
# data = [
#     "The number that comes after 1 is 2",
#     "The number after 1 is 3"
# ]
data = data_fmtd_w_ans

model = llm.pipeline.model
tokenizer = llm.pipeline.tokenizer
batch_size = 2
add_start_token = True
max_length = None
category_token_len = 1

In [137]:
# def joint_probabilities_for_category(
#     data, model, tokenizer, batch_size: int = 16, add_start_token: bool = True, max_length=None, category_token_len=1):


# """For a given prompt taking the style of "Answer with the letter of the Category which best answers my question", This function returns the joint probabilities for the category tokens in each posible answer,
#     NOTE: by design the category responses must all be the same length, ideally 1 token length.

#     NOTE: However the function is currently written to work on sequencs longer than 1 token, but this is not recommended.
# """
import torch
from transformers import PreTrainedModel, PreTrainedTokenizerBase
from torch.nn.functional import log_softmax

assert isinstance(model, PreTrainedModel)
assert isinstance(tokenizer, PreTrainedTokenizerBase)
# assert category_token_len == 1, "Currently only supports category tokens of length 1"

model = model
tokenizer = tokenizer

if tokenizer.pad_token is None and batch_size > 1:
    existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
    assert (
        len(existing_special_tokens) > 0
    ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
    tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})

#NOTE: remove this section
# if add_start_token and max_length:
#     assert (
#         tokenizer.bos_token is not None
#     ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
#     max_tokenized_len = max_length - 1
# else:
#     max_tokenized_len = max_length

tokenizer.padding_side = 'left'
tokenizer.truncation_side = 'left'

encodings = tokenizer(
    data,
    add_special_tokens=True,
    padding=True,
    truncation= False,
    max_length=None,
    return_tensors="pt",
    return_attention_mask=True,
    
).to(model.device)


In [138]:
print(tokenizer.decode(encodings['input_ids'][0]))
print(tokenizer.decode(encodings['input_ids'][1]))

<s> USER: You are a socio-economic researcher tasked with answering a question about whether government spending on a "government budget item" directly affects a "socio-economic/health indicator". In the question the government budget item and socio-economic/health indicator will be presented within quotation marks.  Categories:
1) Local government spending on "Children 5-19 public health programmes" does directly affect "Low birth weight of term babies"
2) Local government spending on "Children 5-19 public health programmes" does not directly affect "Low birth weight of term babies"
Write the category number that best answers whether local government spending on "Children 5-19 public health programmes" directly affects "Low birth weight of term babies"?
ASSISTANT: 1
<s> USER: You are a socio-economic researcher tasked with answering a question about whether government spending on a "government budget item" directly affects a "socio-economic/health indicator". In the question the gover

In [139]:
encoded_texts = encodings["input_ids"]
attn_masks = encodings["attention_mask"]

# Now we always have a start token
# if add_start_token:
#     assert torch.all(torch.ge(attn_masks.sum(1), 1)), "Each input text must be at least one token long."
# else:
#     assert torch.all(
#         torch.ge(attn_masks.sum(1), 2)
#     ), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."

joint_probs = []

#DEBUG # for start_index in range(0, len(encoded_texts), batch_size):
start_index = 0
# end_index = 1

end_index = min(start_index + batch_size, len(encoded_texts))
encoded_texts_batch = encoded_texts[start_index:end_index]
attn_masks_batch = attn_masks[start_index:end_index]

# if add_start_token:
#     bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_texts_batch.size(dim=0)).to(model.device)
#     encoded_texts_batch = torch.cat([bos_tokens_tensor, encoded_texts_batch], dim=1)
#     attn_masks_batch = torch.cat(
#         [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(model.device), attn_masks_batch], dim=1
#     )

labels = encoded_texts_batch

with torch.no_grad():
    out_logits = model(encoded_texts_batch, attention_mask=attn_masks_batch).logits

shift_logits = out_logits[..., :-1, :]
shift_labels = labels[..., 1:]
shift_attention_mask_batch = attn_masks_batch[..., 1:]

shift_logits = shift_logits[..., -category_token_len:, :]
shift_labels = shift_labels[..., -category_token_len:]
shift_attention_mask_batch = shift_attention_mask_batch[..., -category_token_len:]

shift_logits = shift_logits.contiguous()
shift_labels = shift_labels.contiguous()
shift_attention_mask_batch = shift_attention_mask_batch.contiguous()



In [140]:

# Calculate log probabilities from logits
log_probs  = log_softmax(shift_logits, dim=-1)


In [141]:
# Use gather to select the log probabilities for the actual tokens
gathered_log_probs = log_probs.gather(-1, shift_labels.unsqueeze(-1)).squeeze(-1)

In [142]:
# log_probs.max(dim=-1)
gathered_log_probs
# shift_attention_mask_batch
# 

tensor([[-0.0418],
        [-0.0599]], device='cuda:0', dtype=torch.float16)

In [143]:
gathered_log_probs = gathered_log_probs * shift_attention_mask_batch

# Sum the log probabilities for the actual tokens to get the joint log probability
joint_log_prob_batch = gathered_log_probs.sum(dim=-1)

joint_prob_batch = torch.exp(joint_log_prob_batch)

# joint_probs += joint_prob_batch.tolist()

# return joint_probs

In [144]:
joint_prob_batch

tensor([0.9590, 0.9419], device='cuda:0', dtype=torch.float16)

# Experimenting with ways to get stable Probabilities for cot categorical classification

In [15]:
prompt_style = 'cot_categorise'
ensemble_size = 1
edge_value = 'distribution'
parse_style = 'categories_perplexity'
relationship='budgetitem_to_indicator'
effect_type = 'directly'

prediction_generator = PredictionGenerator(llm,
                                            llm_name,
                                            prompt_style,
                                            ensemble_size,
                                            edge_value,
                                            parse_style,
                                            relationship=relationship,
                                            local_or_remote='local',
                                            effect_type=effect_type)

#### Setting up prompts

In [91]:
#Prompts to test

li_b2i = [
    ('Children 5-19 public health programmes', 'Low birth weight of term babies','Yes'),
    ('Children 5-19 public health programmes', 'Pupil absence','Yes'),

    ('Education services', '5 or more A*-C grades at GCSE (inc english and maths)','Yes'),
    ('Education services', 'Progression by 2 levels in maths between KS1 and KS2','Yes'),

    ('Environmental and regulatory services', 'Municipal waste landfilled','Yes'),
    ('Environmental and regulatory services', 'Waste collected per head','Yes'),

    ('Children 5-19 public health programmes', 'Municipal waste landfilled','No'),
    ('Children 5-19 public health programmes', 'Killed and seriously injured (KSI) casualties on England\'s roads (historic data)','No'),

    ('Education services', 'Municipal waste landfilled','No'),
    ('Education services', 'Killed and seriously injured (KSI) casualties on England\'s roads (historic data)','No'),

    ('Environmental and regulatory services', '5 or more A*-C grades at GCSE (inc english and maths)','No'),
    ('Environmental and regulatory services', 'Progression by 2 levels in maths between KS1 and KS2','No'),
]

li_budget_item = [ _tuple[0] for _tuple in li_b2i ]
li_indicator = [ _tuple[1] for _tuple in li_b2i ]
li_answer = [ _tuple[2] for _tuple in li_b2i ]

map_category_answer_b2i = { '1':'Local government spending on "{budget_item}" does {effect_type} affect "{indicator}"', '2':'Local government spending on "{budget_item}" does not {effect_type} affect "{indicator}"'}
map_category_label_b2i = { '1':'Yes', '2':'No'}

li_test_prompts = [
    
    # {'normal': f'The Statement below expresses an opinion on whether government spending on "{{budget_item}}" {{effect_type}} affects "{{indicator}}". Classify the statement into one of the following categories by writing the selected category\'s number.\nCategory 1) {map_category_answer_b2i["1"]}\nCategory 2) {map_category_answer_b2i["2"]}.\nStatement: {"{statement}"}',
    # 'reversed':f'The Statement below expresses an opinion on whether government spending on "{{budget_item}}" {{effect_type}} affects "{{indicator}}". Classify the statement into one of the following categories by writing the selected category\'s number:\nCategory 1) {map_category_answer_b2i["2"]}\nCategory 2) {map_category_answer_b2i["1"]}.\nStatement: {"{statement}"}'},


    # {'normal': f'The Statement below expresses an opinion on whether government spending on "{{budget_item}}" {{effect_type}} affects "{{indicator}}". Classify the statement into one of the following categories by writing the selected category\'s number.\nCategories:\n-1) {map_category_answer_b2i["1"]}\n-2) {map_category_answer_b2i["2"]}.\nStatement: {"{statement}"}',
    # 'reversed':f'The Statement below expresses an opinion on whether government spending on "{{budget_item}}" {{effect_type}} affects "{{indicator}}". Classify the statement into one of the following categories by writing the selected category\'s number:\nCategories:\n-1) {map_category_answer_b2i["2"]}\n-2) {map_category_answer_b2i["1"]}.\nStatement: {"{statement}"}'},

    
    # {'normal': f'Classify the statement into one of the following categories by only writing the selected category\'s number.\nCategory 1) {map_category_answer_b2i["1"]}\nCategory 2) {map_category_answer_b2i["2"]}.\nStatement: {"{statement}"}',
    # 'reversed':f'Classify the statement into one of the following categories by only writing the selected category\'s number:\nCategory 1) {map_category_answer_b2i["2"]}\nCategory 2) {map_category_answer_b2i["1"]}.\nStatement: {"{statement}"}'},

    # {'normal': f'Statement: {"{statement}"}\n\n Please classify the statement above into one of the following categories by only writing the selected category\'s number.\nCategory 1) {map_category_answer_b2i["1"]}\nCategory 2)      {map_category_answer_b2i["2"]}.',
    # 'reversed':f'Statement: {"{statement}"}\n\n Please classify the statement above into one of the following categories by only writing the selected category\'s number:\nCategory 1) {map_category_answer_b2i["2"]}\nCategory 2) {map_category_answer_b2i["1"]}.'},


    # {'normal': f'Statement: {"{statement}"}\n\n Please categorise the statement above by only writing the selected category\'s number.\nCategory 1) {map_category_answer_b2i["1"]}\nCategory 2) {map_category_answer_b2i["2"]}.',
    # 'reversed':f'Statement: {"{statement}"}\n\n Please categorise the statement above by only writing the selected category\'s number:\nCategory 1) {map_category_answer_b2i["2"]}\nCategory 2) {map_category_answer_b2i["1"]}.'},

    # {'normal': f'Statement: {"{statement}"}\n\nCategory 1) {map_category_answer_b2i["1"]}\t2) {map_category_answer_b2i["2"]}\n\n Please categorise the statement into one of the categories',
    #  'reversed':f'Statement: {"{statement}"}\n\nCategory 1) {map_category_answer_b2i["2"]}\t2) {map_category_answer_b2i["1"]}\n\n Please categorise the statement into one of the categories'},
     
    # {'normal': f'Statement: {"{statement}"}\n\nCategory 1) {map_category_answer_b2i["1"]}\t2) {map_category_answer_b2i["2"]}\n\nWhich category fits the statement?',
    #  'reversed':f'Statement: {"{statement}"}\n\nCategory 1) {map_category_answer_b2i["2"]}\t2) {map_category_answer_b2i["1"]}\n\nWhich category fits the statement?'},

    # {'normal': f'Statement: {"{statement}"}\n\nCategories:\n1) {map_category_answer_b2i["1"]}\t2) {map_category_answer_b2i["2"]}\n\nWrite the number of the category that fits the statement',
    #  'reversed':f'Statement: {"{statement}"}\n\nCategories:\n1) {map_category_answer_b2i["2"]}\t2) {map_category_answer_b2i["1"]}\n\nWrite the number of the category that fits the statement'},

    # {'normal': f'Statement: {"{statement}"}\n\nCategories:\n1) {map_category_answer_b2i["1"]}\t2) {map_category_answer_b2i["2"]}\n\nWrite the number of the category that fits the statement.',
    #  'reversed':f'Statement: {"{statement}"}\n\nCategories:\n1) {map_category_answer_b2i["2"]}\t2) {map_category_answer_b2i["1"]}\n\nWrite the number of the category that fits the statement.'},

    {'normal': f'Write the number of the category that fits the following statement.\nStatement: {"{statement}"}\nCategories:\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
     'reversed':f'Write the number of the category that fits the following statement.\nStatement: {"{statement}"}\nCategories:\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'},

    {'normal': f'Write only the number of the category that fits the following statement.\nStatement: {"{statement}"}\nCategories:\n1) {map_category_answer_b2i["1"]}\n2) {map_category_answer_b2i["2"]}',
     'reversed':f'Write only the number of the category that fits the following statement.\nStatement: {"{statement}"}\nCategories:\n1) {map_category_answer_b2i["2"]}\n2) {map_category_answer_b2i["1"]}'},


    # {'normal':f'Write "1" if the following statement implies {map_category_answer_b2i["1"]} or write "2" if the following statement implies {map_category_answer_b2i["2"]}.\nStatement: {"{statement}"}',
    # 'reversed':f'Write "1" if the following statement implies {map_category_answer_b2i["2"]} or write "2" if the following statement implies {map_category_answer_b2i["1"]}.\nStatement: {"{statement}"}'},
    
    # {'normal':f'Write "1" if the following statement implies {map_category_answer_b2i["1"]} is True or write "2" if the following statement implies it is False.\nStatement: {"{statement}"}',
    #  'reversed':f'Write "1" if the following statement implies {map_category_answer_b2i["2"]} is True or write "2" if the following statement implies it is False.\nStatement: {"{statement}"}'
    #  }

    ]

In [86]:
# Getting Intemediary statemetns which explain the language model's reasoning
from prompt_engineering.utils_prompteng import map_llmname_input_format, map_relationship_system_prompt

sm = 'You are a socio-economic researcher tasked with answering a question about whether government spending on a "government budget item" affects a "socio-economic/health indicator". In the question the government budget item and socio-economic/health indicator will be presented within quotation marks.'
um_1= 'Using your expert knowledge, please provide a thorough, detailed and conclusive four sentence answer to the following question.'

um_2 = 'To what extent, if any, does local government spending on \"{budget_item}\" {effect_type} affect \"{indicator}\"?'


li_statements_prompts = [ map_llmname_input_format(llm_name, 
                                                   user_message= um_1 + ' ' + um_2.format(budget_item=budget_item, indicator=indicator, effect_type=effect_type),
                                                   system_message= sm  
                                                     ) for budget_item, indicator in zip(li_budget_item[:2], li_indicator[:2] ) ]

llm.pipeline._forward_params['max_new_tokens'] = 200
llm.pipeline._forward_params['early_stopping'] = True



outputs = llm.generate( li_statements_prompts )
li_statements = [ li_chatgen[0].text for li_chatgen in outputs.generations ]

In [7]:
from test_methods_helper import run_command
debug = False

# Get the statements explaining the language model's reasoning
li_format_dict = [ {'budget_item':b2i[0], 'indicator':b2i[1], 'effect_type':effect_type, 'statement':statement } for b2i,statement in zip(li_b2i,li_statements) ]
li_related = [ b2i[2] for b2i in li_b2i ] 

if debug:
    li_li_filledtemplate = [ [ test_prompt['normal'].format(**format_dict) ] for test_prompt in li_test_prompts[:2] for format_dict in li_format_dict ] 
    li_li_filledtemplate_reverse = [ [ test_prompt['reversed'].format(**format_dict) ] for test_prompt in li_test_prompts[:2] for format_dict in li_format_dict] 
    li_related_ = li_related*2 

else:
    li_li_filledtemplate = [ [ test_prompt['normal'].format(**format_dict) ] for test_prompt in li_test_prompts for format_dict in li_format_dict ] 
    li_li_filledtemplate_reverse = [ [ test_prompt['reversed'].format(**format_dict) ] for test_prompt in li_test_prompts for format_dict in li_format_dict ]
    li_related_ = li_related*len(li_test_prompts) 


import multiprocessing as mp
mp.set_start_method('spawn', force=True)

# Create a queue to hold the results
manager = mp.Manager()
queue = mp.Queue()

# Create two processes to run the commands
p1 = mp.Process(target=run_command, args=(queue, prediction_generator, li_li_filledtemplate, llm_name, 0, 'normal' ))
p2 = mp.Process(target=run_command, args=(queue, prediction_generator, li_li_filledtemplate_reverse, llm_name, 1, 'reverse' ))

# Start the processes
p1.start()
p2.start()

# Wait for the processes to finish
p1.join()
p2.join()

# Get the results from the queue in the order they were put in
results = []
while not queue.empty():
    results.append(queue.get())

# Extract the results from the sorted list
li_preds = [ d['li_preds'] for d in results if d['name'] == 'normal'  ][0]
li_preds_reverse = [ d['li_preds'] for d in results if d['name'] == 'reverse'][0]

# Evaluating the results
## Goal is two produce two sets of rankings
    ## One ranking is getting models which place the highest probability on the correct answer
    ## One ranking is getting models which produce similar results for the normal and reversed prompts

def get_prob_correctness(pred:dict, correct_answer:str) -> float:
    if correct_answer == 'Yes':
        return pred['Yes']
    elif correct_answer == 'No':
        return pred['No']
    else:
        return 0.0

def get_normal_reverse_diff( pred:str, pred_reverse:str ):
    yes_diff = pred['Yes'] - pred_reverse['Yes']
    return yes_diff

li_prob_correct_normal = [ get_prob_correctness(pred, correct_answer) for pred, correct_answer in zip(li_preds, li_related_) ]
li_prob_correct_reverse = [ get_prob_correctness(pred, correct_answer) for pred, correct_answer in zip(li_preds_reverse, li_related_) ]
li_diffs = [ get_normal_reverse_diff(pred, pred_reverse) for pred, pred_reverse in zip(li_preds, li_preds_reverse) ]

# Now we need to group the results by the prompt used and aggregate the reesults
stride = len(li_format_dict)
grouped_li_prob_correct_normal = [ li_prob_correct_normal[i:i+stride] for i in range(0, len(li_prob_correct_normal), stride ) ]
grouped_li_prob_correct_reverse = [ li_prob_correct_reverse[i:i+stride] for i in range(0, len(li_prob_correct_reverse), stride ) ]
grouped_li_diffs = [ li_diffs[i:i+stride] for i in range(0, len(li_diffs), stride ) ]

avg_li_prob_correct_normal = [ sum(li)/len(li) for li in grouped_li_prob_correct_normal ]
avg_li_prob_correct_reverse = [ sum(li)/len(li) for li in grouped_li_prob_correct_reverse ]
avg_li_diffs = [ sum(li)/len(li) for li in grouped_li_diffs ]

idx_top10_prob_correct_normal = sorted(range(len(avg_li_prob_correct_normal)), key=lambda i: avg_li_prob_correct_normal[i],reverse=True )[:10]
idx_top10_prob_correct_reverse = sorted(range(len(avg_li_prob_correct_reverse)), key=lambda i: avg_li_prob_correct_reverse[i], reverse=True )[:10]
idx_top10_diffs = sorted(range(len(avg_li_diffs)), key=lambda i: avg_li_diffs[i])[-10:]

# Print a dataframe of top 10 normal with the prob correct answer as the first column
df_top10_prob_correct_normal = pd.DataFrame( [ (idx, avg_li_prob_correct_normal[idx]) for idx in idx_top10_prob_correct_normal ], columns=['index', 'prob_correct'] )
print("\nTop 10 Normal")
print(df_top10_prob_correct_normal)

# Print a dataframe of top 10 reverse with the prob correct answer as the first column
df_top10_prob_correct_reverse = pd.DataFrame( [ (idx, avg_li_prob_correct_reverse[idx]) for idx in idx_top10_prob_correct_reverse ], columns=['index', 'prob_correct'] )
print("\nTop 10 Reverse")
print(df_top10_prob_correct_reverse)

# Print a dataframe of top 10 diffs with the diff as the first column
df_top10_diffs = pd.DataFrame( [ (idx, avg_li_diffs[idx]) for idx in idx_top10_diffs ], columns=['index', 'diff'] )
print("\nTop 10 Diffs")
print(df_top10_diffs)

['Statement: It is difficult to determine a direct correlation between local government spending on "Mental Health" and "Satisfaction with Management of Roadworks" without further context. Mental health services may address the needs of individuals affected by roadworks, but it is not clear if this would directly impact satisfaction with the management of roadworks. Additionally, other factors such as the quality of the roadworks themselves, the timeliness of completion, and the effectiveness of communication and compensation measures may also play a role in determining satisfaction.\n\nCategories:\n1) Spending on "Mental Health" does directly affect "Satisfaction with Management of Roadworks"\n2) Spending on "Mental Health" does not directly affect "Satisfaction with Management of Roadworks"\n\nWrite the number of the category that fits the statement']




'2) Spending on "Mental Health" does not directly affect "Satisfaction with Management of Roadworks".'

### Investigating the continuation

In [92]:
import langchain
langchain.llm_cache.clear()
from prompt_engineering.utils_prompteng import (map_llmname_input_format, map_relationship_system_prompt)

if 'llm' not in globals():
    llm = load_llm(llm_name, False, 'local')

debug = False

# Get the statements explaining the language model's reasoning
li_format_dict = [ {'budget_item':b2i[0], 'indicator':b2i[1], 'effect_type':effect_type, 'statement':statement } for b2i,statement in zip(li_b2i,li_statements) ]
li_related = [ b2i[2] for b2i in li_b2i ] 

if debug:
    li_li_filledtemplate = [ [ test_prompt['normal'].format(**format_dict) ] for test_prompt in li_test_prompts[:2] for format_dict in li_format_dict ] 
    li_li_filledtemplate_reverse = [ [ test_prompt['reversed'].format(**format_dict) ] for test_prompt in li_test_prompts[:2] for format_dict in li_format_dict] 
    li_related_ = li_related*2 

else:
    li_li_filledtemplate = [ [ test_prompt['normal'].format(**format_dict) ] for test_prompt in li_test_prompts for format_dict in li_format_dict ] 
    li_li_filledtemplate_reverse = [ [ test_prompt['reversed'].format(**format_dict) ] for test_prompt in li_test_prompts for format_dict in li_format_dict ]
    li_related_ = li_related*len(li_test_prompts) 



def show_continuation( llm, prompt_idx=-1, method='normal', include_system_message=True, print_generations=True ):
    
    if method == 'normal':
        li_filledtemplate = li_li_filledtemplate[prompt_idx]
    elif method == 'reverse':
        li_filledtemplate = li_li_filledtemplate_reverse[prompt_idx]
    else:
        raise ValueError('method must be normal or reverse')
        
    # Get the relationship and effect type
    if include_system_message:
        sm = (map_relationship_system_prompt[relationship][effect_type] + ' ' + map_relationship_system_prompt[relationship][prompt_style] ).replace('  ',' ').strip(' ')
    else:
        sm = None

    li_prompt_adapted_to_lm = [ map_llmname_input_format(llm_name,
                                    user_message = prompt, 
                                    system_message = sm )
                                for prompt in li_filledtemplate ] #Added some base model formatting

    llm.pipeline._forward_params  = {
        # 'num_beams':3,
        'num_return_sequences':1,
        'early_stopping':True,
        'max_new_tokens': 2,
    }

    outp = llm.predict(li_prompt_adapted_to_lm[0]+' ')

    if print_generations:
        print('\t\t========='+method.upper()+'=========')
        print(li_prompt_adapted_to_lm[0])
        print(outp)
    return outp



In [93]:
idx_bool_good_prompts = []
for idx in range(len(li_li_filledtemplate)):
    pred = show_continuation(llm, idx, 'normal', include_system_message=False, print_generations=False)
    pred_reverse = show_continuation(llm, idx, 'reverse', include_system_message=False, print_generations=False)

    if pred[:1] != pred_reverse[:1] and pred[:1].isdigit() and pred_reverse[:1].isdigit():
        idx_bool_good_prompts.append(idx)
print(idx_bool_good_prompts)

[1, 2, 3]


In [94]:
print( len(li_format_dict) )
print( len(li_li_filledtemplate) )

2
4


In [95]:
for idx in idx_bool_good_prompts:
    print('++++'+str(idx)+'++++')
    show_continuation(llm, idx, 'normal', include_system_message=False, print_generations=True)
    show_continuation(llm, idx, 'reverse', include_system_message=False, print_generations=True)
    print( '\n\n')

++++1++++
USER: Write the number of the category that fits the following statement.
Statement: 1. According to research, there is a positive correlation between local government spending on "Children 5-19 public health programmes" and "Pupil absence". 
2. This suggests that investing in health and wellbeing programmes for children and young people can have a direct impact on reducing absenteeism in schools. 
3. However, other factors such as family income, parental employment, and school quality also play a role in pupil absence rates. 
4. Therefore, while there is a clear link between government spending on children's health programmes and pupil absence, further research is needed to fully understand the complex factors at play.
Categories:
1) Local government spending on "Children 5-19 public health programmes" does directly affect "Pupil absence"
2) Local government spending on "Children 5-19 public health programmes" does not directly affect "Pupil absence"
ASSISTANT: 
1.
USER: Wri