In [1]:
!pip install transformers
!pip install torch
!pip install datasets
!pip install transformers accelerate bitsandbytes
!pip install nltk
!pip install tqdm

import json
import os
import random
import re
import statistics
from collections import defaultdict, Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import transformers
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import login
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, BertTokenizer, BertModel, AutoTokenizer, AutoModel



In [None]:
# Hugging Face access token, taken from the HF_TOKEN environment variable so that
# no secret is stored in the notebook. When the variable is unset HF_L is None and
# login() falls back to prompting for a token.
HF_L = os.environ.get("HF_TOKEN")
login(token=HF_L)

# Model family to load; the cells below dispatch on this string.
model = 'mistral'

# Exactly one branch runs, so only the selected family's `<family>_model` and
# `<family>_tokenizer` globals exist afterwards.
if model == 'llama':
    llama_model_name = "meta-llama/Llama-2-7b-chat-hf"
    # `token=` replaces the deprecated `use_auth_token=` keyword.
    llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_name, token=HF_L)
    llama_model = AutoModelForCausalLM.from_pretrained(
        llama_model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        token=HF_L
    )
elif model == 'falcon':
    falcon_model_name = "tiiuae/falcon-7b-instruct"
    falcon_tokenizer = AutoTokenizer.from_pretrained(falcon_model_name)
    falcon_model = AutoModelForCausalLM.from_pretrained(
        falcon_model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
elif model == 'gemma':
    gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it")
    # The checkpoint is pulled from the repository revision named "float16".
    gemma_model = AutoModelForCausalLM.from_pretrained(
        "google/gemma-7b-it",
        device_map="auto",
        revision="float16")
elif model == 'mistral':
    # NOTE(review): `device` is not read by the loading code in this cell; it is
    # kept in case another cell relies on it.
    device = "cuda"
    # No dtype or device_map is given here; get_mistral_embeddings moves the
    # model to the GPU when it is first called.
    mistral_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
    mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
def _mean_pooled_embedding(lm, tokenizer, prompt, seed=42):
    """Embed ``prompt`` as the attention-masked mean of the last hidden layer.

    Shared implementation behind the four ``get_*_embeddings`` functions.

    Args:
        lm: Causal language model. Its forward call must accept
            ``output_hidden_states`` and it must expose a ``device``
            attribute (as Hugging Face models do).
        tokenizer: Callable that, given ``return_tensors="pt"``, returns a
            mapping of tensors containing at least ``attention_mask``.
        prompt: Text to embed.
        seed: Value handed to ``torch.manual_seed`` before the forward pass.

    Returns:
        A 1-D float32 NumPy array: the pooled vector of the first sequence
        in the batch.
    """
    torch.manual_seed(seed)

    # Send the inputs to wherever the model lives instead of assuming "cuda".
    encoded = tokenizer(prompt, return_tensors="pt")
    inputs = {name: tensor.to(lm.device) for name, tensor in encoded.items()}

    with torch.no_grad():
        # Request hidden states for this call only, leaving lm.config untouched.
        outputs = lm(**inputs, output_hidden_states=True)

    hidden = outputs.hidden_states[-1]
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.device)

    # Zero out masked positions, then average over the sequence dimension.
    summed = (hidden * mask).sum(dim=1)
    # Clamp so an all-masked sequence yields a zero vector instead of NaN/inf.
    token_count = mask.sum(dim=1).clamp(min=1)
    pooled = summed / token_count
    return pooled.float().detach().cpu().numpy()[0]


def get_llama_embeddings(prompt, seed=42):
    """Return the mean-pooled last-layer embedding of ``prompt`` from ``llama_model``.

    Uses the module-level ``llama_model`` / ``llama_tokenizer``.

    Args:
        prompt: Text to embed.
        seed: Seed passed to ``torch.manual_seed``.

    Returns:
        1-D float32 NumPy array.
    """
    return _mean_pooled_embedding(llama_model, llama_tokenizer, prompt, seed)


def get_falcon_embeddings(prompt, seed=42):
    """Return the mean-pooled last-layer embedding of ``prompt`` from ``falcon_model``.

    Uses the module-level ``falcon_model`` / ``falcon_tokenizer``.

    Args:
        prompt: Text to embed.
        seed: Seed passed to ``torch.manual_seed``.

    Returns:
        1-D float32 NumPy array.

    Raises:
        ValueError: If ``falcon_model`` has no ``config`` attribute.
    """
    if not hasattr(falcon_model, "config"):
        raise ValueError("falcon_model is not properly initialized. Please load the model correctly.")
    return _mean_pooled_embedding(falcon_model, falcon_tokenizer, prompt, seed)


def get_gemma_embeddings(prompt, seed=42):
    """Return the mean-pooled last-layer embedding of ``prompt`` from ``gemma_model``.

    Uses the module-level ``gemma_model`` / ``gemma_tokenizer``. The model is
    left on the device(s) it was loaded onto, and the attention mask is passed
    to the forward call along with the input ids.

    Args:
        prompt: Text to embed.
        seed: Seed passed to ``torch.manual_seed``.

    Returns:
        1-D float32 NumPy array.
    """
    return _mean_pooled_embedding(gemma_model, gemma_tokenizer, prompt, seed)


def get_mistral_embeddings(prompt, seed=42):
    """Return the mean-pooled last-layer embedding of ``prompt`` from ``mistral_model``.

    Uses the module-level ``mistral_model`` / ``mistral_tokenizer``. The model
    is moved to the GPU when one is available (otherwise it stays on the CPU)
    and the moved model is stored back in the ``mistral_model`` global.

    Args:
        prompt: Text to embed.
        seed: Seed passed to ``torch.manual_seed``.

    Returns:
        1-D float32 NumPy array.
    """
    global mistral_model
    target = "cuda" if torch.cuda.is_available() else "cpu"
    mistral_model = mistral_model.to(target)
    return _mean_pooled_embedding(mistral_model, mistral_tokenizer, prompt, seed)

In [4]:
# Prompt template asking for an answer to a question given a snippet.
# Positional placeholders: {0} = question, {1} = snippet.
# NOTE(review): presumably filled with str.format; no cell shown here uses it.
ANSWER_GENERATION_PROMPT = '''Generate an answer to the below question based on the provided snippet.

question: "{0}"
snippet: "{1}"
'''

# Prompt template asking for the question that leads from a snippet to an answer.
# Positional placeholders: {0} = snippet, {1} = answer.
QUESTION_GENERATION_PROMPT = '''Your task is to generate a clear and concise question based on the provided snippet and answer. Ensure that the generated question directly corresponds to the snippet's content and leads to the given answer.

Here is the input:
Snippet: "{0}"
Answer: "{1}"

Generate the most appropriate question:'''

# Variant of the question-generation template that also tells the model to
# output only the question. Positional placeholders: {0} = snippet, {1} = answer.
# NOTE(review): the name suggests it is meant for the Gemma model - confirm.
QUESTION_GENERATION_PROMPT_GEMMA = '''Generate the question which was asked regarding the below snippet and provided answer. Ensure that the generated question directly corresponds to the snippet's content and leads to the given answer.

snippet: "{0}"
answer: "{1}"

The output should contain only the question (don't output the answer to the question).
'''

In [5]:
def get_case_alignment(case_embs, case_base):
    """Average a case's similarity to every case in a case base.

    For each stored case, the cosine similarity between the two problem
    embeddings and between the two solution embeddings is computed and the
    two values are averaged; the result is the mean of those per-case scores.

    Args:
        case_embs: Pair ``(problem_embedding, solution_embedding)`` of NumPy
            arrays. 1-D arrays are treated as a single row.
        case_base: Iterable of ``(problem_embedding, solution_embedding)``
            pairs to compare against.

    Returns:
        The ``[0][0]`` entry of the averaged similarity matrix.

    Raises:
        ZeroDivisionError: If ``case_base`` is empty.
    """
    def _as_row(vec):
        # cosine_similarity expects 2-D input; promote a flat vector to one row.
        return vec if vec.ndim != 1 else vec.reshape(1, -1)

    query_problem, query_solution = case_embs
    query_problem = _as_row(query_problem)
    query_solution = _as_row(query_solution)

    per_case_scores = []
    for stored_problem, stored_solution in case_base:
        stored_problem = _as_row(stored_problem)
        stored_solution = _as_row(stored_solution)
        problem_sim = cosine_similarity(query_problem, stored_problem)
        solution_sim = cosine_similarity(query_solution, stored_solution)
        per_case_scores.append((problem_sim + solution_sim) / 2.0)

    mean_score = sum(per_case_scores) / len(per_case_scores)
    return mean_score[0][0]


def get_embeddings(model, text):
    """Embed ``text`` with the embedding function of the named model family.

    Args:
        model: One of ``"llama"``, ``"falcon"``, ``"gemma"`` or ``"mistral"``.
        text: Text forwarded to the matching ``get_<model>_embeddings`` function.

    Returns:
        Whatever the selected embedding function returns.

    Raises:
        ValueError: If ``model`` is not one of the four supported names.
    """
    # Lambdas defer the name lookup, so only the selected backend has to exist.
    backends = (
        ("llama", lambda: get_llama_embeddings(text)),
        ("falcon", lambda: get_falcon_embeddings(text)),
        ("gemma", lambda: get_gemma_embeddings(text)),
        ("mistral", lambda: get_mistral_embeddings(text)),
    )
    for family, embed in backends:
        if model == family:
            return embed()
    raise ValueError(f"Unknown model: {model}")

In [None]:
# Load the raw cases and work on them as a DataFrame.
dataset = load_dataset("Ramitha/unique-records-snippet-combination")
df = pd.DataFrame(dataset['rawcases'])

# Model families referenced by the per-model column names
# (question_answerGenerated_<family>, reverse_answer_answerGenerated_<family>).
MODEL_FAMILIES = ["llama", "falcon", "gemma", "mistral"]

# The family check does not depend on the row, so it is done once up front; for
# an unsupported `model` no row is scored and the data is pushed back unchanged.
if model in MODEL_FAMILIES:
    peer_models = [name for name in MODEL_FAMILIES if name != model]

    # Only rows produced by the selected model are scored. Iterating over this
    # filtered copy keeps the progress bar accurate and avoids looping over
    # `df` while new columns are being written to it.
    own_rows = df[df["model"] == model]

    for index, row in tqdm(own_rows.iterrows(), total=len(own_rows)):
        # Current case: the question (with and without the snippet appended)
        # and the generated answer, all embedded with the selected model.
        q_emb_without_context = np.array(get_embeddings(model, row["question"]))
        a_emb_without_context = np.array(get_embeddings(model, row["answerGenerated"]))
        q_emb_with_context = np.array(get_embeddings(model, row["question"] + " " + row["snippet"]))
        case_without_context = [q_emb_without_context, a_emb_without_context]
        case_with_problem_context = [q_emb_with_context, a_emb_without_context]

        # Case base: one (question, answer) pair per other model family, read
        # from that family's columns and embedded with the selected model.
        case_base_without_context, case_base_with_problem_context = [], []
        for other_model in peer_models:
            q_wo = np.array(get_embeddings(model, row[f"question_answerGenerated_{other_model}"]))
            q_wc = np.array(get_embeddings(model, row[f"question_answerGenerated_{other_model}"] + " " + row["snippet"]))
            a_wo = np.array(get_embeddings(model, row[f"reverse_answer_answerGenerated_{other_model}"]))
            case_base_without_context.append([q_wo, a_wo])
            case_base_with_problem_context.append([q_wc, a_wo])

        # Write both alignment scores back onto the full frame by row label.
        df.at[index, f"ILRAlign_without_context_{model}"] = get_case_alignment(case_without_context, case_base_without_context)
        df.at[index, f"ILRAlign_with_problem_context_only_{model}"] = get_case_alignment(case_with_problem_context, case_base_with_problem_context)

# Publish the updated frame back to the same dataset repository and split.
hf_dataset = DatasetDict({
    'rawcases': Dataset.from_pandas(df)
})
hf_dataset.push_to_hub("Ramitha/unique-records-snippet-combination")