In [1]:
%load_ext autoreload
%autoreload 2



import pandas as pd
from torch.utils.data import Dataset
import torch
import os
import random
import numpy as np
from torch import nn
from typing import Dict, Optional, Tuple, List
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, random_split
from datasets import Dataset as _Dataset, DatasetDict
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import precision_score, f1_score, recall_score, classification_report

import time
import math
import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

import gc

from src.data_loader import *
from src.qa_dataset import *
from src.train import *
from src.classifiers import *
from src.graph import *

os.environ['WANDB_DISABLED'] = 'true'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from unsloth import FastLanguageModel
import torch

# Loading configuration; `max_seq_length` is reused by the trainer cells further down.
max_seq_length = 2048  # any value works, RoPE scaling is handled internally by Unsloth
dtype = None           # None -> auto-detect (float16 on Tesla T4/V100, bfloat16 on Ampere+)
load_in_4bit = True    # 4-bit quantisation to reduce memory usage; may be False

# NOTE(review): this directory is the one written by the `model.save_pretrained(...)`
# cell later in this notebook, so a fresh kernel needs that checkpoint on disk first
# (the alternative base checkpoint was "unsloth/llama-3-8b-bnb-4bit").
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="models/lora_model_instruct_1",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 4080 SUPER. Max memory: 15.992 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.4 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [3]:
# Wrap the loaded model with LoRA adapters on the attention and MLP projections.
# NOTE(review): `model = f(model)` is not idempotent — re-running this cell wraps the
# already-wrapped model again.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank: any number > 0 (8, 16, 32, 64, 128 are the suggested values)
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0.1,  # the logged Unsloth warning says only 0 gets the fast patching path
    bias="none",       # "none" is the optimised setting
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,   # rank-stabilised LoRA is available but switched off
    loftq_config=None,  # LoftQ is available but switched off
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.4 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


## Prepare data

In [4]:
# Read the three dataset splits from ./data in one pass.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/dev.csv', './data/test.csv')
)

In [5]:
def linearize_graph(graph_dict):
    """Flatten a node-link graph into one string of ``source, relation, target`` triples.

    Parameters
    ----------
    graph_dict : str or dict
        The graph, either already parsed or as the Python-literal text of that
        dict. It must provide ``"nodes"`` (items with ``"id"``, ``"label"`` and
        ``"type"``) and ``"links"`` (items with ``"source"``, ``"target"`` and
        ``"label"``).

    Returns
    -------
    str
        Concatenation of `` {source}, {relation}, {target} `` for every link,
        ordered by source node id and, within a source, by link order. Labels of
        nodes whose type is ``"ANSWER_CANDIDATE_ENTITY"`` are wrapped in ``[...]``.
        An empty string when there are no links.

    Raises
    ------
    AssertionError
        If the sorted node ids are not exactly ``0..n-1`` (links index nodes by id).
    """
    if not isinstance(graph_dict, dict):
        # NOTE(review): eval() executes arbitrary code found in the data file. Kept
        # so every string that parsed before still parses; consider
        # ast.literal_eval once the column is confirmed to hold plain literals.
        graph_dict = eval(graph_dict)

    nodes = sorted(graph_dict["nodes"], key=lambda d: d["id"])
    for n_id, node_dict in enumerate(nodes):
        assert n_id == node_dict["id"]

    def display_label(node_dict):
        # Answer candidates are highlighted with square brackets.
        label = node_dict["label"]
        if node_dict["type"] == "ANSWER_CANDIDATE_ENTITY":
            return f"[{label}]"
        return label

    # Group outgoing links by source id, keeping their original order.
    src_node_id2links = {}
    for link_dict in graph_dict["links"]:
        src_node_id2links.setdefault(link_dict["source"], []).append(link_dict)

    triples = []
    for n_id, node_dict in enumerate(nodes):
        start_label = display_label(node_dict)
        for link_dict in src_node_id2links.get(n_id, ()):
            target_label = display_label(nodes[link_dict["target"]])
            triples.append(f" {start_label}, {link_dict['label']}, {target_label} ")

    return "".join(triples)

In [6]:
def linearize_graph_new(graph_dict):
    """Flatten a node-link graph into ``(source, relation, target); `` triples.

    Parameters
    ----------
    graph_dict : str or dict
        The graph, either already parsed or as the Python-literal text of that
        dict, with ``"nodes"`` (items with ``"id"`` and ``"label"``) and
        ``"links"`` (items with ``"source"``, ``"target"`` and ``"label"``).

    Returns
    -------
    str
        One ``(source, relation, target); `` chunk per link, in link order.
        Self-loops (source id equal to target id) are skipped. Answer candidates
        are not highlighted here.

    Raises
    ------
    AssertionError
        If the sorted node ids are not exactly ``0..n-1``.
    """
    if not isinstance(graph_dict, dict):
        # NOTE(review): eval() executes arbitrary code found in the data file. Kept
        # so every string that parsed before still parses; consider
        # ast.literal_eval once the column is confirmed to hold plain literals.
        graph_dict = eval(graph_dict)

    nodes = sorted(graph_dict["nodes"], key=lambda d: d["id"])
    for n_id, node_dict in enumerate(nodes):
        assert n_id == node_dict["id"]

    triples = []
    for link_dict in graph_dict["links"]:
        link_src = link_dict["source"]
        link_targ = link_dict["target"]
        src_name = nodes[link_src]['label']
        targ_name = nodes[link_targ]['label']
        # NOTE(review): this compares the target *label* with 0, so the swap only
        # fires for a label equal to 0. Presumably the node id (`link_targ`) was
        # meant — left unchanged until the intent is confirmed.
        if targ_name == 0:
            src_name, targ_name = targ_name, src_name
        if link_src != link_targ:
            triples.append(f"({src_name}, {link_dict['label']}, {targ_name}); ")

    return "".join(triples)

In [7]:
# Add the flattened-graph text column to every split.
for split_df in (train_df, dev_df, test_df):
    split_df["linearized_graph"] = split_df["graph"].apply(linearize_graph)

In [8]:
def list_answers(df):
    """Collapse the candidate rows of one question into a single prompt row.

    `df` holds the rows of one question (it is used as a ``groupby('question')``
    callback). The question text and the ground-truth answer are taken from the
    first row; every row contributes one numbered line with its bracketed
    candidate entity and its linearized graph.

    Returns a one-row DataFrame with columns ``possible_answers`` (the rendered
    text) and ``true_answer``.
    """
    candidates = df['answerEntity'].tolist()
    graphs = df['linearized_graph'].tolist()
    first_question = df['question'].tolist()[0]

    header = f"""
Q: {first_question}
"""
    numbered = [
        f"""{position}. [{candidate}]. Graph: {graph} \n"""
        for position, (candidate, graph) in enumerate(zip(candidates, graphs), start=1)
    ]
    prompt_text = header + "".join(numbered)
    gold_answer = df['groundTruthAnswerEntity'].tolist()[0]

    return pd.DataFrame({
        'possible_answers': [prompt_text],
        'true_answer': [gold_answer],
    })

# One row per question: group the candidate rows and render each group with list_answers.
train_df_lin = (
    train_df
    .groupby('question')
    .apply(list_answers)
    .reset_index()
)
dev_df_lin = (
    dev_df
    .groupby('question')
    .apply(list_answers)
    .reset_index()
)

# Hugging Face datasets built from the per-question frames.
train_ds = _Dataset.from_pandas(train_df_lin)
dev_ds = _Dataset.from_pandas(dev_df_lin)

  train_df_lin = train_df.groupby('question').apply(list_answers).reset_index()
  dev_df_lin = dev_df.groupby('question').apply(list_answers).reset_index()


In [5]:
# Alpaca-style template with three positional slots: instruction, input, response.
# NOTE(review): unlike `alpaca_prompt_inst` defined later in this notebook, there is no
# blank line between the preamble and "### Instruction:" — confirm this is intended.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Task instruction that fills the first slot of `alpaca_prompt` when training prompts are built.
# NOTE(review): a later inference cell rebinds `instruction` to a different string, so the
# value any cell sees depends on execution order.
instruction = """Given a question and the associated retrieved knowledge graph triplets contains entity, relation, and entity. Possible answer entity highlighted like [entity] 
You are asked to answer the question with these triplets and your knowledge.
"""

# Appended to every training text by the formatting function defined next.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render one (unbatched) dataset row as a complete training text.

    Reads ``possible_answers`` and ``true_answer`` from `examples`, wraps the
    answer in braces, fills the module-level `alpaca_prompt` together with the
    module-level `instruction`, and appends `EOS_TOKEN`.

    Returns ``{"text": <str>}``.

    NOTE(review): a later cell defines another `formatting_prompts_func` (batched,
    different columns) under the same name; whichever ran last wins.
    """
    candidates_block = examples["possible_answers"]
    braced_answer = '{' + examples["true_answer"] + '}'
    prompt = alpaca_prompt.format(instruction, candidates_block, braced_answer)
    return {"text": prompt + EOS_TOKEN}

# Row-by-row map that adds the "text" column used for task fine-tuning.
# NOTE(review): the recorded output of this cell is a NameError — it was executed in a
# kernel where `train_ds` (built in the data-preparation cells) did not exist yet.
dataset_train = train_ds.map(formatting_prompts_func, batched=False)

NameError: name 'train_ds' is not defined

In [10]:
# Show the first rendered training text as a sanity check of the prompt layout.
dataset_train['text'][0]

'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n### Instruction:\nGiven a question and the associated retrieved knowledge graph triplets contains entity, relation, and entity. Possible answer entity highlighted like [entity] \nYou are asked to answer the question with these triplets and your knowledge.\n\n\n### Input:\n\nQ: Adolf Hitler was the leader of which party?\n1. [The Nazi Party in Linz, Austria, 1919-1939: A Sociological Perspective]. Graph:  Austria, on focus list of Wikimedia project, Wikipedia:Vital articles/Level/4  Austria, country, Austria  Adolf Hitler, on focus list of Wikimedia project, Wikipedia:Vital articles/Level/4  [The Nazi Party in Linz, Austria, 1919-1939: A Sociological Perspective], instance of, scholarly article  [The Nazi Party in Linz, Austria, 1919-1939: A Sociological Perspective], main subject, Austria  Hitler and new contributions to the deb

## No fine-tuning

In [41]:
def list_answers_raw(df):
    """Render all candidate rows of one question as a single numbered text block.

    Same contract as `list_answers` defined earlier in this notebook: `df` is
    one ``groupby('question')`` group; the question and the ground-truth answer
    come from its first row.

    Returns a one-row DataFrame with columns ``possible_answers`` and
    ``true_answer``.
    """
    entity_col = df['answerEntity'].tolist()
    graph_col = df['linearized_graph'].tolist()
    question_text = df['question'].tolist()[0]

    pieces = [f"""
Q: {question_text}
"""]
    # zip() semantics: stop at the shorter of the two columns.
    for idx in range(min(len(entity_col), len(graph_col))):
        pieces.append(f"""{idx+1}. [{entity_col[idx]}]. Graph: {graph_col[idx]} \n""")

    summary = {
        'possible_answers': ["".join(pieces)],
        'true_answer': [df['groundTruthAnswerEntity'].tolist()[0]],
    }
    return pd.DataFrame(summary)

# Rebuild the per-question frames, this time with list_answers_raw.
train_groups = train_df.groupby('question')
train_df_lin = train_groups.apply(list_answers_raw).reset_index()

dev_groups = dev_df.groupby('question')
dev_df_lin = dev_groups.apply(list_answers_raw).reset_index()

# Matching Hugging Face datasets.
train_ds = _Dataset.from_pandas(train_df_lin)
dev_ds = _Dataset.from_pandas(dev_df_lin)


  train_df_lin = train_df.groupby('question').apply(list_answers_raw).reset_index()
  dev_df_lin = dev_df.groupby('question').apply(list_answers_raw).reset_index()


In [42]:
# Print one rendered question block (row 2) to eyeball the candidate/graph layout.
print(train_df_lin['possible_answers'][2])


Q: As of Q2 2021, who has the least GPU shipment share?
1. [methyldopa]. Graph:  computer, on focus list of Wikimedia project, Wikipedia:Vital articles/Level/4  computer, has part(s), graphics processing unit  essential medicine, on focus list of Wikimedia project, Wikipedia:Vital articles/Level/4  [methyldopa], instance of, essential medicine  
2. [age related macular degeneration]. Graph:  computer, on focus list of Wikimedia project, Wikipedia:Vital articles/Level/4  computer, has part(s), graphics processing unit  ophthalmology, on focus list of Wikimedia project, Wikipedia:Vital articles/Level/4  [age related macular degeneration], subclass of, disease of a particular individual  [age related macular degeneration], health specialty, ophthalmology  arteriovenous malformation, subclass of, disease of a particular individual  Large-scale ensemble simulations of biomathematical brain arteriovenous malformation models using graphics processing unit computation, main subject, graphics 

In [47]:
# Few-shot block with plain candidate lists (no graphs) and a short explanation per answer.
# Not referenced by any cell visible in this notebook.
# NOTE(review): the 2nd and 3rd examples lack the "Q: " prefix the other examples use.
few_shot0 = """
Select one of possible answers for the question. Explain your answer very briefly.
Examples:
Q: At the 1994 Oscars, which movie didn't win anything, despite receiving seven nominations?
PA:
1. The Shawshank Redemption 
2. The Great Gatsby 
3. Star Wars: Episode VIII – The Last Jedi 
4. Inglourious Basterds 
5. Gone with the Wind 
6. Gods and Monsters 
7. Fifty Shades Darker 
8. Braveheart 
9. An American Werewolf in London 
10. Alice in Wonderland 
11. A Star Is Born 
A: The Shawshank Redemption was nominated for seven Oscars: Best Picture, Best Actor (Morgan Freeman), Best Adapted Screenplay, Best Cinematography, Best Editing, Best Original Score, Best Sound Mixing but didn't win.
The answer is {The Shawshank Redemption}

Did Bill Clinton belong to the Republican or Democratic Party?
PA:
1. Sri Lanka 
2. North Korea 
3. Democratic Republic of the Congo 
4. Democratic Party 
5. Democratic Party 
6. Democrat Party
A: Bill Clinton belong to Democratic Party
The answer is {Democratic Party}

For which movie was Leonardo DiCaprio nominated for Best Actor in 2007 but did not win?
PA:
1. True Grit 
2. The Shape of Water 
3. The Matrix 
4. The Last of Us Part II 
5. The Last Days of Disco 
6. The Aviator 
7. Ocean's 11 
8. Good Will Hunting 
9. Gone with the Wind 
10. Gone Girl 
11. Blood Diamond
A: Leonardo DiCaprio has been nominated for the Oscars on several occasions: 1994 - Best Supporting Actor for What's Eating Gilbert Grape, 2005 - Best Actor for The Aviator, 2007 - Best Actor for Blood Diamond, and so on.
The answer is {Blood Diamond}

Q: Did Stephen King write Creepshow or Firestarter first?
PA:
1. The Creepshow 
2. The Creepshow 
3. Necronomicon 
4. Firestarter 
5. Firestarter 
6. Fear and Loathing in America 
7. Fear and Loathing 
8. Crocodile Dundee 
9. Crimson Skies 
10. Crimson Skies 
11. Creepshow
A: Stephen King wrote Firestarter first, published in 1980. Creepshow, the graphic novel, came after in 1982, inspired by the 1982 movie for which King wrote the screenplay.
The answer is {Firestarter}

"""

# Few-shot block with step-by-step reasoning and the answer in braces.
# Not referenced by any cell visible in this notebook.
# NOTE(review): the "You Drive Me Crazy" example reasons that the artist is Britney Spears
# but gives {Jason Allen Alexander} as the answer — the example contradicts itself.
few_shot1 = """Q: What state is home to the university that is represented in sports by George Washington Colonials men's basketball?
A: First, the education institution has a sports team named George Washington Colonials men's basketball in is George Washington University , Second, George Washington University is in Washington D.C. The answer is {Washington, D.C.}.

Q: Who lists Pramatha Chaudhuri as an influence and wrote Jana Gana Mana?
A: First, Bharoto Bhagyo Bidhata wrote Jana Gana Mana. Second, Bharoto Bhagyo Bidhata lists Pramatha Chaudhuri as an influence. The answer is {Bharoto Bhagyo Bidhata}.

Q: Who was the artist nominated for an award for You Drive Me Crazy?
A: First, the artist nominated for an award for You Drive Me Crazy is Britney Spears. The answer is {Jason Allen Alexander}.

Q: What person born in Siegen influenced the work of Vincent Van Gogh?
A: First, Peter Paul Rubens, Claude Monet and etc. influenced the work of Vincent Van Gogh. Second, Peter Paul Rubens born in Siegen. The answer is {Peter Paul Rubens}.

Q: What is the country close to Russia where Mikheil Saakashvii holds a government position?
A: First, China, Norway, Finland, Estonia and Georgia is close to Russia. Second, Mikheil Saakashvii holds a government position at Georgia. The answer is {Georgia}.

Q: What drug did the actor who portrayed the character Urethane Wheels Guy overdosed on?
A: First, Mitchell Lee Hedberg portrayed character Urethane Wheels Guy. Second, Mitchell Lee Hedberg overdose Heroin. The answer is {Heroin}."""

# Few-shot block that the cells below pass as the first argument of `prompt_fs.format(...)`.
# Each example uses the numbered "[candidate]. Graph: ..." layout of the `possible_answers`
# column and ends with the answer in braces, the form the evaluation regex extracts.
few_shot = """Q: Are the Green Bay Packers or Dallas Cowboys older?
1. [history of the Los Angeles Rams]. Graph:  United States, country, United States  Dallas Cowboys, country, United States  Green Bay Packers, country, United States  [history of the Los Angeles Rams], country, United States  
2. [history of the Green Bay Packers]. Graph:  United States, country, United States  Dallas Cowboys, country, United States  Green Bay Packers, country, United States  Green Bay Packers, history of topic, [history of the Green Bay Packers]  [history of the Green Bay Packers], country, United States  [history of the Green Bay Packers], facet of, Green Bay Packers  
3. [New York Giants]. Graph:  United States, country, United States  Dallas Cowboys, country, United States  Green Bay Packers, country, United States  [New York Giants], country, United States  
4. [New England Patriots Cheerleaders]. Graph:  United States, country, United States  Dallas Cowboys, country, United States  Green Bay Packers, country, United States  [New England Patriots Cheerleaders], country, United States  
5. [Green Bay Packers]. Graph:  United States of America, country, United States of America  Dallas Cowboys, country, United States of America  [Green Bay Packers], country, United States of America  
6. [Gerrit Kolenbrander]. Graph:  Tony Romo, sex or gender, male  Tony Romo, member of sports team, Dallas Cowboys  John Stephens, sex or gender, male  John Stephens, member of sports team, Green Bay Packers  [Gerrit Kolenbrander], sex or gender, male  
7. [G.B.C.]. Graph:  United States, country, United States  United States, diplomatic relation, Italy  Italy, diplomatic relation, United States  Italy, country, Italy  Dallas Cowboys, country, United States  Green Bay Packers, country, United States  [G.B.C.], country, Italy  
8. [District of Columbia]. Graph:  United States, country, United States  United States, contains the administrative territorial entity, [District of Columbia]  Dallas Cowboys, country, United States  Green Bay Packers, country, United States  [District of Columbia], country, United States  
9. [Denver Broncos]. Graph:  United States, country, United States  Dallas Cowboys, country, United States  Green Bay Packers, country, United States  [Denver Broncos], country, United States  
10. [Dallas Cowboys]. Graph:  United States, country, United States  [Dallas Cowboys], country, United States  Green Bay Packers, country, United States  
11. [Chicago Bears]. Graph:  United States, country, United States  Dallas Cowboys, country, United States  Green Bay Packers, country, United States  [Chicago Bears], country, United States
A: {Green Bay Packers}

Q: As of Q2 2021, who has the least GPU shipment share?
1. [methyldopa]. Graph:  computer, on focus list of Wikimedia project, Wikipedia:Vital articles/Level/4  computer, has part(s), graphics processing unit  essential medicine, on focus list of Wikimedia project, Wikipedia:Vital articles/Level/4  [methyldopa], instance of, essential medicine  
2. [age related macular degeneration]. Graph:  computer, on focus list of Wikimedia project, Wikipedia:Vital articles/Level/4  computer, has part(s), graphics processing unit  ophthalmology, on focus list of Wikimedia project, Wikipedia:Vital articles/Level/4  [age related macular degeneration], subclass of, disease of a particular individual  [age related macular degeneration], health specialty, ophthalmology  arteriovenous malformation, subclass of, disease of a particular individual  Large-scale ensemble simulations of biomathematical brain arteriovenous malformation models using graphics processing unit computation, main subject, graphics processing unit  Large-scale ensemble simulations of biomathematical brain arteriovenous malformation models using graphics processing unit computation, main subject, arteriovenous malformation  
3. [PowerVR]. Graph:  Imagination Technologies, produces, graphics processing unit  Imagination Technologies, owner of, [PowerVR]  [PowerVR], owned by, Imagination Technologies  
4. [Nvidia]. Graph:  [Nvidia], produces, graphics processing unit  
5. [Imagination Technologies (United States)]. Graph:  Imagination Technologies, produces, graphics processing unit  Imagination Technologies, has subsidiary, [Imagination Technologies (United States)]  [Imagination Technologies (United States)], parent organisation, Imagination Technologies  
6. [Imagination Technologies]. Graph:  [Imagination Technologies], produces, graphics processing unit  
7. [GeForce]. Graph:  [GeForce], subclass of, graphics processing unit  
8. [CUDA]. Graph:  [CUDA], uses, graphics processing unit  
9. [AMD]. Graph:  [AMD], produces, graphics processing unit  
A: {Nvidia}

"""

In [50]:
from transformers import TextStreamer

# Few-shot template: slot 1 takes the worked examples, slot 2 the new question block;
# the trailing "A: " cues the model to answer.
prompt_fs = """{}
{}
A: """

FastLanguageModel.for_inference(model)  # enable Unsloth's native 2x faster inference

# Smoke test on the first dev question only.
first_dev_prompt = prompt_fs.format(few_shot, dev_ds['possible_answers'][0])
inputs = tokenizer([first_dev_prompt], return_tensors="pt").to("cuda")

# Stream the continuation to the cell output; the returned ids are not needed.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=100,
    early_stopping=True,
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Q: Are the Green Bay Packers or Dallas Cowboys older?
1. [history of the Los Angeles Rams]. Graph:  United States, country, United States  Dallas Cowboys, country, United States  Green Bay Packers, country, United States  [history of the Los Angeles Rams], country, United States  
2. [history of the Green Bay Packers]. Graph:  United States, country, United States  Dallas Cowboys, country, United States  Green Bay Packers, country, United States  Green Bay Packers, history of topic, [history of the Green Bay Packers]  [history of the Green Bay Packers], country, United States  [history of the Green Bay Packers], facet of, Green Bay Packers  
3. [New York Giants]. Graph:  United States, country, United States  Dallas Cowboys, country, United States  Green Bay Packers, country, United States  [New York Giants], country, United States  
4. [New England Patriots Cheerleaders]. Graph:  United States, country, United States  Dallas Cowboys, country, United States  Green Bay 

In [52]:
#dev_df[dev_df.question=="After a series of number 1 hits in 1956, which Elvis Presley song failed to reach the top spot in the US chart?"]

In [53]:
#train_df[train_df.question=="Are the Green Bay Packers or Dallas Cowboys older?"]

In [37]:
#res

In [None]:
import re

# The predicted answer is the content of the first {...} group in the completion.
pattern = re.compile(r'\{([^}]*)\}')

# question text -> extracted answer ("" when the completion has no braces)
res = {}
for pp in tqdm(dev_ds):
    input_text = prompt_fs.format(few_shot, pp['possible_answers'])
    inputs = tokenizer([input_text], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=100, use_cache=True, early_stopping=True)

    # Decode only the newly generated tokens. Cutting the decoded string with
    # len(input_text) is misaligned: the decoded sequence starts with the BOS marker
    # (visible as "<|begin_of_text|>" in the streamed output above) and detokenisation
    # may not reproduce the prompt character for character. With a misaligned cut the
    # few-shot answers in braces or a truncated completion could be matched instead.
    prompt_token_count = inputs["input_ids"].shape[1]
    out = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)

    matchs = pattern.findall(out)
    res[pp['question']] = matchs[0] if matchs else ""

In [15]:
# Per-question predictions as a two-column frame.
res_df = pd.DataFrame({
    'question': [*res],
    'pred_ans': [*res.values()],
})

# Attach each question's prediction to all of its candidate rows; a row is predicted
# positive (1.0) when its candidate entity equals the predicted answer string.
dev_df2 = dev_df.merge(res_df, on='question', how='left')
dev_df2['pred'] = dev_df2['pred_ans'].eq(dev_df2['answerEntity']).astype(float)
print(dev_df2['pred'].mean())

true_labels = dev_df2['label']
pred_labels = dev_df2['pred']

# Row-level precision, recall and F1 against the `label` column.
precision = precision_score(true_labels, pred_labels)
recall = recall_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels)
print(f"{precision},{recall},{f1}")

0.04599840467960649
0.5722543352601156,0.25984251968503935,0.3574007220216607


## First tune for instructions

In [7]:
# Alpaca template used for the generic instruction-tuning pass; three positional slots:
# instruction, input, response.
alpaca_prompt_inst = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Appended to every formatted example by the formatting function defined next.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Format a *batch* of Alpaca records into training texts.

    `examples` maps ``"instruction"``, ``"input"`` and ``"output"`` to equally
    ordered sequences. Each triple fills the module-level `alpaca_prompt_inst`
    and gets `EOS_TOKEN` appended — without it generation would go on forever.

    Returns ``{"text": [<str>, ...]}``.

    NOTE(review): this reuses the name of the row-wise `formatting_prompts_func`
    defined earlier in the notebook and replaces it once this cell has run.
    """
    columns = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt_inst.format(task, context, answer) + EOS_TOKEN
            for task, context, answer in columns
        ],
    }

from datasets import load_dataset

# Download the cleaned Alpaca training split and add the formatted "text" column.
dataset = load_dataset("yahma/alpaca-cleaned", split="train").map(
    formatting_prompts_func,
    batched=True,
)

from trl import SFTTrainer
from transformers import TrainingArguments

# Choose the mixed-precision mode once: bf16 where supported, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

# Short run: 60 optimiser steps at an effective batch of 2 * 4 = 8 sequences.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=60,
    learning_rate=1e-3,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning on the formatted Alpaca texts.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # packing can make training 5x faster for short sequences
    args=training_args,
)

trainer_stats = trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.8262
2,2.3094
3,1.639
4,1.6103
5,1.2519
6,1.2541
7,0.9473
8,1.1565
9,1.0229
10,1.1415


In [8]:
# Release Python garbage and PyTorch's cached CUDA blocks between training stages.
# The `del trainer` below is commented out, so the trainer object itself stays alive.
#del trainer
gc.collect()
torch.cuda.empty_cache()

In [9]:
# Save model
# This is the same directory the model-loading cell at the top of the notebook reads from.
model.save_pretrained("models/lora_model_instruct_1") 

## Tune for task

In [11]:
# Repeats the assignment already made in the first (imports) cell.
# The trainer log reports WANDB_DISABLED as deprecated in favour of `report_to`.
os.environ['WANDB_DISABLED'] = 'true'

In [12]:
from trl import SFTTrainer
from transformers import TrainingArguments

# bf16 where the GPU supports it, fp16 otherwise.
bf16_available = torch.cuda.is_bf16_supported()

# Same hyper-parameters as the instruction-tuning run: 60 steps, effective batch of 8.
task_training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=60,
    learning_rate=1e-3,
    fp16=not bf16_available,
    bf16=bf16_available,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Task fine-tuning on the question/candidate prompts held in `dataset_train`.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_train,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # packing can make training 5x faster for short sequences
    args=task_training_args,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Map (num_proc=2): 100%|████████████████████████████████████████████████████| 3181/3181 [00:02<00:00, 1185.21 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [13]:
# Run the task fine-tuning configured in the previous cell; keeps the value returned by train().
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,181 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,0.9768
2,0.9774
3,0.7649
4,0.6445
5,0.4874
6,0.4177
7,0.3873
8,0.327
9,0.3473
10,0.3482


In [10]:
from transformers import TextStreamer

# Free-form sanity check: the bare question, without the Alpaca template.
FastLanguageModel.for_inference(model)  # enable Unsloth's native 2x faster inference
inputs = tokenizer(
    [
        """How many gods did the Incas believe in?"""
    ],
    return_tensors="pt",
).to("cuda")

# `do_sample=True` used to be passed to TextStreamer, where extra keyword arguments are
# only forwarded to tokenizer.decode — it never enabled sampling. It is dropped here so
# the code says what it does (default decoding). To sample, pass do_sample=True to
# model.generate instead.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=100)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>How many gods did the Incas believe in? The Incas believed in one god, Viracocha, who created the world and all the people in it. They also believed in many lesser gods, or deities, who were responsible for different aspects of life.
What are the 3 main gods of the Incas? The three main gods of the Incas were Inti, the sun god, Pachamama, the earth goddess, and Mama Quilla, the moon goddess. Inti was the most important god, and he


In [11]:
from transformers import TextStreamer

# NOTE(review): this rebinds the global `instruction` that the training-prompt cell
# defined with a different, longer text. Every later cell that formats `alpaca_prompt`
# — including the dev-set evaluation loop — then sees this string, so results depend on
# execution order. The "ans" typo is kept byte for byte because the string is model input.
instruction = "Explain your answer briefly ans write answer to the question like {answer}"

FastLanguageModel.for_inference(model)  # enable Unsloth's native 2x faster inference
inputs = tokenizer(
    [
        alpaca_prompt.format(
            instruction,  # instruction
            "How many gods did the Incas believe in?",  # input
            "",  # output - leave this blank for generation!
        )
    ],
    return_tensors="pt",
).to("cuda")

# `do_sample=True` used to be passed to TextStreamer, where extra keyword arguments are
# only forwarded to tokenizer.decode — it never enabled sampling. It is dropped here;
# to sample, pass do_sample=True to model.generate instead.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=200, early_stopping=True)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Explain your answer briefly ans write answer to the question like {answer}

### Input:
How many gods did the Incas believe in?

### Response:
The Incas believed in a supreme god, Viracocha, who created the world and all its inhabitants. They also believed in a number of lesser gods, including Inti, the sun god, and Mama Quilla, the moon goddess. Additionally, they had a pantheon of nature gods, including Chasca, the god of the rainbow, and Pachamama, the earth goddess. The Incas also believed in a number of other supernatural beings, such as the Apus, mountain spirits, and the Huacas, which were the spirits of the dead. In total, the Incas believed in a complex and diverse pantheon of gods and supernatural beings.<|end_of_text|>


In [18]:
import re

# The predicted answer is the content of the first {...} group in the completion.
pattern = re.compile(r'\{([^}]*)\}')

# question text -> extracted answer ("" when the completion has no braces)
res = {}
for pp in tqdm(dev_ds):
    inp = alpaca_prompt.format(
        instruction,             # instruction
        pp['possible_answers'],  # input
        "",                      # output - leave this blank for generation!
    )
    inputs = tokenizer([inp], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True, early_stopping=True)

    # Decode only the newly generated tokens. Cutting the decoded string with len(inp)
    # is misaligned: the decoded sequence starts with the BOS marker (visible as
    # "<|begin_of_text|>" in the streamed outputs above) and detokenisation may not
    # reproduce the prompt character for character, so the cut can land inside the
    # prompt or inside the answer.
    prompt_token_count = inputs["input_ids"].shape[1]
    out = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)

    match = pattern.findall(out)
    res[pp['question']] = match[0] if match else ""

  0%|                                                                                           | 0/354 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|▏                                                                                  | 1/354 [00:00<05:06,  1.15it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|▍                                                                                  | 2/354 [00:01<03:57,  1.48it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|▋                                                                                  | 3/354 [00:02<03:45,  1.56it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|▉                                                                                  | 4/354 [00:02<03:30,  1.66it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|█▏                            

In [20]:
from copy import copy
# Shallow copy of the raw predictions, so later adjustments leave `res` untouched.
res_approx = copy(res)

In [68]:
## To possible
# Snap every free-text model answer that is not literally one of the question's candidate
# entities onto the candidate whose embedding is most similar to it.

model_name = "sentence-transformers/all-mpnet-base-v2"

# Dedicated name on purpose: binding this to the global `tokenizer` replaced the LLM
# tokenizer, so re-running any generation cell afterwards would use the wrong one.
bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)
bert_model.eval()
print()

res_approx = dict(res)  # shallow copy; `res` keeps the raw model answers

# Candidate entities per question, built once instead of filtering dev_df in every iteration.
answers_by_question = dev_df.groupby('question')['answerEntity'].apply(list).to_dict()

for question, model_answer in tqdm(res.items()):
    answers = answers_by_question.get(question, [])
    # Nothing to do when the answer already is a candidate, or when there are no candidates.
    if not answers or model_answer in answers:
        continue

    ans_list = [model_answer] + answers
    tok = bert_tokenizer(ans_list, return_tensors="pt", truncation=True, padding="max_length")
    #tok = {k:v.cuda() for k,v in tok.items()}
    with torch.no_grad():  # inference only: skip building the autograd graph
        last_hidden_states = bert_model(**tok)['last_hidden_state']

    # First-token embedding of every text; row 0 is the model answer, the rest are candidates.
    # NOTE(review): sentence-transformers checkpoints are presumably meant to be used with
    # mean pooling rather than the first token — confirm against the model card.
    text_cls_embeddings = last_hidden_states[:, 0, :]
    query, candidates = text_cls_embeddings[0], text_cls_embeddings[1:]

    # Cosine similarity between the model answer and each candidate.
    scores = query @ candidates.T / torch.norm(query) / torch.norm(candidates, dim=1)
    best_idx = int(np.argmax(scores.cpu().numpy()))

    res_approx[question] = answers[best_idx]





100%|█████████████████████████████████████████████████████████████████████████████████| 354/354 [00:54<00:00,  6.49it/s]


In [21]:
# Predictions after snapping onto candidate entities, one row per question.
questions_col = list(res_approx.keys())
answers_col = list(res_approx.values())
res_df = pd.DataFrame({'question': questions_col, 'pred_ans': answers_col})

# Spread each prediction over the question's candidate rows and mark the matching row.
dev_df2 = dev_df.merge(res_df, how='left', on='question')
is_predicted = dev_df2['pred_ans'] == dev_df2['answerEntity']
dev_df2['pred'] = is_predicted.astype(float)
print(dev_df2['pred'].mean())

true_labels = dev_df2['label']
pred_labels = dev_df2['pred']

# Row-level precision, recall and F1 against the `label` column.
p_score = precision_score(true_labels, pred_labels)
r_score = recall_score(true_labels, pred_labels)
f_score = f1_score(true_labels, pred_labels)
print(f"{p_score},{r_score},{f_score}")

0.10608880616857219
0.47117794486215536,0.49343832020997375,0.48205128205128206


In [None]:
# NOTE(review): bare literal with no computation — looks like a precision,recall,F1 triple
# pasted from another run of the metrics cell; TODO confirm, then move it to markdown.
0.48614609571788414,0.5065616797900262,0.4961439588688946

In [19]:
# Disabled experiment kept as a string literal: nothing in it executes, the cell only
# echoes the text. It sketches an alternative scoring that treats `res` values as booleans.
'''res_df = pd.DataFrame({
    'question': list(res.keys()),
    'pred_true': list(res.values())
})

dev_df2 = dev_df.merge(res_df, on='question', how='left')
dev_df2['pred'] = 0.0
rand_fls = dev_df2[dev_df2['answerEntity']!=dev_df2['groundTruthAnswerEntity']].groupby('question')[['question', 'answerEntity']].head(1).rename(columns={'answerEntity': 'w_answerEntity'})
dev_df2 = dev_df2.merge(rand_fls, on='question', how='left')
#print(rand_fls.head())
dev_df2.loc[dev_df2['pred_true'] & (dev_df2['answerEntity']==dev_df2['groundTruthAnswerEntity']), 'pred'] = 1.0
dev_df2.loc[~dev_df2['pred_true'] & (dev_df2['answerEntity']==dev_df2['w_answerEntity']), 'pred'] = 1.0'''

"res_df = pd.DataFrame({\n    'question': list(res.keys()),\n    'pred_true': list(res.values())\n})\n\ndev_df2 = dev_df.merge(res_df, on='question', how='left')\ndev_df2['pred'] = 0.0\nrand_fls = dev_df2[dev_df2['answerEntity']!=dev_df2['groundTruthAnswerEntity']].groupby('question')[['question', 'answerEntity']].head(1).rename(columns={'answerEntity': 'w_answerEntity'})\ndev_df2 = dev_df2.merge(rand_fls, on='question', how='left')\n#print(rand_fls.head())\ndev_df2.loc[dev_df2['pred_true'] & (dev_df2['answerEntity']==dev_df2['groundTruthAnswerEntity']), 'pred'] = 1.0\ndev_df2.loc[~dev_df2['pred_true'] & (dev_df2['answerEntity']==dev_df2['w_answerEntity']), 'pred'] = 1.0"

In [20]:
#dev_df[dev_df.question == 'Whose assassination sparked the beginning of World War I?']

In [17]:
#dev_df2[dev_df2.question=='How many gods did the Incas believe in?']