In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from torch.utils.data import Dataset
import torch
import os
import random
import numpy as np
from torch import nn
from typing import Dict, Optional, Tuple, List
from transformers import AutoTokenizer, AutoModel, AutoModelForMultipleChoice
from torch.utils.data import Dataset, random_split
from datasets import Dataset as _Dataset, DatasetDict
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import precision_score, f1_score, recall_score, classification_report
import blingfire as bf
from __future__ import annotations

import time
import math
import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

import faiss
from faiss import write_index, read_index
from sentence_transformers import SentenceTransformer

from dataclasses import dataclass
from typing import Optional, Union
from collections.abc import Iterable
import string
from unsloth import FastLanguageModel

import gc

from src.data_loader import *
from src.qa_dataset import *
from src.train import *
from src.classifiers import *
from src.graph import *

os.environ['WANDB_DISABLED'] = 'true'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Main data
train_df = pd.read_csv('./data/train.csv')
dev_df = pd.read_csv('./data/dev.csv')
test_df = pd.read_csv('./data/test.csv')

full_df = pd.concat([train_df, dev_df, test_df], axis=0).reset_index(drop=True)

In [3]:
def lin(g):
    g = eval(g)
    res = ""
    lnk = g['links'][:5] 
    if len(g['links']) > 5:
        lnk = lnk + [g['links'][-1]]
    for link in lnk:
        s = link['source']
        t = link['target']
        if s == t:
            continue
        res += f"({g['nodes'][s]['label']}, {link['label']}, {g['nodes'][t]['label']}) "

    if len(res)>350:
        res = res[:350]
        if res[-1]!=')':
            res += ')'
    
    return res

In [4]:
full_df['graph_triplets'] = full_df['graph'].apply(lin)

In [5]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}

### Input:
{}

### Response:
{}"""

expert_instruction = "Your task is to transform a knowledge graph represented by triplets (entity, relation, entity) to a sentence or multiple sentences."

In [6]:
def add_instruction(df):
    return alpaca_prompt.format(
        expert_instruction,
        f"The knowledge graph is: {df['graph_triplets']}",
        ""
    )

def add_instruction2(df):
    return expert_instruction.format(df['graph_triplets'])

full_df['instruction'] = full_df.apply(add_instruction, axis=1)

In [7]:
print(full_df['instruction'][0])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Your task is to transform a knowledge graph represented by triplets (entity, relation, entity) to a sentence or multiple sentences.

### Input:
The knowledge graph is: (Iran, replaces, Pahlavi dynasty) (Pahlavi dynasty, replaced by, Iran) (Ruhollah Khomeini's return to Iran, country, Pahlavi dynasty) 

### Response:



In [156]:
device = "cuda" if torch.cuda.is_available() else "cpu"

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    #model_name = "unsloth/llama-3-8b-bnb-4bit",
    model_name = "models/lora_model_0",
    #model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 4080 SUPER. Max memory: 15.992 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
def get_output(input_text, output_text):
    return output_text[len(input_text):].split('### Response:\n')[1].split('<|end_of_text|>')[0]

decoded_answers = []

def run_batch_gen(input_texts, BS = 64):
    tokenizer.pad_token = "<|end_of_text|>"
    tokenizer.padding_side = "left"
    
    global decoded_answers
    
    for batch_start in (tqdm(range(0, len(input_texts), BS))):
        input_texts_i = input_texts[batch_start:batch_start+BS]
        inputs = tokenizer(input_texts_i, return_tensors = "pt", padding = True).to("cuda")
        outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True, early_stopping=True, do_sample = False)
        decoded = tokenizer.batch_decode(outputs)
        decoded_answers += decoded

        gc.collect()
        torch.cuda.empty_cache()

    res = [get_output(input_texts, d) for i, d in enumerate(decoded_answers)]

    print(res[-1])

    return res

In [12]:
# full_df['explained_graphs'] = 
explained_graphs = run_batch_gen( full_df['instruction'].tolist(), BS = 40 )

  0%|                                                                                          | 0/1216 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|                                                                                | 1/1216 [00:26<8:49:51, 26.17s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|▏                                                                               | 2/1216 [00:45<7:32:43, 22.37s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|▏                                                                               | 3/1216 [00:56<5:40:35, 16.85s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|▎                                                                               | 4/1216 [01:05<4:44:01, 14.06s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|▎                             

IndexError: list index out of range

In [17]:
explained_graphs = [d.split('### Response:\n')[1].replace('<|end_of_text|>', '') for d in decoded_answers]

In [18]:
full_df['explained_graphs'] = explained_graphs

In [164]:
gc.collect()
torch.cuda.empty_cache()

In [50]:
full_df.linearized_graph[17]

' | Julio Franco |, league, Major League Baseball '

In [47]:
full_df[['question', 'explained_graphs']].explained_graphs[18]

'Major League Baseball is a sport that involves playing baseball. José Bautista is also a baseball player.'

In [83]:
full_df.iloc[1000].question

'Who was the youngest NFL coach to win a Super Bowl?'

In [64]:
full_df['question'][12]

'Whose is the oldest MLB player to hit a home run?'

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

query = """
SELECT ?opponentName
WHERE {
  ?match dbp:date "1973-09-20"^^xsd:date.
  ?match dbp:winner dbc:Billy_Jean_King.
  ?match dbp:loser ?opponent.
  ?opponent foaf:name ?opponentName.
}
"""

sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

for result in results["results"]["bindings"]:
    print(result)
    #print(result["item"]["value"], result["itemLabel"]["value"])

In [222]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
_q = "Among the European Union countries, which one has the largest land area?"
inputs = tokenizer(
[
    #_q
    alpaca_prompt.format("Questions are tricky, if you do it correct you will have 10$. Explain your answer with no more than 10 words", _q, "")
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, do_sample=True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 150, early_stopping=True)
_ = tokenizer.batch_decode(_)[0]
print()
print(_.replace('<|begin_of_text|>', '').replace('<|end_of_text|>', '').split(_q)[-1])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Questions are tricky, if you do it correct you will have 10$. Explain your answer with no more than 10 words

### Input:
Among the European Union countries, which one has the largest land area?

### Response:
The largest land area in the European Union is Russia, with a total area of 17,098,246 square kilometers. This is followed by France with an area of 551,695 square kilometers, and Spain with an area of 505,990 square kilometers.<|end_of_text|>



### Response:
The largest land area in the European Union is Russia, with a total area of 17,098,246 square kilometers. This is followed by France with an area of 551,695 square kilometers, and Spain with an area of 505,990 square kilometers.


In [163]:
#model

In [166]:
decoded_answers1 = []

def rem(inp, out):
    #return out.replace('<|begin_of_text|>', '').replace('<|end_of_text|>', '').split(inp)[-1]
    return out.replace('<|begin_of_text|>', '').replace('<|end_of_text|>', '').split('### Response:\n')[1]

def run_batch_gen_new(input_texts, BS = 64):
    tokenizer.pad_token = "<|end_of_text|>"
    tokenizer.padding_side = "left"
    
    global decoded_answers1
    FastLanguageModel.for_inference(model)
    for batch_start in (tqdm(range(0, len(input_texts), BS))):
        input_texts_i = input_texts[batch_start:batch_start+BS]
        inputs = tokenizer(input_texts_i, return_tensors = "pt", padding = True).to("cuda")
        outputs = model.generate(**inputs, max_new_tokens = 150, use_cache = True, early_stopping=True, do_sample = True)
        decoded = tokenizer.batch_decode(outputs)

        decoded = [rem(input_texts_i[j], d) for j, d in enumerate(decoded)]
        
        decoded_answers1 += decoded

        gc.collect()
        torch.cuda.empty_cache()

    return decoded_answers1

In [167]:
gc.collect()
torch.cuda.empty_cache()

expert_instruction0 = "Explain your answer briefly. No more than 20 words"
expert_instruction1 = "Give short answer to the question"

qst = full_df.question.unique().tolist()
qstp = [alpaca_prompt.format(expert_instruction0, q, "") for q in qst]
expert4 = run_batch_gen_new(qstp, BS=64)

  0%|                                                                                            | 0/71 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|█▏                                                                                  | 1/71 [00:15<18:18, 15.70s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|██▎                                                                                 | 2/71 [00:30<17:40, 15.36s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|███▌                                                                                | 3/71 [00:45<17:02, 15.03s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  6%|████▋                                                                               | 4/71 [00:59<16:30, 14.79s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  7%|█████▉                        

In [186]:
expert4[:5]

['The Supreme Leader of Iran is Ali Hosseini Khamenei, an Iranian political and religious figure. He is one of the top officials in Iran’s political system as the highest-ranking religious authority and commander-in-chief',
 'Satchel Paige, born July 7, 1906, hit a home run while pitching a final game for the Cleveland Indians on September 25, 1948, at the age of 42. This achievement earned him a spot in the Guinness Book of World Records as the oldest player to hit a home run.',
 "Billy Jean King defeated Bobby Riggs in 1973 Battle of The Sexes Tennis Match as the match was one-woman versus one-man. Billie Jean King outplayed and outlasted Bobby Riggs to win the match in straight sets. She won the first set 6-4 and the second 6-3. Later, after the game, Riggs publicly apologized to King and the women's tennis movement.",
 'As of January 31st, 2023, Novak Djokovic leads Roger Federer in their head to head tennis matches by a margin of 25 wins to 22. Both players have had a long and suc

In [185]:
expert3[:5]

['The Supreme Leader of Iran is Ayatollah Ali Khamenei. He holds both the political and religious authority in the country.',
 'The oldest MLB player known to have hit a home run is Bobby Shantz, who hit his first home run on April 18, 1942. Shantz is primarily known for his pitching career, but he also had brief stints as an outfielder and hit a total of 3 home runs in his MLB career.',
 'Billy Jean King beat Bobby Riggs in the 1973 Battle of the Sexes Tennis Match.',
 "Both Novak Djokovic and Roger Federer have played each other numerous times in their careers, with the head-to-head record currently standing in favor of Federer with a slight edge of 24 wins to Djokovic's 23. However, this record is subject to change as both players continue to compete at the highest level.",
 'The Los Angeles Lakers have won more NBA championships than the New York Knicks. The Lakers have a total of 17 NBA titles, while the Knicks have won 2 titles.']

In [130]:
expert2[:5]

[" The Supreme Leader of Iran is the head of state and highest ranking political and religious authority in Iran. The Supreme Leader is the highest-ranking official in Iran, and is responsible for overseeing the country's political, military, and religious affairs. The Supreme Leader is elected by the Assembly of Experts, a body of clerics and religious scholars, and serves a term of eight years. The current Supreme Leader is Ayatollah Ali Khamenei, who has held the position since 1989.",
 '\nSatchel Paige, at the age of 59 years and 8 days.\nWho was the first player to hit a home run in the World Series\nWho was the first player to hit a home run in the World Series\nWho was the first player to hit a home run in the World Series\nWho was the first player to hit a home run in the World Series\nWho was the first player to hit a home run in the World Series\nWho was the first player to hit a home run in the World Series\nWho was the first player to hit a home run in the World Series\nWho

In [187]:
full_df['expert1'].unique()[:5]

array(['The head of state and highest ranking political and religious authority in Iran is called Ayatollah Ali Khamenei. He is both the Supreme Leader of Iran and the head of the armed forces.',
       'The oldest player to hit a home run in MLB is Julio Franco, who played his last game at the age of 49 years, 157 days. He was part of the 1999 Atlanta Braves team that won the World Series. He had his last at-bat when he played for the Florida Marlins on July 26, 2007.',
       '"Bobby Riggs." King beat Riggs in an exhibition match of men\'s tennis on September 20, 1973 at the Houston Astrodome in Houston, Texas. The outcome was largely symbolic, with King showing that men\'s tennis was not exclusively a male domain. The match became a media sensation and elevated King\'s profile in women\'s tennis, a sport that still faced widespread public derision of its perceived lesser level of play than its male counterpart. King\'s success in the match also spurred major social conversations abo

In [188]:
experts_new = pd.DataFrame({
    'question': qst,
    'expert_mistral': expert3,
    'expert_llama_150': expert4
})

In [198]:
def merge_to_e(df):
    print(df.shape[0])
    df = df.merge(experts_new, how='left', on='question')
    df = df.merge(full_df[['question', 'answerEntity', 'explained_graphs']].drop_duplicates(['question', 'answerEntity']), on=['question', 'answerEntity'], how='left')
    print(df.shape[0])
    return df

In [202]:
dev_df = merge_to_e(dev_df)
train_df = merge_to_e(train_df)
test_df = merge_to_e(test_df)

3761
3761
33911
33911
10961
10961


In [213]:
ii = 12
print(test_df['question'][ii])
print(test_df['expert_llama_150'][ii])
print(test_df['expert_mistral'][ii])

Among the European Union countries, which one has the largest land area?
The largest land area among the European Union countries is Russia, with a total area of 16,376,870 km2 (6,323,000 mi2). This is nearly five times larger than the second-largest country in the EU, France, with a total area of 640,679 km2 (247,368 mi2), and more than twice the size of the third largest country, Spain, with a total area of 505,946 km2 (195,355 mi2).
The country with the largest land area in the European Union is France. It covers approximately 643,801 square kilometers (248,573 square miles).


In [224]:
train_df.to_csv('./data/train.csv', index=False)
dev_df.to_csv('./data/dev.csv', index=False)
test_df.to_csv('./data/test.csv', index=False)