In [None]:
import os
from huggingface_hub import login
# Read the Hugging Face access token from the environment instead of writing it
# into the notebook, so the secret never ends up in a saved or shared copy.
# When HF_TOKEN is unset, token=None lets login() ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [2]:
!pip install -q datasets accelerate evaluate trl accelerate bitsandbytes peft

In [3]:
def create_prompt(question, answer):
    """Build one training example: instruction preamble, question and query.

    Args:
        question: Natural-language question, inserted after ``Human:``.
        answer: SPARQL query text, inserted after ``Generator:`` between
            ``'''`` delimiters.

    Returns:
        The complete prompt as a single string.
    """
    # NOTE(review): the preamble promises "three backticks on new lines", yet the
    # template wraps the query in three single quotes on the same line. Left as is
    # because changing it alters the training text -- confirm which one is intended.
    preamble = (
        "Generator is an expert SPARQL query generator. "
        "For each question that the user supplies, the generator will convert it into a valid SPARQL query that can be used to answer the question. "
        "The query should be enclosed by three backticks on new lines, denoting that it is a code block."
    )
    return f"{preamble}\n\nHuman: {question}\nGenerator: '''{answer}'''"

In [4]:
import re 

# Matches a SPARQL variable token: a question mark followed by word characters.
_SPARQL_VARIABLE = re.compile(r"\?\w+")


def normalize_variables(query):
    """Rename every SPARQL variable in ``query`` to a canonical ``?var<N>`` name.

    Variables are numbered 0, 1, 2, ... in order of first appearance, and every
    occurrence of the same variable receives the same new name.

    Args:
        query: SPARQL query text, or ``None``.

    Returns:
        The query with its variables renamed; ``""`` when ``query`` is ``None``.
    """
    if query is None:
        return ""

    renamed = {}

    def canonical_name(match):
        # setdefault hands out the next free index only the first time a variable
        # is seen; later occurrences reuse the stored name.
        return renamed.setdefault(match.group(0), f"?var{len(renamed)}")

    # One regex pass replaces whole variable tokens. Chained str.replace calls
    # would also rewrite longer names sharing a prefix (?a inside ?ab) and could
    # merge variables that are already called ?var<N>.
    return _SPARQL_VARIABLE.sub(canonical_name, query)

In [5]:
# Maps each full namespace URI to its short prefix. The URIs found in the queries
# are replaced by these prefixes to help the model read them and make fewer mistakes.
prefix_map = {
    "http://www.opengis.net/ont/geosparql#": "geo:",
    "http://www.opengis.net/def/function/geosparql/": "geof:",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf:",
    "http://www.w3.org/2000/01/rdf-schema#": "rdfs:",
    "http://www.w3.org/2001/XMLSchema#": "xsd:",
    "http://yago-knowledge.org/resource/": "yago:",
    "http://kr.di.uoa.gr/yago2geo/resource/": "y2geor:",
    "http://kr.di.uoa.gr/yago2geo/ontology/": "y2geoo:",
    "http://strdf.di.uoa.gr/ontology#": "strdf:",
    "http://www.opengis.net/def/uom/OGC/1.0/": "uom:",
    "http://www.w3.org/2002/07/owl#": "owl:",
}

In [6]:
from datasets import Dataset
import json

def flatten_dataset(original_dataset):
    """Turn the raw question/query mapping into a list of training prompts.

    Only entries whose key consists solely of digits are used. For each of them
    the query's variables are normalized, the full URIs listed in ``prefix_map``
    are replaced by their prefixes, and the result is formatted with
    ``create_prompt``.

    Args:
        original_dataset: Mapping from string keys to records that provide
            ``'Question'`` and ``'Query'``.

    Returns:
        A list of prompt strings, in the iteration order of ``original_dataset``.
    """
    prompts = []
    for entry_id, record in original_dataset.items():
        if not entry_id.isdigit():
            # Anything that is not a numbered example is skipped.
            continue
        sparql = normalize_variables(record['Query'])
        # Shrink every known namespace URI down to its prefix.
        for full_uri, short_prefix in prefix_map.items():
            sparql = sparql.replace(full_uri, short_prefix)
        prompts.append(create_prompt(record['Question'], sparql))
    return prompts

# Read the training examples. The encoding is stated explicitly so that non-ASCII
# characters are decoded the same way whatever the platform default is.
with open('/kaggle/input/finetuning-no-rdfs/training_set_no_rdfs.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# Convert every numbered entry of the loaded dictionary into one prompt string.
flattened_texts = flatten_dataset(original_dataset)

# Wrap the prompts in a Dataset with a single "text" column.
dataset = Dataset.from_dict({"text": flattened_texts})

In [7]:
# Read the validation examples. The encoding is stated explicitly so that non-ASCII
# characters are decoded the same way whatever the platform default is.
with open('/kaggle/input/finetuning-no-rdfs/validation_set_no_rdfs - Copy.json', 'r', encoding='utf-8') as file:
    val_dataset = json.load(file)

# Convert every numbered entry of the validation dictionary into one prompt string.
val_flat = flatten_dataset(val_dataset)

# Wrap the prompts in a Dataset with a single "text" column.
val_set = Dataset.from_dict({"text": val_flat})

In [None]:
# Sanity check: show the first training example after formatting.
print(dataset[0])

In [None]:
# Sanity check: show the validation example at index 4 after formatting.
print(val_set[4])


## Step 2: Set up the model and tokenizer

In [None]:
!pip install peft

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

def __get_mistral_tokenizer(model_id: str = "alpindale/Mistral-7B-v0.2-hf") -> AutoTokenizer:
    """Load the tokenizer for ``model_id`` and configure its padding.

    Args:
        model_id: Model identifier handed to ``AutoTokenizer.from_pretrained``.
            The default is the checkpoint that used to be hard-coded here, so
            calls without arguments behave as before.

    Returns:
        The tokenizer, with the unknown token reused as the pad token and
        padding applied on the right.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # The existing unknown token doubles as the pad token; the "<PAD>" string
    # alternative was left disabled by the original author.
    tokenizer.pad_token = tokenizer.unk_token
    tokenizer.padding_side = 'right'
    return tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from peft import prepare_model_for_kbit_training

# Checkpoint that is loaded in quantized form below.
model_id = "alpindale/Mistral-7B-v0.2-hf"
# The tokenizer helper is called without arguments, so it uses its own built-in
# checkpoint id; keep that id in sync with model_id.
tokenizer = __get_mistral_tokenizer()

# 4-bit NF4 quantization with double quantization enabled; bfloat16 is the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
# Gradient checkpointing is switched on before the model is handed to PEFT's
# k-bit preparation helper.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

## Step 3: Set up PEFT (Parameter-Efficient Fine-Tuning)

In [16]:
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
from trl import SFTTrainer

# # ------------------------
# # ----- Quantization -----
# # ------------------------
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, 
#     bnb_4bit_use_double_quant=True, 
#     bnb_4bit_quant_type="nf4", 
#     bnb_4bit_compute_dtype=torch.bfloat16
# )
 
# # -----------------
# # ----- Model -----
# # -----------------
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL.model_id,
#     device_map="auto",
#     torch_dtype=torch.bfloat16,
#     quantization_config=bnb_config
# )
# Freeze every parameter of the loaded model.
for param in model.parameters():
    param.requires_grad = False
    if param.ndim ==1:
        # One-dimensional parameters are upcast to float32.
        # presumably these are norm weights kept in full precision for stability -- TODO confirm
        param.data = param.data.to(torch.float32)
    
# Gradient checkpointing is enabled here as well (it is also enabled right after the model is loaded).
model.gradient_checkpointing_enable()
# NOTE(review): presumably required so gradients reach the adapter layers while the
# input embeddings are frozen and checkpointing is on -- confirm.
model.enable_input_require_grads()

# compute_metrics(None, subset_test_dataset)
# compute_metrics(None, dataset['test'])

# ----------------
# ----- LoRA -----
# ----------------
# Only dropout, bias handling and task type are set explicitly; the explicit
# target_modules / modules_to_save lists are disabled, so every other setting
# falls back to the library defaults.
peft_config = LoraConfig(
    lora_dropout=0.1,
    # target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    bias = 'none',
    # modules_to_save = ['lm_head', 'embed_tokens'],
    task_type="CAUSAL_LM"
)

# Write the adapter configuration into the working directory; the upload cell
# sends /kaggle/working/adapter_config.json from there.
peft_config.save_pretrained("/kaggle/working")

In [None]:
from huggingface_hub import HfApi

# Upload the adapter config from the working directory to the model repo on the Hub.
api = HfApi()
api.upload_file(
    path_or_fileobj="/kaggle/working/adapter_config.json",  # local file to send
    path_in_repo="adapter_config.json",                     # destination path inside the repo
    repo_id="Stratos-Kakalis/New_FT_Weights",
    repo_type="model",
)

In [None]:
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
from trl import SFTTrainer

# # ------------------------
# # ----- Quantization -----
# # ------------------------
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, 
#     bnb_4bit_use_double_quant=True, 
#     bnb_4bit_quant_type="nf4", 
#     bnb_4bit_compute_dtype=torch.bfloat16
# )
 
# # -----------------
# # ----- Model -----
# # -----------------
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL.model_id,
#     device_map="auto",
#     torch_dtype=torch.bfloat16,
#     quantization_config=bnb_config
# )
# Freeze every parameter of the loaded model. This repeats the freezing loop of the
# earlier LoRA-configuration cell, so running both cells applies it twice.
for param in model.parameters():
    param.requires_grad = False
    if param.ndim ==1:
        # One-dimensional parameters are upcast to float32.
        # presumably these are norm weights kept in full precision for stability -- TODO confirm
        param.data = param.data.to(torch.float32)
    
# Gradient checkpointing is enabled here as well (it is also enabled right after the model is loaded).
model.gradient_checkpointing_enable()
# NOTE(review): presumably required so gradients reach the adapter layers while the
# input embeddings are frozen and checkpointing is on -- confirm.
model.enable_input_require_grads()

# compute_metrics(None, subset_test_dataset)
# compute_metrics(None, dataset['test'])

# ----------------
# ----- LoRA -----
# ----------------
# Only dropout, bias handling and task type are set explicitly; the explicit
# target_modules / modules_to_save lists are disabled, so every other setting
# falls back to the library defaults.
peft_config = LoraConfig(
    lora_dropout=0.1,
    # target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    bias = 'none',
    # modules_to_save = ['lm_head', 'embed_tokens'],
    task_type="CAUSAL_LM"
)

# Save the adapter configuration into the working directory. This call used to
# write into the leftover placeholder directory "path/to/save/adapter_config",
# unlike the rest of the notebook, which keeps its artifacts in /kaggle/working.
peft_config.save_pretrained("/kaggle/working")
# Wrap the quantized base model with the LoRA adapter described by peft_config.
model = get_peft_model(model, peft_config)
 
# --------------------
# ----- Training -----
# --------------------
args = TrainingArguments(
    output_dir="/kaggle/working/",
    # Training length
    num_train_epochs=8,
    # Important for VRAM
    per_device_train_batch_size=8,          # batch size per device during training
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    # Other
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    bf16=True,                              # use bfloat16 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    # lr_scheduler_type="constant",           # use constant learning rate scheduler
    learning_rate=0.0002,
    # Logging
    logging_dir="/kaggle/working/logs/",
    logging_steps=10,
    # Evaluation
    # NOTE(review): newer transformers releases name this argument `eval_strategy`;
    # rename it if the installed version rejects the keyword.
    evaluation_strategy="steps",            # evaluate every 'eval_steps'
    eval_steps=10,                          # evaluation step frequency
    # Checkpointing
    # The best model can only be restored from a checkpoint that was written to
    # disk. With the default save interval, few (or no) evaluation points are
    # checkpointed, so saving is aligned with evaluation here.
    save_strategy="steps",                  # must match the evaluation strategy
    save_steps=10,                          # checkpoint at every evaluation step
    save_total_limit=2,                     # bound disk usage; the best checkpoint is always retained
    load_best_model_at_end=True,            # load the best model at the end of training
    metric_for_best_model="eval_loss",      # metric to compare the best model
    greater_is_better=False                 # a lower eval_loss is better
)

# Supervised fine-tuning on the "text" column of the training set; the validation
# set is passed as the evaluation dataset.
trainer = SFTTrainer(
    model=model,
    args=args,
    # `dataset` is built as one Dataset without named splits, so it is passed
    # whole; the split-indexed variant below is left disabled.
#     train_dataset=dataset['train'],
    train_dataset=dataset,
    dataset_text_field='text',
    eval_dataset=val_set,
    # Disabled alternatives kept from earlier experiments:
#     dataset_text_field="input",
#     compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    # NOTE(review): create_prompt inserts no BOS/EOS markers, so with
    # add_special_tokens=False the examples appear to be tokenized without them --
    # confirm this is intended.
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)

trainer.train()

# Keep and show the final validation metrics. A bare `trainer.evaluate()` in the
# middle of a cell is not displayed, so its result used to be thrown away.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# print("FULL EVALUATION")
# compute_metrics(None, dataset['test'])

# ---------------------------------
# ----- Upload to HuggingFace -----
# ---------------------------------
# Model and tokenizer are pushed to the same private repository.
hub_repo_id = "norm_trunc_no_rdfs_8_epoch"
model.push_to_hub(hub_repo_id, private=True)
tokenizer.push_to_hub(hub_repo_id, private=True)