In [1]:
import os
from huggingface_hub import login
#token
login(token='token')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
!pip install -q datasets accelerate evaluate trl accelerate bitsandbytes peft

In [3]:
def create_prompt(question, answer):
    prompt = f"""Generator is an expert SPARQL query generator. For each question that the user supplies, the generator will convert it into a valid SPARQL query that can be used to answer the question. The query should be enclosed by three backticks on new lines, denoting that it is a code block.

Human: {question}
Generator: '''{answer}'''"""
    return prompt

In [4]:
import re 

def normalize_variables(query):
    if query is None:
        return ""
    variable_pattern = re.compile(r"\?\w+")
    variables = variable_pattern.findall(query)
    normalized_query = query
    for i, var in enumerate(variables):
        normalized_query = normalized_query.replace(var, f"?var{i}")
    return normalized_query

In [5]:
# This prefix map will be used to shrink the uri's down to the prefix level, to help the model better understand them and decrease mistakes.
prefix_map = {"http://www.opengis.net/ont/geosparql#" : "geo:",
               "http://www.opengis.net/def/function/geosparql/" : "geof:",
               "http://www.w3.org/1999/02/22-rdf-syntax-ns#" : "rdf:",
               "http://www.w3.org/2000/01/rdf-schema#" : "rdfs:",
               "http://www.w3.org/2001/XMLSchema#" : "xsd:",
               "http://yago-knowledge.org/resource/" : "yago:",
               "http://kr.di.uoa.gr/yago2geo/resource/" : "y2geor:",
               "http://kr.di.uoa.gr/yago2geo/ontology/" : "y2geoo:",
               "http://strdf.di.uoa.gr/ontology#" : "strdf:",
               "http://www.opengis.net/def/uom/OGC/1.0/" : "uom:",
               "http://www.w3.org/2002/07/owl#" : "owl:"}

In [6]:
from datasets import Dataset
import json

def flatten_dataset(original_dataset):
    flattened_data = []
    
    # Iterate through each numbered key in the dataset
    for key in original_dataset:
        if key.isdigit():  # Ensure we're only processing the numbered keys
            item = original_dataset[key]  # Access the single dictionary in the list
            # Create a new entry combining question and answer
            query = normalize_variables(item['Query'])
            # Shorten the uris down to prefixes.
            for uri_map, prefix in prefix_map.items():
                query = query.replace(uri_map, prefix)
            prompt = create_prompt(item['Question'], query)
            flattened_data.append(prompt)
    
    return flattened_data

with open('/kaggle/input/finetuning-no-rdfs/training_set_no_rdfs.json', 'r') as file:
    original_dataset = json.load(file)
    
# Assuming original_dataset is your original dictionary
flattened_texts = flatten_dataset(original_dataset)

# Create a Dataset object from the flattened data
dataset = Dataset.from_dict({"text": flattened_texts})

In [7]:
with open('/kaggle/input/finetuning-no-rdfs/validation_set_no_rdfs - Copy.json', 'r') as file:
    val_dataset = json.load(file)
    
# Assuming original_dataset is your original dictionary
val_flat = flatten_dataset(val_dataset)

# Create a Dataset object from the flattened data
val_set = Dataset.from_dict({"text": val_flat})

In [8]:
print(dataset[0])

{'text': "Generator is an expert SPARQL query generator. For each question that the user supplies, the generator will convert it into a valid SPARQL query that can be used to answer the question. The query should be enclosed by three backticks on new lines, denoting that it is a code block.\n\nHuman: What is the population of Central Greece?\nGenerator: '''SELECT DISTINCT ?var0 WHERE { <yago:Central_Greece_(region)> y2geoo:hasGAG_Population ?var0 }'''"}


In [9]:
print(val_set[4])

{'text': "Generator is an expert SPARQL query generator. For each question that the user supplies, the generator will convert it into a valid SPARQL query that can be used to answer the question. The query should be enclosed by three backticks on new lines, denoting that it is a code block.\n\nHuman: Is Ierapetra south of Athens?\nGenerator: '''ASK {     <yago:Ierapetra> geo:hasGeometry ?var0. ?var0 geo:asWKT ?var2.      <yago:geoentity_Dimos_Athens_8133876> geo:hasGeometry ?var3.     ?var3 geo:asWKT ?var5.   FILTER (strdf:below(?var2, ?var5)) }'''"}



## Step 2: Set up the model and tokenizer

In [10]:
!pip install peft



In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

def __get_mistral_tokenizer() -> AutoTokenizer:
    tokenizer = AutoTokenizer.from_pretrained("alpindale/Mistral-7B-v0.2-hf")
    tokenizer.pad_token = tokenizer.unk_token#"<PAD>"
    tokenizer.padding_side = 'right'
    return tokenizer 

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from peft import prepare_model_for_kbit_training

model_id = "alpindale/Mistral-7B-v0.2-hf"
tokenizer = __get_mistral_tokenizer()

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

tokenizer_config.json:   0%|          | 0.00/960 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

## Step 3: Set up PEFT (Parameter-Efficient Fine-Tuning)

In [16]:
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
from trl import SFTTrainer

# # ------------------------
# # ----- Quantization -----
# # ------------------------
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, 
#     bnb_4bit_use_double_quant=True, 
#     bnb_4bit_quant_type="nf4", 
#     bnb_4bit_compute_dtype=torch.bfloat16
# )
 
# # -----------------
# # ----- Model -----
# # -----------------
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL.model_id,
#     device_map="auto",
#     torch_dtype=torch.bfloat16,
#     quantization_config=bnb_config
# )
for param in model.parameters():
    param.requires_grad = False
    if param.ndim ==1:
        param.data = param.data.to(torch.float32)
    
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# compute_metrics(None, subset_test_dataset)
# compute_metrics(None, dataset['test'])

# ----------------
# ----- LoRA -----
# ----------------
peft_config = LoraConfig(
    lora_dropout=0.1,
    # target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    bias = 'none',
    # modules_to_save = ['lm_head', 'embed_tokens'],
    task_type="CAUSAL_LM"
)

peft_config.save_pretrained("/kaggle/working")

In [17]:
from huggingface_hub import HfApi

# Upload the adapter config to the model repo
api = HfApi()
api.upload_file(
    path_or_fileobj="/kaggle/working/adapter_config.json",
    path_in_repo="adapter_config.json",
    repo_id="Stratos-Kakalis/New_FT_Weights",
    repo_type="model",
)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Stratos-Kakalis/New_FT_Weights/commit/4e4b309cde515910a27c57fdc8fcd74295c8bb64', commit_message='Upload adapter_config.json with huggingface_hub', commit_description='', oid='4e4b309cde515910a27c57fdc8fcd74295c8bb64', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
from trl import SFTTrainer

# # ------------------------
# # ----- Quantization -----
# # ------------------------
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, 
#     bnb_4bit_use_double_quant=True, 
#     bnb_4bit_quant_type="nf4", 
#     bnb_4bit_compute_dtype=torch.bfloat16
# )
 
# # -----------------
# # ----- Model -----
# # -----------------
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL.model_id,
#     device_map="auto",
#     torch_dtype=torch.bfloat16,
#     quantization_config=bnb_config
# )
for param in model.parameters():
    param.requires_grad = False
    if param.ndim ==1:
        param.data = param.data.to(torch.float32)
    
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# compute_metrics(None, subset_test_dataset)
# compute_metrics(None, dataset['test'])

# ----------------
# ----- LoRA -----
# ----------------
peft_config = LoraConfig(
    lora_dropout=0.1,
    # target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    bias = 'none',
    # modules_to_save = ['lm_head', 'embed_tokens'],
    task_type="CAUSAL_LM"
)

peft_config.save_pretrained("path/to/save/adapter_config")
model = get_peft_model(model, peft_config)
 
# --------------------
# ----- Training -----
# --------------------
args = TrainingArguments(
    output_dir="/kaggle/working/",
    # Training length
    num_train_epochs=8,
    # Important for VRAM
    per_device_train_batch_size=8,          # batch size per device during training
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    # Other
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    bf16=True,                              # use bfloat16 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    # lr_scheduler_type="constant",           # use constant learning rate scheduler
    learning_rate=0.0002,
    # Logging
    logging_dir="/kaggle/working/logs/",
    logging_steps=10,
    # Evaluation
    evaluation_strategy="steps",            # evaluate every 'eval_steps'
    eval_steps=10,                         # evaluation step frequency
    load_best_model_at_end=True,            # load the best model at the end of training
    metric_for_best_model="eval_loss",        # metric to compare the best model
    greater_is_better=False
)

trainer = SFTTrainer(
    model=model,
    args=args,
#     train_dataset=dataset['train'],
    train_dataset=dataset,              ## ????????????
    dataset_text_field='text',
    eval_dataset=val_set,
#     dataset_text_field="input",
#     compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)

trainer.train()

trainer.evaluate()

# print("FULL EVALUATION")
# compute_metrics(None, dataset['test'])

# ---------------------------------
# ----- Upload to HuggingFace -----
# ---------------------------------
model.push_to_hub("norm_trunc_no_rdfs_8_epoch", private=True)
tokenizer.push_to_hub("norm_trunc_no_rdfs_8_epoch",private=True) 


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/754 [00:00<?, ? examples/s]

Map:   0%|          | 0/92 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
10,1.7396,1.559345
20,1.1395,0.718919
30,0.5299,0.432377
40,0.3999,0.362587
50,0.3522,0.328678
60,0.3013,0.313414
70,0.2876,0.303795
80,0.296,0.292512
90,0.2819,0.28396
100,0.2654,0.279635


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


adapter_model.safetensors:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Stratos-Kakalis/norm_trunc_no_rdfs_8_epoch/commit/a3baa8f92e6c97b4f34d2c2bb8d418e3181917ce', commit_message='Upload tokenizer', commit_description='', oid='a3baa8f92e6c97b4f34d2c2bb8d418e3181917ce', pr_url=None, pr_revision=None, pr_num=None)