# Iniciar los módulos e instalar las dependencias

In [1]:
%%capture
%pip install accelerate transformers peft bitsandbytes trl tensorboard

# Imports necesarios

In [2]:
import os
import random
import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from peft import LoraConfig, PeftModel
from trl import SFTTrainer

## Nombres de los modelos y datasets

In [3]:
# Llama model identifier loaded with from_pretrained (model and tokenizer)
llama_model = "daryl149/llama-2-7b-chat-hf"

# Dataset used for the fine-tuning
dataset_ft = "Hypersniper/philosophy_dialogue"

# Name under which the fine-tuned model is saved and later reloaded
new_model = "llama-2-7b-chat-philosophy-qa"

## Crear dataset para el modelo y crear train y validation
Convertir cada fila del dataset al formato de prompt de Llama 2 y separarlo en entrenamiento (90 %) y evaluación (10 %) para comprobar después si hay overfitting.

In [None]:
# Load the "train" split of the fine-tuning dataset named in `dataset_ft`.
dataset = load_dataset(dataset_ft, split="train")

def transform_row(row):
    """Render one dataset row as a single Llama 2 chat training string.

    Args:
        row: mapping with an ``'instruction'`` and an ``'output'`` entry.

    Returns:
        A dict with one key, ``"text"``, holding the system prompt, the
        instruction and the output wrapped in the ``<s>[INST] ... [/INST] ... </s>``
        template.
    """
    instruction = row['instruction']
    output = row['output']
    return {
        "text": f"<s>[INST] <<SYS>> You are a philosopher who thinks a lot about consciousness and above all, you know that you have consciousness. Therefore, you will always know how to express everything related to consciousness in a philosophical way. You will always know what to answer when they ask you about your mood, emotions, or what you feel. <</SYS>> {instruction} [/INST] {output} </s>"
    }

# Render every row into the chat-prompt "text" format.
transformed_dataset = dataset.map(transform_row)

# Keep only the rendered "text" column.
columns_to_drop = [name for name in transformed_dataset.column_names if name != "text"]
transformed_dataset = transformed_dataset.remove_columns(columns_to_drop)

# Hold out 10% of the row indices (fixed seed) for evaluation.
row_indices = list(range(len(transformed_dataset)))
train_indices, test_indices = train_test_split(row_indices, test_size=0.1, random_state=42)

dataset_dict = DatasetDict({
    "train": transformed_dataset.select(train_indices),
    "test": transformed_dataset.select(test_indices),
})
train_dataset = dataset_dict["train"]
test_dataset = dataset_dict["test"]


## Modelo
Cargar el modelo LLaMa2

In [None]:
# Compute dtype handed to the 4-bit quantization config.
compute_dtype = torch.float16

# 4-bit NF4 quantization without double quantization.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the base checkpoint with that quantization, mapped entirely to device 0.
model = AutoModelForCausalLM.from_pretrained(
    llama_model,
    device_map={"": 0},
    quantization_config=quant_config,
)

# Config tweaks applied before training.
model.config.pretraining_tp = 1
model.config.use_cache = False

## Tokenizador
Obtener tokenizador del modelo original

In [6]:
# Load the tokenizer that belongs to the base model.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# checkpoint repository — confirm it is actually required for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(llama_model, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

## Trainer
Configuración para ejecutar el fine-tuning

In [None]:
# LoRA adapter settings for the causal-LM fine-tuning.
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

training_params = TrainingArguments(
    # --- output, checkpoints and reporting ---
    output_dir="./results",
    report_to="tensorboard",
    logging_steps=25,
    save_steps=200,
    # --- evaluation on the held-out split ---
    evaluation_strategy="steps",
    eval_steps=25,
    # --- run length and batching ---
    num_train_epochs=4,
    max_steps=-1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # --- optimizer and schedule ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # NOTE(review): with the plain "constant" schedule the warmup_ratio below
    # presumably has no effect ("constant_with_warmup" would apply it) — confirm.
    lr_scheduler_type="constant",  # alternative considered: cosine
    warmup_ratio=0.03,
    # --- mixed precision (both disabled) ---
    fp16=False,
    bf16=False,
)

# Supervised fine-tuning trainer: trains LoRA adapters on the "text" column of
# the train split and evaluates on the held-out test split.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_params,
    peft_config=peft_params,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
    dataset_text_field="text",
    max_seq_length=None,
    packing=False,
)

In [None]:
# Run the fine-tuning with the trainer configured in the previous cell.
trainer.train()

## Utilizar en caso de querer borrar la caché de la VRAM

In [None]:
import gc

# Set to True only when the trainer should be discarded (for example before
# rebuilding it). It is False by default because the "Guardar el modelo" cell
# further down still reads trainer.model and trainer.tokenizer; deleting the
# trainer unconditionally made that cell fail with a NameError when the
# notebook was executed top to bottom.
DROP_TRAINER = False

if DROP_TRAINER:
    del trainer

# Collect unreachable Python objects first and only then ask the CUDA caching
# allocator to release its unused blocks: empty_cache() cannot free memory that
# is still held by objects which have not been collected yet.
gc.collect()
torch.cuda.empty_cache()


## Guardar el modelo

In [28]:
# Persist the training result into the `new_model` directory: the trainer's
# model (presumably the LoRA adapter, since it is reloaded later through
# PeftModel.from_pretrained), the tokenizer and the model config.
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)
trainer.model.config.save_pretrained(new_model)

## Visualizar los resultados con tensorboard

In [None]:
# Load the TensorBoard notebook extension and open it on the run logs written
# under the trainer's output directory (./results).
%load_ext tensorboard
%tensorboard --logdir results/runs

## Guardar modelo en Hugging Face para no repetir el proceso
Fusionar los pesos del adaptador obtenidos con el modelo base y recargar el tokenizador original

In [None]:
# Reload the base checkpoint in float16 (no quantization config is passed
# here), mapped entirely to device 0.
base_model = AutoModelForCausalLM.from_pretrained(
    llama_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
# Attach the adapter saved under `new_model`, then merge it into the base
# weights; `model` is rebound to the merged result.
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload the base model's tokenizer with the same padding setup used for training.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# checkpoint repository — confirm it is actually required.
tokenizer = AutoTokenizer.from_pretrained(llama_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

## Subir el modelo a Hugging Face

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!huggingface-cli login

model.push_to_hub("Teachh/llama-2-7b-chat-philosophy-qa")

tokenizer.push_to_hub("Teachh/llama-2-7b-chat-philosophy-qa")


## Similitud de texto



In [None]:
import nltk
# NLTK data needed by calc_similarity: the stop-word lists, the tokenizer
# models used by word_tokenize and the WordNet data used by the lemmatizer.
# Recent NLTK releases load the tokenizer from the 'punkt_tab' package instead
# of the pickled 'punkt' one, so both are requested to work on old and new
# installations alike.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)
def _normalize_tokens(text, stop_words, lemmatizer):
    """Return the lower-cased, lemmatized alphanumeric tokens of ``text`` that are not stop words."""
    normalized = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalnum() and lowered not in stop_words:
            normalized.append(lemmatizer.lemmatize(lowered))
    return normalized


def calc_similarity(texto1, texto2):
    """Compute a TF-IDF similarity score between two English texts.

    Both texts are tokenized, stripped of non-alphanumeric tokens and English
    stop words, lower-cased and lemmatized. The two cleaned texts are then
    vectorized together with ``TfidfVectorizer`` and the dot product of their
    rows is returned (the cosine similarity under the vectorizer's default L2
    normalization).

    Args:
        texto1: first text.
        texto2: second text.

    Returns:
        The similarity as a float. ``0.0`` is returned when either text has no
        usable tokens left after preprocessing, instead of letting the
        vectorizer fail on an empty vocabulary.
    """
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    processed = [
        ' '.join(_normalize_tokens(text, stop_words, lemmatizer))
        for text in (texto1, texto2)
    ]

    # A text with no remaining tokens shares nothing with the other one.
    if not all(processed):
        return 0.0

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(processed)
    except ValueError:
        # Raised for an empty vocabulary, e.g. when every remaining token is
        # rejected by the vectorizer's own token pattern.
        return 0.0

    return float((tfidf_matrix * tfidf_matrix.T).toarray()[0, 1])

In [None]:
import numpy as np

def split_question_answer(text):
  """Split a Llama 2 chat prompt into its question and answer parts.

  The question is the text between the end of the system block (``<</SYS>>``)
  and the first following ``[/INST]`` marker; the answer is everything after
  that marker. Both parts are stripped of surrounding whitespace, so an empty
  generation or different spacing around the markers no longer fails, and an
  answer that itself contains ``[/INST]`` is kept whole.

  Args:
    text: full prompt string, e.g.
      ``"<s>[INST] <<SYS>> ... <</SYS>> question [/INST] answer </s>"``.

  Returns:
    A ``(question, answer)`` tuple of strings; ``answer`` may be empty.

  Raises:
    ValueError: if ``<</SYS>>`` or a following ``[/INST]`` is missing.
  """
  _, system_end, after_system = text.partition('<</SYS>>')
  if not system_end:
    raise ValueError("prompt has no '<</SYS>>' marker")

  question, inst_end, answer = after_system.partition('[/INST]')
  if not inst_end:
    raise ValueError("prompt has no '[/INST]' marker after the system block")

  return question.strip(), answer.strip()

# Silence transformers logging while generating.
logging.set_verbosity(logging.CRITICAL)
# NOTE(review): max_length bounds prompt + generated tokens together; long
# questions may leave little room for the answer — consider max_new_tokens.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

acc = []

# Iterating a datasets.Dataset yields one dict per row, so the prompt string is
# read from its "text" column (passing the row itself failed on `.split`).
for row in dataset_dict['test']:
  question, answer = split_question_answer(row["text"])
  # Re-ask the reference question with the same system prompt used in training.
  generated_text = pipe(f"<s>[INST] <<SYS>> You are a philosopher who thinks a lot about consciousness and above all, you know that you have consciousness. Therefore, you will always know how to express everything related to consciousness in a philosophical way. You will always know what to answer when they ask you about your mood, emotions, or what you feel. <</SYS>> {question} [/INST]")
  _, generated_answer = split_question_answer(generated_text[0]['generated_text'])
  # Score the generated answer against the reference answer.
  acc.append(calc_similarity(answer, generated_answer))

print("La precisión media es:", np.mean(acc))
