### 1. Importation + Connexion HF et Wandb

In [None]:
!pip install -U transformers
!pip install -U datasets
!pip install -U peft
!pip install -U bitsandbytes

!pip install -U accelerate 
!pip install -U trl
!pip install -U wandb

In [None]:
import torch
import time
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

from datasets import Dataset, load_dataset
from datasets import load_dataset, load_metric
from transformers import pipeline, set_seed
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Connexion à Hugging Face et à Wandb
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
import wandb

!git config --global credential.helper store
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HF")

login(token=hf_token, add_to_git_credential=True)

wb_token = user_secrets.get_secret("wandb")

wandb.login(key=wb_token)

run = wandb.init(
    project='Fine-tune Summarization', 
    job_type="training", 
    anonymous="allow"
)

### 2. Importation du Dataset 

In [None]:
# Experiment configuration: base checkpoint, training dataset and name of the fine-tuned model.
# Local Kaggle input path of the base checkpoint (loaded by from_pretrained further down).
base_model = "/kaggle/input/llama-3/transformers/8b-hf/1"
# Dataset identifier handed to load_dataset.
dataset_name = "cnn_dailymail"
# Used as the trainer's output directory.
new_model = "llama-3-8b-summarize"

In [None]:
# Load the dataset; "3.0.0" is passed as the dataset configuration name.
dataset = load_dataset(dataset_name, "3.0.0")

### 3. Preprocessing 

In [None]:
def format_instruction(dialogue: str, summary: str):
    """Build the instruction-style training prompt for one example.

    Args:
        dialogue: Source text to summarize; surrounding whitespace is removed.
        summary: Reference summary placed under the "### Summary:" header.

    Returns:
        The assembled prompt with leading and trailing whitespace stripped.
    """
    prompt_lines = (
        "### Instruction:",
        "Summarize the following conversation.",
        "",
        "### Input:",
        dialogue.strip(),
        "",
        "### Summary:",
        summary,
        "",  # keeps the newline after the summary before the final strip
    )
    return "\n".join(prompt_lines).strip()

def generate_instruction_dataset(data_point):
    """Map one dataset row to the columns used for fine-tuning.

    Args:
        data_point: Row exposing the "article" and "highlights" keys.

    Returns:
        A dict with the original "article" and "highlights" plus a "text"
        entry holding the prompt built by format_instruction.
    """
    article = data_point["article"]
    highlights = data_point["highlights"]
    return {
        "article": article,
        "highlights": highlights,
        "text": format_instruction(article, highlights),
    }

def process_dataset(data: "Dataset"):
    """Add the prompt-formatted "text" column to a split and drop its "id" column.

    The annotation is a forward reference to the `Dataset` class. The previous
    annotation pointed at the module-level `dataset` variable (an instance, not
    a type), which made this definition fail with NameError unless the loading
    cell had already been executed.

    Args:
        data: A split exposing `map` and `remove_columns`.

    Returns:
        The mapped split without the "id" column.
    """
    with_prompt = data.map(generate_instruction_dataset)
    return with_prompt.remove_columns(['id'])

In [None]:
# Preprocess every split: add the prompt-formatted "text" column and drop "id".
dataset["train"] = process_dataset(dataset["train"])
# The test split must come from the test data; it was previously built from the
# validation split, which made test_data and validation_data identical.
dataset["test"] = process_dataset(dataset["test"])
dataset["validation"] = process_dataset(dataset["validation"])

# Keep small fixed-size subsets (first rows of each split) to bound training/eval time.
# A .shuffle(seed=42) can be added before .select to sample at random instead.
train_data = dataset['train'].select(range(1000))

test_data = dataset['test'].select(range(100))
validation_data = dataset['validation'].select(range(100))

train_data, test_data, validation_data

### 4. Modification de Llama3

In [None]:
torch_dtype = torch.float16 # dtype used below as the compute dtype of the 4-bit quantization config
attn_implementation = "eager" # attention backend passed to from_pretrained: "eager" is the plain PyTorch implementation (alternatives in transformers include "sdpa" and "flash_attention_2")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# QLoRA-style quantization settings (Quantized Low-Rank Adapter); required here because of the memory constraint.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load the base checkpoint with the quantization settings above; device_map="auto" delegates device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

In [None]:
# Load the tokenizer stored with the base checkpoint. No chat template is applied in this cell.
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Reuse the end-of-sequence token for padding (presumably the checkpoint defines no pad token — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# For a dataset with user/assistant turns, use instead: model, tokenizer = setup_chat_format(model, tokenizer)

### 5. Training

In [None]:
from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing on the model to lower activation memory during training.
model.gradient_checkpointing_enable()
# PEFT helper that prepares a quantized model for training; the returned model replaces `model`.
model = prepare_model_for_kbit_training(model)
# Print the module tree for inspection (the output is long for a model of this size).
print(model)

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA configuration; parameters chosen to keep training time down.
peft_config = LoraConfig(
    r=16,
    lora_alpha=64,# or 32; the best value still has to be determined
    lora_dropout=0.1, # value still to be validated
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']
)
# Wrap the model with the LoRA adapters on the attention projections listed above.
model = get_peft_model(model, peft_config)

In [None]:
from transformers import TrainingArguments

import os
# NOTE(review): the model has already been loaded onto the GPU by the time this cell runs;
# allocator settings are normally read when CUDA memory is first allocated, so this line
# likely has no effect here. Consider moving it to the first cell, before any CUDA work.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

# Hyper-parameters for the supervised fine-tuning run.
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,  # effective batch size of 2 per device
    optim="adamw_torch", # alternative: optim="paged_adamw_32bit"
    logging_steps=1,
    learning_rate=1e-4,
    fp16=False,
    max_grad_norm=0.3,
    num_train_epochs=2,
    # `eval_strategy` is the current name of this option; the former
    # `evaluation_strategy` spelling is deprecated in recent transformers releases,
    # which is what the unpinned `pip install -U transformers` above installs.
    eval_strategy="steps",
    eval_steps=0.2,  # a float below 1 is interpreted as a fraction of the total training steps
    warmup_ratio=0.05,
    save_strategy="epoch",
    group_by_length=True,
    report_to="wandb",
    save_safetensors=True,
    lr_scheduler_type="cosine",
)
# Disable the key/value cache while training (silences the warnings); re-enable it for inference.
model.config.use_cache = False

In [None]:
from trl import SFTTrainer

# Supervised fine-tuning trainer: trains on the "text" column of train_data and
# evaluates on validation_data with the arguments defined in training_arguments.
# NOTE(review): `model` was already wrapped by get_peft_model in an earlier cell and
# peft_config is passed again here — confirm the installed TRL release does not wrap twice.
# NOTE(review): dataset_text_field / max_seq_length / tokenizer as direct SFTTrainer
# keyword arguments depend on the TRL version — verify against the installed release.
# NOTE(review): with max_seq_length=1024, long articles may presumably be cut before the
# "### Summary:" section is reached — confirm how over-long examples are handled.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=validation_data,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_arguments,
)

In [None]:
# Launch fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# Close the Weights & Biases run opened during login.
wandb.finish()
# Training is over: turn the key/value cache back on for generation.
model.config.use_cache = True

### 6. Inférence sur le modèle non fusionné avec Llama 3

In [None]:
# Re-enable the key/value cache and switch the model to evaluation mode for inference.
from transformers import TextStreamer  # NOTE(review): not used in the cells visible here — confirm it is still needed
model.config.use_cache = True
model.eval()

In [None]:
import os
from kaggle_secrets import UserSecretsClient

# SECURITY: a Hugging Face access token was hardcoded in this cell. Any token that has been
# committed or shared in a notebook must be considered leaked and revoked in the Hub settings.
# The value is now read from the Kaggle secrets store instead.
# NOTE(review): assumes the "HF" secret holds the token intended for TOKEN — confirm.
os.environ["TOKEN"] = UserSecretsClient().get_secret("HF")

In [None]:
index = 51

# Cap the article at 10,000 characters to bound the prompt size.
dialogue = train_data['article'][index][:10000]
summary = train_data['highlights'][index]

# NOTE(review): the training prompt built by format_instruction starts with a
# "### Instruction:" header that is absent here — confirm whether the mismatch is intended.
prompt = f"""
Summarize the following conversation.

### Input:
{dialogue}

### Summary:
"""

# Generate with `model`, the LoRA-wrapped model that was just trained. The previous code
# called `trained_model`, which is only created in a later cell and therefore raised
# NameError when the notebook was run top to bottom.
encoded = tokenizer(prompt, return_tensors='pt', truncation=True).to(model.device)
input_ids = encoded["input_ids"]
outputs = model.generate(
    input_ids=input_ids,
    attention_mask=encoded["attention_mask"],
    max_new_tokens=200,
)
# Keep only the newly generated tokens. Slicing the decoded string by len(prompt) is
# unreliable because decoding does not have to reproduce the prompt character for character.
output = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)

dash_line = '-' * 99
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'TRAINED MODEL GENERATED TEXT :\n{output}')

### 7. Enregistrement du modèle finetune

In [None]:
# Directory that receives the fine-tuned model files and the tokenizer.
peft_model_path="./peft-dialogue-summary"

# Save the trainer's model and the tokenizer side by side so both can be reloaded from one path.
trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

### 8. Zero Shot

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, BitsAndBytesConfig

peft_model_dir = "peft-dialogue-summary"

# Reload the saved adapter together with its base model, plus the saved tokenizer.
# The quantization settings mirror the ones used for training (NF4, fp16 compute,
# double quantization). The bare `load_in_4bit=True` keyword is deprecated and falls back
# to the default 4-bit settings, which differ from the training configuration.
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_dir,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    ),
)
tokenizer = AutoTokenizer.from_pretrained(peft_model_dir)

In [None]:
index = 2

dialogue = test_data['article'][index]
summary = test_data['highlights'][index]

prompt = f"""
Summarize the following conversation.

### Input:
{dialogue}

### Summary:
"""

# Move the encoded prompt to the model's device; the tensors were previously left on the
# CPU while the model lives on the GPU.
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
# At this point `model` carries the fine-tuned LoRA adapter, so generating with it directly
# is not a zero-shot baseline. Disabling the adapter for the duration of the call yields
# the behaviour of the underlying base model.
# NOTE(review): if the fine-tuned output was intended here, drop the context manager and
# rename the label printed below.
with model.disable_adapter():
    generated = model.generate(**inputs, max_new_tokens=100)
# Decode only the continuation; the prompt is printed separately below.
prompt_length = inputs["input_ids"].shape[1]
output = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)

dash_line = '-' * 99
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')