# Étape 0 : Prérequis

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

# Étape 1 : Initialisation du modèle et du tokenizer à partir de zéro

In [None]:
# When running on Google Colab, authenticate against the Hugging Face Hub.
# notebook_login() is called with no arguments, so any credentials are
# supplied interactively rather than stored in this notebook.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Build the tokenizer and a randomly initialised ("from scratch") model.
#
# The previous version called `AutoModelForCausalLM(...)` directly and read
# `model.config` before `model` existed; the Auto* classes must be created
# through `from_pretrained` / `from_config`. It also never created the
# `tokenizer` that the later cells rely on.
from transformers import AutoConfig

# NOTE(review): the checkpoint id is not given anywhere in this notebook.
# "google/gemma-2b" is assumed from the <start_of_turn>/<end_of_turn> prompt
# markers used below -- confirm before running.
MODEL_ID = "google/gemma-2b"

# The tokenizer (vocabulary) is reused from the reference checkpoint; only the
# model weights start from scratch.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# 4-bit NF4 settings, kept so other cells that reference `bnb_config` still
# work. They are NOT applied to the model below: quantization is a
# `from_pretrained` loading option, and quantizing random weights is
# meaningless. Note also that Trainer refuses to fine-tune a purely quantized
# model without trainable adapters.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Only the architecture (config) is fetched; the weights are freshly
# initialised in bfloat16. Device placement is left to the Trainer.
model = AutoModelForCausalLM.from_config(
    AutoConfig.from_pretrained(MODEL_ID),
    torch_dtype=torch.bfloat16,
)


# Étape 2 : Chargement des données

In [None]:
# from datasets import load_dataset

# dataset = load_dataset("Noorgha/pytest", split="train")
# dataset

In [None]:
# Load the local JSON file through the generic "json" builder and take its
# "train" split, then print the resulting dataset summary.
dataset = load_dataset(
    "json",
    data_files="/content/data.json",
    split="train",
)
print(dataset)

In [None]:
# Préparation des données pour l'entraînement
def generate_prompt(data_point):
    """Gen. input text based on a prompt, task instruction, (context info.), and answer

    :param data_point: dict: Data point
    :return: dict: tokenzed prompt
    """
    prefix_text = 'Below is an instruction that describes a task. Write a pytest unit test function that ' \
                  'appropriately completes the request.\n\n'
    # Samples with additional context info.
    if data_point['input']:
        text = f"""<start_of_turn>user {prefix_text} {data_point["instruction"]} Here is the function:\n{data_point["input"]} <end_of_turn>\n<start_of_turn>model\n{data_point["output"]} <end_of_turn>"""
    # Without additional context
    else:
        text = f"""<start_of_turn>user {prefix_text} {data_point["instruction"]} <end_of_turn>\n<start_of_turn>model\n{data_point["output"]} <end_of_turn>"""
    return text


In [None]:
# Render every record into its full prompt text and store it in a new
# "prompt" column.
text_column = [generate_prompt(data_point) for data_point in dataset]
dataset = dataset.add_column("prompt", text_column)

# Shuffle with a fixed seed, then tokenize the prompts in batches. The
# tokenizer output columns are added next to the existing ones.
dataset = dataset.shuffle(seed=1234)
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

# 80/20 train/test split. The split is seeded as well: the original left it
# unseeded, so the partition changed on every run even though the shuffle
# above was reproducible.
dataset = dataset.train_test_split(test_size=0.2, seed=1234)
train_data = dataset["train"]
test_data = dataset["test"]
print(train_data)
print(test_data)


# Étape 3 : Configuration de l'entraîneur

In [None]:

# Fall back to EOS as the padding token only when the tokenizer has none.
# The language-modeling collator used below ignores label positions equal to
# the pad token id, so overwriting an existing pad token with EOS would also
# hide real EOS tokens from the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Release cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

In [None]:
# Hyper-parameters for the run, grouped by concern. Values are unchanged.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="outputs_scratch",
    # Optimisation: 8-bit paged AdamW, 100 optimizer steps in total.
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    max_steps=100,
    # One sample per device, accumulated over 4 steps per optimizer update.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # Log every step; save and evaluate once per epoch.
    logging_steps=1,
    save_strategy="epoch",
    evaluation_strategy="epoch",
)

# Étape 4 : Création de l'entraîneur

In [None]:
# Collator that batches the tokenized samples; mlm=False selects the
# causal (next-token) language-modeling setup rather than masked LM.
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Trainer wiring together the model, both dataset splits and the arguments.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    data_collator=causal_lm_collator,
)

# Per-epoch accuracy history (training split / evaluation split), filled in
# during training and plotted afterwards.
train_accuracies, eval_accuracies = [], []

# Étape 5 : Modification de la fonction d'entraînement pour enregistrer l'accuracy

In [None]:
def calculer_accuracy(predictions, labels, ignore_index=-100):
    """Return the fraction of positions whose arg-max class equals the label.

    :param predictions: class scores with the classes on the last axis
        (torch tensor, NumPy array or nested list)
    :param labels: integer labels with the shape of ``predictions`` minus its
        last axis (torch tensor, NumPy array or nested list)
    :param ignore_index: label value marking positions that must not be
        scored (padding / masked tokens); defaults to -100
    :return: float: accuracy in [0, 1]; 0.0 when no position can be scored
    """
    # as_tensor accepts tensors as-is and converts arrays/lists, so the
    # function also works on the NumPy outputs produced by Trainer.
    scores = torch.as_tensor(predictions)
    targets = torch.as_tensor(labels).to(scores.device)

    predicted_ids = scores.argmax(dim=-1)

    # Ignored positions can never match an arg-max index, so counting them
    # would silently drag the accuracy down.
    scored = targets != ignore_index
    if not scored.any():
        return 0.0
    return (predicted_ids[scored] == targets[scored]).float().mean().item()

# Train once and record train/eval accuracy at the end of every epoch.
#
# The previous loop could not work: `range()` was given the float
# `num_train_epochs`, every `trainer.train()` call re-ran the whole training
# schedule, and `trainer.evaluate()` returns a metrics dict that has no
# 'predictions' / 'label_ids' entries.
from transformers import TrainerCallback


def _dataset_accuracy(trainer, data):
    """Return the next-token accuracy of ``trainer.model`` over ``data``."""
    model = trainer.model
    was_training = model.training
    model.eval()

    correct, total = 0.0, 0
    with torch.no_grad():
        # Batch by batch, so the full logits of the dataset are never held
        # in memory at once.
        for batch in trainer.get_eval_dataloader(data):
            batch = {key: value.to(model.device) for key, value in batch.items()}
            logits = model(**batch).logits

            # Causal LM: the logits at position t predict the token at t + 1.
            shifted_logits = logits[:, :-1, :]
            shifted_labels = batch["labels"][:, 1:]

            # Keep only real targets (-100 marks positions the collator
            # excluded from the loss).
            keep = shifted_labels != -100
            n_tokens = int(keep.sum())
            if n_tokens == 0:
                continue

            batch_accuracy = calculer_accuracy(shifted_logits[keep], shifted_labels[keep])
            # Weight by token count so the result is a per-token average.
            correct += batch_accuracy * n_tokens
            total += n_tokens

    if was_training:
        model.train()
    return correct / total if total else 0.0


class _AccuracyCallback(TrainerCallback):
    """Append train/eval accuracy to the history lists after each epoch."""

    def __init__(self, trainer):
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        train_accuracy = _dataset_accuracy(self._trainer, self._trainer.train_dataset)
        eval_accuracy = _dataset_accuracy(self._trainer, self._trainer.eval_dataset)

        train_accuracies.append(train_accuracy)
        eval_accuracies.append(eval_accuracy)

        print(f"Epoch {len(train_accuracies)}: Accuracy Entraînement = {train_accuracy:.4f}, Accuracy Évaluation = {eval_accuracy:.4f}")


trainer.add_callback(_AccuracyCallback(trainer))

# Single training run; the callback above fills the accuracy lists.
trainer.train()


# Étape 6 : Tracer les courbes d'accuracy

In [None]:
# Plot both accuracy histories against the 1-based epoch number.
fig, ax = plt.subplots(figsize=(10, 6))

train_epochs = range(1, len(train_accuracies) + 1)
eval_epochs = range(1, len(eval_accuracies) + 1)
ax.plot(train_epochs, train_accuracies, label="Accuracy Entraînement")
ax.plot(eval_epochs, eval_accuracies, label="Accuracy Évaluation")

ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.set_title("Courbes d'Accuracy au cours de l'Entraînement")
ax.legend()
ax.grid(True)

plt.show()
