Projet de LLM ISDEV EXPERTS

# **Modèle de génération de l'idée générale d'un formulaire**

In [None]:
import pandas as pd
import random

In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pwd  # Vérifier le dossier courant

/content


In [None]:
!ls /content/drive/My\ Drive/isdev_nlp_project/ # List files in the target directory

 fichier_fusionne.csv		    form_dataset_with_summaries.csv
'form_dataset_grouped (1).gsheet'   form_dataset_with_summaries.gsheet
 form_dataset_grouped.csv	    form_summarization
 form_dataset_grouped.gsheet


In [None]:
# Load the merged source dataset from the project folder on Google Drive.
df = pd.read_csv("/content/drive/My Drive/isdev_nlp_project/fichier_fusionne.csv")

# Preview the first rows, then list every column name on a single line.
print(df.head())
column_names = ', '.join(df.columns.tolist())
print(f"\nColonnes : {column_names}")

                             question  \
0                 How is the parking?   
1             How close is the wharf?   
2  What do you think about the view ?   
3  What do you think about the hotel?   
4           How do you like the view?   

                                              review  \
0  This is a great location. It is an easy walk t...   
1  Used some of my Marriott &quot;points&quot; to...   
2  We stayed here in October while attending the ...   
3  My wife and I are both professionals. We are y...   
4  The Grand Hyatt in San Francisco is close to e...   

                            human_ans_spans  is_ques_subjective  \
0                Parking was $ 35 per night               False   
1                            ANSWERNOTFOUND               False   
2                            ANSWERNOTFOUND               False   
3  I wonder if the hotels needed to fill up               False   
4                    We had excellent views               False   

   ques_subj_

### Suppression des colonnes inutiles

In [None]:
# Keep only the 'question' column and drop rows where it is missing.
df_questions = df[['question']].dropna().reset_index(drop=True)

# Shuffle the questions. The generator is seeded first so that the shuffle (and any
# later use of the `random` module) gives the same result on every fresh run.
random.seed(42)
questions = df_questions['question'].tolist()
random.shuffle(questions)

### Regroupement aléatoire des questions en groupes de 6 à 15

In [None]:
# Slice the shuffled questions into consecutive "forms" of 6 to 15 questions each.
grouped_data = []
start = 0
total_questions = len(questions)
while start < total_questions:
    size = random.randint(6, 15)
    chunk = questions[start:start + size]
    start += size
    # A trailing chunk with fewer than 6 questions is discarded.
    if len(chunk) < 6:
        continue
    grouped_data.append({
        'form_questions': " | ".join(chunk),  # questions of one form, separated by " | "
        'summary': "",  # left empty here; filled in by the summarization step
    })



In [None]:
# Turn the grouped records into a DataFrame and write it to a CSV in the working directory.
grouped_csv_name = "formulaires_regroupes.csv"
df_grouped = pd.DataFrame(grouped_data)
df_grouped.to_csv(grouped_csv_name, index=False)

print(f"✅ Regroupement terminé. Fichier '{grouped_csv_name}' créé.")

✅ Regroupement terminé. Fichier 'formulaires_regroupes.csv' créé.


In [None]:
print(df_grouped.head())
print(f"\nColonnes : {df_grouped.columns.tolist()}")

                                      form_questions summary
0  Where can I get a cheap hotel? | How is the se...        
1  How is the hotel inside? | How is the hotel? |...        
2  Does the hotel offer good service? | How is ca...        
3  How is the service? | Is the wine nice? | How ...        
4  What is your take on the service by the doctor...        

Colonnes : ['form_questions', 'summary']


### **Remplissage automatique de la colonne des Summaries**

Il s'agit ici d'écrire un script qui analyse chaque bloc de questions regroupées et en trouve automatiquement l'idée générale.

1.   Installation des dépendances






In [None]:
!pip install transformers datasets sentencepiece





2.  Création du pipeline de summarization



In [None]:
from transformers import pipeline
from tqdm import tqdm

# Build a summarization pipeline from the facebook/bart-large-cnn checkpoint.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Reset the summary column to empty strings; it is filled in by the batch loop further down.
df_grouped["summary"] = ""

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [None]:
df_grouped.columns.tolist()

['form_questions', 'summary']

3. Génération des résumés par lots

In [None]:
batch_size = 10

# Position of the "summary" column. Rows are addressed by position below, so the
# write lands on the same row the batch was read from (via .iloc) whatever the
# DataFrame's index labels are.
summary_col = df_grouped.columns.get_loc("summary")

# Summarize the forms batch by batch. A failing batch is reported and skipped,
# which leaves its rows with an empty summary.
for i in tqdm(range(0, len(df_grouped), batch_size)):
    batch = df_grouped["form_questions"].iloc[i:i+batch_size].tolist()
    try:
        # truncation=True clips inputs that exceed the model's maximum input length
        # instead of letting the whole batch fail.
        summaries = summarizer(batch, max_length=45, min_length=15, do_sample=False, truncation=True)
        for j, summary in enumerate(summaries):
            df_grouped.iloc[i + j, summary_col] = summary['summary_text']
    except Exception as e:
        print(f"Erreur à la ligne {i} : {e}")

 12%|█▎        | 3/24 [05:26<36:55, 105.48s/it]Your max_length is set to 45, but your input_length is only 41. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)
 21%|██        | 5/24 [08:37<31:21, 99.03s/it] Your max_length is set to 45, but your input_length is only 37. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
 54%|█████▍    | 13/24 [20:41<16:31, 90.14s/it]Your max_length is set to 45, but your input_length is only 44. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
 67%|██████▋   | 16/24 [25:03<11:50, 88.85s/it]Your max_length is set to 45, but your input_length is only 44. Sinc

4. Sauvegarde du nouveau fichier CSV

In [None]:
# Persist the summarized forms to the project folder on Google Drive.
output_path = '/content/drive/My Drive/isdev_nlp_project/form_dataset_with_summaries.csv'
df_grouped.to_csv(output_path, index=False)
print(f"Fichier sauvegardé à : {output_path}")

Fichier sauvegardé à : /content/drive/My Drive/isdev_nlp_project/form_dataset_with_summaries.csv


## **Entraînement sur les données déjà prétraitées**

1. **Chargement des données**


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the grouped forms for testing; the fixed random_state keeps the split stable.
train_df, test_df = train_test_split(df_grouped, test_size=0.2, random_state=42)

for split_name, split_frame in (("Training", train_df), ("Testing", test_df)):
    print(f"{split_name} set shape:", split_frame.shape)

Training set shape: (190, 2)
Testing set shape: (48, 2)


In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

max_input_length = 512
max_target_length = 64

def preprocess(examples):
    """Tokenize one example (or a batch) into model inputs with `labels`.

    Args:
        examples: mapping with a "form_questions" entry (the input text) and a
            "summary" entry (the target text).

    Returns:
        The tokenizer output for the inputs, with the target token ids stored
        under the "labels" key.
    """
    inputs = examples["form_questions"]
    targets = examples["summary"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every row of the existing train/test DataFrames; each element of the
# resulting lists is the tokenized form of one row.
train_dataset = [preprocess(record) for record in train_df.to_dict("records")]
test_dataset = [preprocess(record) for record in test_df.to_dict("records")]

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    num_train_epochs=15,
    logging_dir='./logs',
    # Disable external experiment trackers: otherwise training stops at an
    # interactive wandb API-key prompt.
    report_to="none",
)

# The collator pads inputs and labels per batch.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # NOTE(review): recent transformers releases appear to prefer `processing_class=`
    # over `tokenizer=` here — confirm against the installed version.
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

In [None]:
# Evaluate the trained model on the eval dataset given to the Trainer.
eval_results = trainer.evaluate()
print(eval_results) # Print the returned metrics dict. NOTE(review): no compute_metrics is passed to the Trainer in this notebook, so presumably no accuracy is reported here — confirm.

In [None]:
# Save the fine-tuned model to disk.
# NOTE(review): this path is under /content, not under the Drive mount (/content/drive/...),
# so presumably it does not survive the end of the Colab session — confirm.
trainer.save_model("/content/isdev_nlp_project/form_summarization/new_t5_model")
# Example: trainer.save_model("./my_form_summarization_model")

In [None]:
# Test the model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Reload the fine-tuned checkpoint (tokenizer and weights) from the save directory.
model_path = "/content/isdev_nlp_project/form_summarization/new_t5_model"  # Replace with your model's path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Wrap both in a summarization pipeline.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Summarize an unseen form, using deterministic generation settings.
input_text = "How was the hotel room? | Was the breakfast satisfying? | Did you enjoy your stay? | Is there anything you would like to be ameliorate? | Can you recommend us to someone?"
generation_options = {"max_length": 60, "min_length": 15, "do_sample": False}
summary = summarizer(input_text, **generation_options)
print(summary[0]['summary_text'])

2. **Prétraitement (Tokenisation)**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the grouped forms for testing; the fixed random_state keeps the split stable.
train_df, test_df = train_test_split(df_grouped, test_size=0.2, random_state=42)

for split_name, split_frame in (("Training", train_df), ("Testing", test_df)):
    print(f"{split_name} set shape:", split_frame.shape)

In [None]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

max_input_length = 512
max_target_length = 64

def preprocess(examples):
    """Tokenize a batch of examples into model inputs with `labels`.

    Args:
        examples: mapping with a "form_questions" entry (the input texts) and a
            "summary" entry (the target texts).

    Returns:
        The tokenizer output for the inputs, with the target token ids stored
        under the "labels" key.
    """
    inputs = examples["form_questions"]
    targets = examples["summary"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# No earlier cell visible in this notebook defines `dataset`, so build it here from
# the train/test DataFrames. The "train" and "test" keys are the ones the training
# cell reads from `tokenized_dataset`.
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False),
})

tokenized_dataset = dataset.map(preprocess, batched=True)

3. **Chargement du modèle et configuration de l'entraînement**

In [None]:
!pip install --upgrade transformers

In [None]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

training_args = TrainingArguments(
    output_dir="./t5-form-summary",
    # The 'evaluation_strategy' argument has been replaced with 'eval_strategy'.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./logs',
    # Disable external experiment trackers so training does not stop at an
    # interactive wandb API-key prompt.
    report_to="none",
)

# DataCollatorForSeq2Seq pads inputs and labels per batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

4. **Sauvegarde et usage du modèle**

*  Sauvegarde



In [None]:
# Save to the Drive location that the prediction and test cells below load the model from.
trainer.save_model("/content/drive/MyDrive/form-summary-model")

*  Pour de futures prédictions

In [None]:
from transformers import pipeline

# Build a summarization pipeline from the saved model directory and run it on a short form.
saved_model_dir = "/content/drive/MyDrive/form-summary-model"
summarizer = pipeline("summarization", model=saved_model_dir, tokenizer=tokenizer)
summary = summarizer("How is the service? | What about the ambiance?")
print(summary[0]["summary_text"])

## **Test du modèle entraîné**

### Chargement du modèle entrainé

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_path = "/content/drive/MyDrive/form-summary-model"  # This is your local model path

# Load weights and tokenizer from the local directory only.
local_only = {"local_files_only": True}
model = AutoModelForSeq2SeqLM.from_pretrained(model_path, **local_only)
tokenizer = AutoTokenizer.from_pretrained(model_path, **local_only)

# Wrap both in a summarization pipeline.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

### Test sur un exemple

In [None]:
input_questions = """I'm going to give you the following list of questions issued from a form:
                  How was the hotel room? | Was the breakfast satisfying? | Did you enjoy your stay?
                  Analyse them and return the main goal of the form"""

input_prompt = """You are an AI trained to understand the purpose of forms by analyzing their questions.
Each form consists of a list of questions, separated by the “|” character.
Your task is to read the questions and return a concise, high-level summary of the form’s overall goal or purpose in one sentence.
Only output the purpose—do not restate the questions.

Example Input:
Questions: What is your full name? | What is your email address? | What is your phone number? | What product did you purchase? | Please describe the issue.
Output: To collect customer information for submitting a product support or complaint request.

Now analyze the following form and give the output accordingly:
Questions: How was the hotel room? | Was the breakfast satisfying? | Did you enjoy your stay?


"""

# Summarize the prompt with deterministic generation settings and show the text.
generation_options = {"max_length": 60, "min_length": 15, "do_sample": False}
summary = summarizer(input_prompt, **generation_options)
print("📝 Résumé généré :", summary[0]['summary_text'])

## **Ignorer la suite**

In [None]:
from transformers import pipeline

# Baseline: summarize the raw question list with the stock BART CNN checkpoint.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

text = "How was the hotel room? | Was the breakfast satisfying? | Did you enjoy your stay?"

generation_options = {"max_length": 50, "min_length": 15, "do_sample": False}
summary = summarizer(text, **generation_options)
print(summary[0]['summary_text'])


In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model="google/pegasus-xsum")

text = "How was the hotel room? | Was the breakfast satisfying? | Did you enjoy your stay?"

input_prompt = """You are an AI trained to understand the purpose of forms by analyzing their questions.
Each form consists of a list of questions, separated by the “|” character.
Your task is to read the questions and return a concise, high-level summary of the form’s overall goal or purpose in one sentence.
Only output the purpose—do not restate the questions.

Example Input:
Questions: What is your full name? | What is your email address? | What is your phone number? | What product did you purchase? | Please describe the issue.
Output: To collect customer information for submitting a product support or complaint request.

Now analyze the following form and give the output accordingly:
Questions: How was the hotel room? | Was the breakfast satisfying? | Did you enjoy your stay?


"""

# Summarize the prompt with deterministic generation settings and show the text.
generation_options = {"max_length": 20, "min_length": 10, "do_sample": False}
summary = summarizer(input_prompt, **generation_options)
print(summary[0]['summary_text'])

In [None]:
from transformers import pipeline

generator = pipeline("text2text-generation", model="google/flan-t5-base")

prompt = """You are an AI trained to understand the purpose of forms by analyzing their questions.
Each form consists of a list of questions, separated by the “|” character.
Your task is to read the questions and return a concise, high-level summary of the form’s overall goal or purpose in one sentence.
Only output the purpose—do not restate the questions.

Example Input:
Questions: What is your full name? | What is your email address? | What is your phone number? | What product did you purchase? | Please describe the issue.
Output: To collect customer information for submitting a product support or complaint request.

Now analyze the following form and give the output accordingly:
Questions: How was the hotel room? | Was the breakfast satisfying? | Did you enjoy your stay?


"""
# Generate from the prompt and print the text of the first candidate.
outputs = generator(prompt, max_length=40)
res = outputs[0]['generated_text']
print(res)


In [None]:
# Import the login function
from huggingface_hub import login

# Log in to the Hugging Face Hub
# You will be prompted to enter your Hugging Face token.
# You can generate a token from your Hugging Face account settings -> Access Tokens.
login()

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

prompt = """### Instruction:
Given the following questions from a feedback form:
- How was the hotel room?
- Was the breakfast satisfying?
- Did you enjoy your stay?
What is the main objective of this form?

### Response:
"""

# Tokenize the prompt and move the tensors to the device the model is on.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Deterministic decoding. `temperature` and `top_p` are sampling parameters, so they
# contradicted `do_sample=False`; they were dropped rather than switching to sampling.
output = model.generate(
    **inputs,
    max_new_tokens=60,
    do_sample=False,
)

# Decode the first generated sequence back to text, without special tokens.
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)


## Fine-tuning with Mistral and Hugging Face

In [None]:
import pandas as pd
import json

# Load the CSV of forms and summaries from Google Drive.
# NOTE(review): this notebook writes the same path with pandas' to_csv and no explicit
# encoding; reading it back as latin1 may garble non-ASCII characters — confirm the
# file's actual encoding.
df = pd.read_csv("/content/drive/My Drive/isdev_nlp_project/form_dataset_with_summaries.csv", encoding='latin1')

# Accumulator for the chat-formatted training entries.
data = []

In [None]:
df

In [None]:
# Rebuild the list from scratch so that re-running this cell does not append duplicates.
data = []

for _, row in df.iterrows():
    # A summary left empty by the summarization step is read back from the CSV as NaN;
    # str(NaN) would turn it into the literal training target "nan", so skip such rows.
    if pd.isna(row["form_questions"]) or pd.isna(row["summary"]):
        continue

    form_questions = str(row["form_questions"]).strip()
    target = str(row["summary"]).strip()
    if not form_questions or not target:
        continue

    prompt = f"Given the following questions issued from a form: {form_questions} What is the main goal of the form?"

    # One user/assistant exchange per form.
    entry = {
        "messages": [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": target}
        ]
    }
    data.append(entry)

In [None]:
# Write the entries as JSON Lines: one JSON object per line, non-ASCII characters kept as-is.
with open("train.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in data)

In [None]:
from datasets import load_dataset, Dataset

dataset = Dataset.from_list(data)

In [None]:
# Step 3: (optional) Formatage en texte pour causal LM
def format_for_causal_lm(example):
    """Flatten a chat example into one text block, one "Role: content" line per message.

    Args:
        example: mapping with a "messages" list of {"role", "content"} dicts.

    Returns:
        A dict with a single "text" key holding the concatenated lines.
    """
    lines = [
        f"{m['role'].capitalize()}: {m['content']}\n"
        for m in example["messages"]
    ]
    return {"text": "".join(lines)}

tokenized_dataset = dataset.map(format_for_causal_lm)

In [None]:
# Import the login function
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType
# Import GatedRepoError to catch the specific exception
from huggingface_hub.utils import GatedRepoError


# Log in to the Hugging Face Hub
# You will be prompted to enter your Hugging Face token.
# You can generate a token from your Hugging Face account settings -> Access Tokens.
# Uncomment the line below and replace "YOUR_HF_TOKEN" with your actual token
# if you prefer to hardcode it (use with caution, especially in shared notebooks):
# login(token="YOUR_HF_TOKEN")

# If you prefer to be prompted securely (recommended in interactive sessions like Colab):
login()

In [None]:
# Install bitsandbytes
# This should ideally be done in a separate cell or before loading quantized models
!pip install -q bitsandbytes

from transformers import BitsAndBytesConfig

model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Load the tokenizer first. If this fails, authentication or repository access is the issue.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Fall back to the EOS token when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
except GatedRepoError as e:
    print(f"Error loading tokenizer: {e}")
    print("Please ensure you have accepted the terms for the model on Hugging Face and are logged in.")
    raise  # Stop here: nothing below can work without the tokenizer.
except Exception as e:
    print(f"An unexpected error occurred while loading the tokenizer: {e}")
    raise

# Load the model in 4-bit. The quantization settings go through `quantization_config`;
# passing `load_in_4bit=True` directly to from_pretrained is deprecated.
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        device_map="auto"
    )
except GatedRepoError as e:
    print(f"Error loading model: {e}")
    print("Please ensure you have accepted the terms for the model on Hugging Face and are logged in.")
    raise
except Exception as e:
    print(f"An unexpected error occurred while loading the model: {e}")
    raise


# Attach LoRA adapters to the q_proj and v_proj modules.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)

print("Tokenizer and Model loaded successfully.")

In [None]:
from peft import LoraConfig, PeftModel, TaskType, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# The model-loading cell already wraps `model` with get_peft_model; only wrap here
# when that has not happened, so the LoRA adapters are not applied a second time.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)

In [None]:
{"messages": [{"role": "user", "content": "How was the room? ..."}, {"role": "assistant", "content": "To evaluate the hotel experience."}]}

In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer # Import AutoTokenizer if not already imported

# Ensure you have your tokenizer loaded
# model_id = "mistralai/Mistral-7B-Instruct-v0.2"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# if tokenizer.pad_token is None:
#      tokenizer.pad_token = tokenizer.eos_token # Already handled later, but good practice

# Load the dataset directly from the local JSONL file
# Using load_dataset with the 'json' builder and data_files pointing to the local file is correct,
# but sometimes specifying the full path or ensuring the current working directory is correct helps.
# Let's re-try the same command as it should work for local files,
# ensuring the file 'train.jsonl' exists in the current working directory.

# Load the chat examples from the local JSONL file.
try:
    dataset = load_dataset("json", data_files="train.jsonl")
except NotImplementedError as e:
    print(f"Caught expected NotImplementedError: {e}")
    print("This error can sometimes occur if the library misinterprets the local file path.")
    print("Ensure 'train.jsonl' is in the current directory or use the full path.")
    # Fallback: read the file with pandas and convert the frame to a Dataset.
    import pandas as pd
    df_loaded = pd.read_json("train.jsonl", lines=True)
    dataset = Dataset.from_pandas(df_loaded)

# The try branch may yield a dict-like collection of splits, while the fallback yields a
# single Dataset. For a dict, pick the 'train' split if present, else the first split.
if isinstance(dataset, dict):
    dataset_to_split = dataset['train'] if 'train' in dataset else next(iter(dataset.values()))
else:
    dataset_to_split = dataset

# Hold out 10% for evaluation. The result is stored under a name that does not shadow
# sklearn's `train_test_split` function imported earlier in the notebook, and the split
# is seeded so it is reproducible.
split_datasets = dataset_to_split.train_test_split(test_size=0.1, seed=42)

train_dataset_split = split_datasets['train']
test_dataset_split = split_datasets['test']

# Your subsequent code for mapping can remain the same
def format_conversation(example):
    """Render a chat example as one training string.

    User messages are wrapped in [INST] ... [/INST] tags, assistant messages are
    appended as plain text, and messages with any other role are ignored. Every
    rendered message ends with a newline.

    Args:
        example: mapping with a "messages" list of {"role", "content"} dicts.

    Returns:
        A dict with a single "text" key holding the rendered conversation.
    """
    templates = {
        "user": "[INST] {} [/INST]\n",
        "assistant": "{}\n",
    }
    rendered = [
        templates[m["role"]].format(m["content"])
        for m in example["messages"]
        if m["role"] in templates
    ]
    return {"text": "".join(rendered)}

# Render each conversation of both splits into a single "text" column.
formatted_train_dataset = train_dataset_split.map(format_conversation)
formatted_test_dataset = test_dataset_split.map(format_conversation)

def tokenize_function(examples):
    """Tokenize the "text" column, padding/truncating every example to 512 tokens."""
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512) # Adjust max_length as needed

def _tokenize_and_strip(split):
    """Tokenize a formatted split in batches, then drop its "messages" and "text" columns."""
    return split.map(tokenize_function, batched=True).remove_columns(["messages", "text"])

tokenized_train_dataset = _tokenize_and_strip(formatted_train_dataset)
tokenized_test_dataset = _tokenize_and_strip(formatted_test_dataset)


print("Dataset loaded, split, formatted, and tokenized successfully.")
print("Tokenized Train dataset:", tokenized_train_dataset)
print("Tokenized Test dataset:", tokenized_test_dataset)

In [None]:
# Entrainement avec Trainer

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./mistral-form-tuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=3,
    save_strategy="epoch",
    fp16=True,
    lr_scheduler_type="cosine",
    learning_rate=2e-4,
    # Evaluate on the held-out split at the end of every epoch.
    eval_strategy="epoch",
    # Keep every dataset column; the raw 'messages' and 'text' columns were already
    # removed from the tokenized datasets before this point.
    remove_unused_columns=False,
    # Disable external experiment trackers so training does not stop at an
    # interactive wandb API-key prompt.
    report_to="none",
)

# Make sure a pad token exists; fall back to the EOS token when none is defined.
if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
     # Alternatively, add a dedicated pad token and resize the embeddings:
     # tokenizer.add_special_tokens({'pad_token': '[PAD]'})
     # model.resize_token_embeddings(len(tokenizer))

# Causal-LM collator (mlm=False): the 'labels' are derived from 'input_ids'.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()