# Imports

In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Tokenization

In [2]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Charger le tokenizer et le modèle
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

# Charger votre jeu de données à partir du fichier CSV
data = pd.read_csv("wiki_movie_plots_deduped.csv") 

# Prétraiter les données et les sauvegarder dans un fichier texte
with open("dataset.txt", "w", encoding="utf-8") as f:
    for idx, row in data.iterrows():
        plot = row["Plot"]
        genre = row["Genre"]
        f.write(f"<BOS> Genre: {genre} Plot: {plot}\n")

# Charger le dataset à partir du fichier texte
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="dataset.txt",
    block_size=128  # Taille maximale du bloc pour le modèle
)

# Configuration des paramètres d'entraînement
training_args = TrainingArguments(
    output_dir="./fine-tuned-gpt2",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Entraînement du modèle
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=train_dataset,
)

trainer.train()

2024-05-14 09:21:49.140305: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-14 09:21:49.140435: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-14 09:21:49.246900: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss
500,3.4506
1000,3.4118
1500,3.4036
2000,3.3863
2500,3.3794
3000,3.3545
3500,3.3657
4000,3.3442
4500,3.3333
5000,3.3347




TrainOutput(global_step=38589, training_loss=3.1310325886895596, metrics={'train_runtime': 30479.448, 'train_samples_per_second': 10.128, 'train_steps_per_second': 1.266, 'total_flos': 7.16731726257193e+16, 'train_loss': 3.1310325886895596, 'epoch': 3.0})

In [3]:
# Chemin où vous souhaitez enregistrer le modèle
output_dir = "/kaggle/working/fine-tuned-gpt2"

# Enregistrer le modèle fine-tuned
model.save_pretrained(output_dir)

# Test


In [4]:
%%time
# Charger le tokenizer et le modèle fine-tuned
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")  # Chemin vers le modèle fine-tuned
model = GPT2LMHeadModel.from_pretrained("/kaggle/working/fine-tuned-gpt2")  # Chemin vers le modèle fine-tuned

# Fonction pour générer une intrigue à partir d'un genre
def generate_plot(genre):
    # Préparer l'entrée pour le modèle (ajout de tokens spéciaux)
    input_text = "<BOS> Genre: " + genre + " Plot:"

    # Convertir l'entrée en tokens
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Utiliser le modèle pour générer une suite de texte
    output = model.generate(input_ids, max_length=200, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

    # Décoder la sortie en texte
    plot = tokenizer.decode(output[0], skip_special_tokens=True)

    return plot


genre = "Comedy"
intrigue = generate_plot(genre)
print("Intrigue de film pour le genre", genre + ":", intrigue)

print("------------------------")
genre = "Action"
intrigue = generate_plot(genre)
print("Intrigue de film pour le genre", genre + ":", intrigue)

Intrigue de film pour le genre Comedy: <BOS> Genre: Comedy Plot: The film opens with a young man named Michael (Michael Rennie) who is a student at a local college. He is a bit of a loner, and is always in trouble with his friends. One day, he meets a girl named Sarah (Sarah Silverman), who is a student at the same college. They both fall in love with each other, and they decide to get married.
Michael's father, a lawyer, is a very strict man, and he is very strict with his son. Michael's father is very strict with his son, and he is very strict with his father. Michael's father is very strict with his son, and he is very strict with his father. Michael's father is very strict with his son, and he is very strict with his father. Michael's father is very strict with his son, and he is very strict with his father. Michael's father is very strict with his son,
------------------------
Intrigue de film pour le genre Action: <BOS> Genre: Action Plot: The film opens with a young man named Ra