In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
import pandas as pd
import random

2024-05-18 14:05:38.436396: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-18 14:05:38.436512: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-18 14:05:38.574335: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
dataset_path = '/kaggle/input/IMDB Dataset.csv'
imdb_data = pd.read_csv(dataset_path)

In [3]:
# Charger le tokenizer et le modèle
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('koweez/generate_review_2')
print("Number of parameters: ", sum(p.numel() for p in model.parameters()))

# Préparation des données
def clean_text(text):
    text = text.replace('<br />', ' ')  # Nettoyer les balises HTML
    return text

imdb_data['clean_review'] = imdb_data['review'].apply(clean_text)
imdb_data_positive = imdb_data[imdb_data['sentiment'] == 'positive'].head(15000)
imdb_data_negative = imdb_data[imdb_data['sentiment'] == 'negative'].head(15000)
generator_training = pd.concat([imdb_data_negative, imdb_data_positive])

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Number of parameters:  124439808


In [5]:
# Créer les datasets d'entraînement et de test
#train_texts, test_texts = train_test_split(generator_training['clean_review'], shuffle=True)
train_texts = generator_training['clean_review']
train_texts.to_csv('train_dataset.txt', header=False, index=False)
#test_texts.to_csv('test_dataset.txt', header=False, index=False)

In [6]:
# Préparer les datasets pour GPT-2
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='/kaggle/working/train_dataset.txt',
    block_size=128)



In [7]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False)

# Paramètres d'entraînement
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=15,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2
)

In [None]:
# Entraîneur
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

# Entraîner le modèle
trainer.train()

In [None]:
# Sauvegarder le modèle entraîné
model.save_pretrained('trained_gpt2_imdb')

In [4]:
model = model.to("cuda")
def generate_review(prompt, max_length=200, temperature=0.8, top_k=50, top_p=0.95):
    # Encoder le texte d'entrée
    encoded_input = tokenizer.encode(prompt, return_tensors='pt').to("cuda")
    # Générer la sortie à partir du modèle
    output_sequences = model.generate(
        input_ids=encoded_input,
        max_length=max_length,
        temperature=temperature,  # Ajuster pour plus de créativité
        top_k=top_k,         # Top-k sampling
        top_p=top_p,       # Top-p (nucleus) sampling
        num_return_sequences=1,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Décoder les textes générés
    generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

    return generated_text

# Exemple d'utilisation
prompt = "Batman is a movie that"
generated_review_text = generate_review(prompt)
generated_review_text = generated_review_text[:generated_review_text.rfind('.') + 1]

print("Generated Review:", generated_review_text)

Generated Review: Batman is a movie that is fun to watch and entertaining to watch."
"I have to admit, this was the worst movie I have ever seen. I had heard good things about it and when I finally watched it on my birthday, my family and I all agreed that it was awful. But at the same time, we were both kind of sad that we had wasted precious time and money on this piece of crap. The acting was pretty bad, the storyline pretty bad, and the direction was awful. I was not scared at all, I was actually kinda upset that I wasted so much money on something that shouldn't have been there. It wasn't even bad enough to make me laugh.  I guess I had just wasted the $3.00 on the DVD case file. That's about right. I'm glad I did not have to pay that much money. I was actually able to sit through the entire film in the theater, but I felt I would have better if I had


In [5]:
imdb_data_positive_prompt = imdb_data[imdb_data['sentiment'] == 'positive'][15000:20000]
imdb_data_negative_prompt = imdb_data[imdb_data['sentiment'] == 'negative'][15000:20000]
generator_prompt = pd.concat([imdb_data_positive_prompt, imdb_data_negative_prompt])

In [10]:
from tqdm import tqdm
def generate_fake_reviews(generator_prompt):
    dico = {"fake_review": []}
    for prompt in tqdm(generator_prompt.head(1000)['clean_review']): 
        r = random.randint(7,16)
        short_prompt = ' '.join(prompt.split()[:r])
        
        max_length_r = random.randint(150, 201)
        temperature_r = random.uniform(0.6, 0.8)
        top_k_r = random.randint(200, 221)
        top_p_r = random.uniform(0.6, 0.76)
        
        review = generate_review(short_prompt, max_length=max_length_r, temperature=temperature_r, top_k=top_k_r, top_p=top_p_r)
        dico["fake_review"].append(review)
    return pd.DataFrame(data=dico)
df = generate_fake_reviews(generator_prompt)
df.to_csv('/kaggle/working/fake_reviews.csv', index=False)

100%|██████████| 1000/1000 [30:49<00:00,  1.85s/it]


In [83]:
# cleaning du dataset généré

# supprime les phrases présente plus de <threshold> fois
def clean_text(text, threshold=3):
    sentences = text.split('.')
    occurences = {}
    for s in sentences:
        if s in occurences:
            occurences[s] += 1
        else:
            occurences[s] = 1
    new_text = [s for s in occurences if occurences[s] < threshold]
    return '.'.join(new_text)

# enleve tout après le dernier . et dernier \n et applique clean_text et enleve les reviews trop courtes
def clean_reviews(df):
    data = df.copy()
    data['fake_review'] = data['fake_review'].str.rsplit('.', n=1).str[0] + '.'
    data['fake_review'] = data['fake_review'].str.rsplit('\n', n=1).str[0]
    data['fake_review'] = data['fake_review'].apply(clean_text, threshold=3)
    data = data[data['fake_review'].str.split().str.len() >= 20]
    return data


In [106]:
# saving du dataset nettoyé
df_2 = pd.read_csv('/kaggle/working/fake_reviews.csv')
final_df = clean_reviews(df_2)
final_df.to_csv('/kaggle/working/final_reviews.csv')

In [109]:
# ajout de la length et étude du dataset
final_df['length'] = final_df['fake_review'].str.split().str.len()
minn, maxx, mean, std = final_df['length'].min(), final_df['length'].max(), final_df['length'].mean(), final_df['length'].std()
minn, maxx, mean, std

(20, 177, 102.84678243105209, 37.2291447687059)

In [130]:
# sélection de la partie du dataset IMDB réservée à notre tâche de classification
dataset_eval_positive = imdb_data[imdb_data['sentiment'] == 'positive'][20000:]
dataset_eval_negative = imdb_data[imdb_data['sentiment'] == 'negative'][20000:]
dataset_eval = pd.concat([dataset_eval_positive, dataset_eval_negative])

In [131]:
# ajout de la length
dataset_eval['length'] = dataset_eval['clean_review'].str.split().str.len()

In [132]:
# sort par length et clean du df
dataset_eval.sort_values('length')
dataset_eval.reset_index(inplace=True)
dataset_eval.drop(columns=['index', 'review', 'sentiment'], inplace=True)

In [141]:
# suppression des reviews trop courtes/longues et étude du dataset de review d'IMDB
dataset_eval = dataset_eval[(dataset_eval['length'] >= 20) & (dataset_eval['length'] <= 179)]
min_1, max_1, mean_1, std_1 = dataset_eval['length'].min(), dataset_eval['length'].max(), dataset_eval['length'].mean(), dataset_eval['length'].std()
min_1, max_1, mean_1, std_1

(20, 179, 121.7270659590599, 36.17060935399155)

In [142]:
# cherche la distribution de la length 
def sample_freq(df):
    df['class'] = df['length'] // 10
    freq = {}
    for i in range(len(df)):
        index = str(df['class'].iloc[i])
        if index in freq:
            freq[index] += 1
        else:
            freq[index] = 1
    return df, freq

dataset_eval, freq_eval = sample_freq(dataset_eval)
final_df, freq_final = sample_freq(final_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['class'] = df['length'] // 10


In [160]:
# sampling du dataset imdb selon la distribution du dataset généré
def get_sample(df, freq):
    sample_size = freq[str(df['class'].iloc[0])]
    return df.sample(sample_size, replace=True)

sampled_eval_df = dataset_eval.groupby('class', group_keys=False).apply(lambda x : get_sample(x, freq_final))
sampled_eval_df['length'].min(), sampled_eval_df['length'].max(), sampled_eval_df['length'].mean(), sampled_eval_df['length'].std()
sampled_eval_df = sampled_eval_df.drop_duplicates()

  sampled_eval_df = dataset_eval.groupby('class', group_keys=False).apply(lambda x : get_sample(x, freq_final))


Unnamed: 0,clean_review,length,class
7266,I can't believe they got the actors and actres...,28,2
9438,"An unfunny, unworthy picture which is an undes...",22,2
3219,Wonderful movie. Adult content. Lots of erotic...,29,2
8237,This movie is terrible. It's about some no bra...,21,2
6011,"You may like Tim Burton's fantasies, but not i...",29,2
...,...,...,...
3337,As the superb `Prime Suspect' series reaches p...,162,16
9562,Don't get me wrong - I love David Suchet as Po...,165,16
6055,I doubt this will ever even be a cult film. I ...,173,17
2675,"The ""movie aimed at adults"" is a rare thing th...",173,17


In [161]:
# clean du df imdb ajusté a la bonne distribution
sampled_eval_df.reset_index(inplace=True)
sampled_eval_df.drop(columns = ['index', 'length', 'class'], inplace=True)
sampled_eval_df.rename(columns={'clean_review': 'review'}, inplace=True)
sampled_eval_df['label'] = 'real'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp.drop(columns = ['index', 'length', 'class'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp.rename(columns={'clean_review': 'review'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['label'] = 'real'


In [162]:
# clean du df généré
final_df['label'] = 'fake'
final_df.drop(columns=['length', 'class'], inplace=True)
final_df.rename(columns={'fake_review': 'review'}, inplace=True)

In [169]:
# création des datasets train et test pour la tache de classification
# train : 50 % real, 50 % fake
real_reviews_train = tmp.head(500)
fake_reviews_train = final_df.head(500)

# test : 66 % fake, 33 % real
fake_reviews_eval = final_df.iloc[500:]
real_reviews_eval = tmp.iloc[500: 500 + len(fake_reviews_eval) // 3]


train_reviews_dataset = pd.concat([real_reviews_train, fake_reviews_train])
train_reviews_dataset.reset_index(inplace=True, drop=True)
test_reviews_dataset = pd.concat([fake_reviews_eval, real_reviews_eval])
test_reviews_dataset.reset_index(inplace=True, drop=True)


In [172]:
# save des datasets pour la classification
test_reviews_dataset.to_csv('/kaggle/working/test_reviews_dataset.csv')
train_reviews_dataset.to_csv('/kaggle/working/train_reviews_dataset.csv')