In [3]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [4]:
# Read the conversation corpus (one row per utterance) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific location; the notebook will
# only run where this exact file exists.
csv_path = "C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/FINAL_DATASET2.csv"
data = pd.read_csv(filepath_or_buffer=csv_path)

In [5]:
# Preprocess the data: turn each utterance into a (conversation history -> utterance) pair.
def _history_before_each_turn(turns):
    """Return, for every turn of ONE conversation, the space-joined text of the turns before it.

    The previous implementation returned a single string per conversation, which
    pandas broadcast to every row of the group. As a result each row's input
    contained its own target and all later turns (label leakage). Here each row
    only sees strictly earlier turns.

    Args:
        turns: the TEXT values of a single conversation, in chronological order.

    Returns:
        A Series aligned to ``turns.index``; the first turn gets an empty string.
    """
    utterances = turns.fillna("").astype(str)  # a missing utterance counts as empty text
    histories = []
    earlier_turns = []
    for utterance in utterances:
        histories.append(' '.join(earlier_turns))
        earlier_turns.append(utterance)
    return pd.Series(histories, index=turns.index)


# Order rows by step inside each conversation so "history" really means "earlier turns",
# even if the CSV rows are not stored chronologically. The stable sort keeps file order on ties.
data = data.sort_values(['CONVERSATION_ID', 'CONVERSATION_STEP'], kind='stable')

data['input_text'] = data.groupby('CONVERSATION_ID')['TEXT'].transform(_history_before_each_turn)
data['target_text'] = data['TEXT']

# The first step of a conversation has no history to condition on, so it is dropped.
data = data[data['CONVERSATION_STEP'] > 1]

In [6]:
# Split the data into training and testing sets (90% / 10%).
# A fixed random_state makes the split, and therefore every metric reported below,
# reproducible across kernel restarts.
# NOTE(review): the split is per row, so turns of one conversation can land in both
# sets; consider a conversation-level (grouped) split if that matters for evaluation.
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Initialize the tokenizer and model from the pre-trained 't5-small'
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
# Define preprocessing function for tokenization
def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq fine-tuning.

    Args:
        examples: a batch mapping with 'input_text' and 'target_text' entries
            (lists of strings when called through ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer encoding of 'input_text' (truncated to 512 tokens) with an
        extra "labels" entry holding the token ids of 'target_text' (truncated to
        128 tokens).

    Sequences are intentionally left unpadded. The DataCollatorForSeq2Seq used by
    this notebook pads each batch to its longest member and fills label padding
    with -100, which the loss ignores. Padding here with ``padding='max_length'``
    would put pad-token ids into the labels (so the model is trained to predict
    padding) and would make every batch a full 512 tokens long.
    """
    model_inputs = tokenizer(examples['input_text'], max_length=512, truncation=True)

    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples['target_text'], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [9]:
# Tokenize both splits in batches (train first, then test)
train_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Collator that assembles tokenized examples into PyTorch tensor batches
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors='pt',
)

Map: 100%|██████████| 801/801 [00:01<00:00, 567.75 examples/s]
Map: 100%|██████████| 90/90 [00:00<00:00, 646.44 examples/s]


In [10]:
# Hyperparameters and bookkeeping options for the fine-tuning run
training_config = {
    "output_dir": "./results",            # where checkpoints are written
    "evaluation_strategy": "epoch",       # evaluate at the end of every epoch
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "logging_dir": './logs',
}
training_args = TrainingArguments(**training_config)




In [11]:
# Wire the model, data, collator and arguments into a Trainer
trainer_kwargs = dict(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning; as the cell's last expression, the training summary is displayed
trainer.train()

  0%|          | 0/153 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
 18%|█▊        | 27/153 [25:07<2:08:37, 61.25s/it]

In [None]:
# Persist the fine-tuned model and its tokenizer side by side (model first)
export_dir = "./t5_trained_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)

# Optionally evaluate the model on the held-out split and show the metrics
eval_metrics = trainer.evaluate()
print(eval_metrics)