In [1]:
!pip install transformers datasets torch




In [4]:
!pip install datasets evaluate




In [13]:
!pip install transformers torch pandas




In [1]:
import pandas as pd
from transformers import XLNetTokenizer, XLNetLMHeadModel, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm
  warn(





In [2]:

class ConversationDataset(Dataset):
    def __init__(self, tokenizer, filepath, max_len=512):
        self.tokenizer = tokenizer
        self.data = pd.read_csv(filepath)
        self.max_len = max_len
        self.conversations = self._prepare_data()

    def _prepare_data(self):
        # Group and create full and partial texts
        conversations = []
        grouped = self.data.groupby('CONVERSATION_ID')
        for _, group in grouped:
            group = group.sort_values(by='CONVERSATION_STEP')
            full_text = " ".join(group['TEXT'].tolist())
            partial_index = int(len(group) * 0.5)  # Using 50% of the conversation as partial
            partial_text = " ".join(group['TEXT'].iloc[:partial_index].tolist())
            conversations.append({'partial': partial_text, 'full': full_text})
        return conversations

    def __len__(self):
        return len(self.conversations)

    def __getitem__(self, idx):
        conversation = self.conversations[idx]
        encoded_input = self.tokenizer(
            conversation['partial'],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        encoded_labels = self.tokenizer(
            conversation['full'],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # `input_ids` as labels for language modeling (XLNet auto-regression)
        return {
            'input_ids': encoded_input.input_ids.squeeze(),
            'attention_mask': encoded_input.attention_mask.squeeze(),
            'labels': encoded_labels.input_ids.squeeze()
        }

# Usage
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
csv_path = "C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/FINAL_DATASET2.csv"
dataset = ConversationDataset(tokenizer, csv_path)


In [4]:
model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')


In [5]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=4,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()


 18%|█▊        | 10/57 [11:07<1:05:00, 82.99s/it]

{'loss': 13.6548, 'grad_norm': 231.1257781982422, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.53}


 35%|███▌      | 20/57 [23:21<46:27, 75.33s/it]  

{'loss': 12.682, 'grad_norm': 117.57601165771484, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.05}


 53%|█████▎    | 30/57 [34:55<29:52, 66.39s/it]

{'loss': 11.0163, 'grad_norm': 65.33290100097656, 'learning_rate': 3e-06, 'epoch': 1.58}


 70%|███████   | 40/57 [47:20<18:25, 65.05s/it] 

{'loss': 9.5867, 'grad_norm': 42.49066925048828, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.11}


 88%|████████▊ | 50/57 [59:28<07:58, 68.34s/it] 

{'loss': 8.2404, 'grad_norm': 51.6396598815918, 'learning_rate': 5e-06, 'epoch': 2.63}


100%|██████████| 57/57 [1:06:32<00:00, 70.05s/it]

{'train_runtime': 3991.066, 'train_samples_per_second': 0.057, 'train_steps_per_second': 0.014, 'train_loss': 10.552279355233175, 'epoch': 3.0}





TrainOutput(global_step=57, training_loss=10.552279355233175, metrics={'train_runtime': 3991.066, 'train_samples_per_second': 0.057, 'train_steps_per_second': 0.014, 'total_flos': 64560379723776.0, 'train_loss': 10.552279355233175, 'epoch': 3.0})

In [6]:
model.save_pretrained('./xlnet_finetuned')
tokenizer.save_pretrained('./xlnet_finetuned')


('./xlnet_finetuned\\tokenizer_config.json',
 './xlnet_finetuned\\special_tokens_map.json',
 './xlnet_finetuned\\spiece.model',
 './xlnet_finetuned\\added_tokens.json')