In [3]:
!pip install pandas torch transformers nltk rouge-score matplotlib seaborn




In [10]:
# =========================================
# 1) Imports
# =========================================
import os
import math
import pandas as pd
import torch
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import XLNetTokenizer, XLNetLMHeadModel
import matplotlib.pyplot as plt
import numpy as np

# nltk.word_tokenize (used in the BLEU cell) needs the Punkt tokenizer data.
# Newer NLTK releases look it up as "punkt_tab", older ones as "punkt";
# fetch both so the notebook runs on either.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource)

# Prefer the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# =========================================
# 2) Paths
# =========================================
# The defaults are the original author's local locations. Set the CONVO_TEST_CSV /
# CONVO_MODEL_DIR environment variables to run on another machine without
# editing this cell.
csv_path = os.environ.get(
    "CONVO_TEST_CSV",
    r"C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/testdataset.csv",
)
model_path = os.environ.get(
    "CONVO_MODEL_DIR",
    "C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/research/3. MidEvaluation/fineTune/xlnet_finetuned",
)
print("CSV Path:", csv_path)
print("Model Path:", model_path)

CSV Path: C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/testdataset.csv
Model Path: C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/research/3. MidEvaluation/fineTune/xlnet_finetuned


In [12]:
# =========================================
# 3) Build partial->entire from CSV
# =========================================
def build_partial_entire(csv_path, partial_ratio=0.5):
    """Build (partial, entire) conversation text pairs from a CSV of utterances.

    Rows are grouped by ``CONVERSATION_ID`` and ordered by ``CONVERSATION_STEP``.
    The ``TEXT`` values of each conversation are joined with newlines to form
    the entire conversation; the first ``partial_ratio`` share of its lines
    forms the partial prefix.

    Args:
        csv_path: Path of a CSV file with the columns ``CONVERSATION_ID``,
            ``CONVERSATION_STEP`` and ``TEXT``.
        partial_ratio: Fraction of a conversation's lines kept in the partial
            prefix. The prefix always has at least one line; a ratio >= 1
            makes the partial text identical to the entire text.

    Returns:
        A list of ``(partial_str, entire_str)`` tuples, one per conversation
        that has at least two lines with text, in group-key order.
    """
    df = pd.read_csv(csv_path)
    # A missing TEXT cell is read as a NaN float, which would make
    # "\n".join(...) raise TypeError; such rows carry no utterance, so drop them.
    df = df.dropna(subset=["TEXT"])

    pairs = []
    for _, group in df.groupby("CONVERSATION_ID"):
        # mergesort is stable: rows sharing a step keep their file order.
        group_sorted = group.sort_values("CONVERSATION_STEP", kind="mergesort")
        # astype(str) covers TEXT columns that pandas parsed as numbers.
        lines = group_sorted["TEXT"].astype(str).tolist()
        if len(lines) < 2:
            # Nothing to predict from a single-line conversation.
            continue
        entire_str = "\n".join(lines).strip()
        cutoff = max(1, int(len(lines) * partial_ratio))
        partial_str = "\n".join(lines[:cutoff]).strip()
        pairs.append((partial_str, entire_str))
    return pairs

# Build the evaluation pairs: the first half of each conversation is the prompt.
pairs = build_partial_entire(csv_path, partial_ratio=0.5)
print("Number of partial->entire pairs:", len(pairs))
if len(pairs) > 0:
    # Show the first pair as a sanity check of the split.
    first_partial, first_entire = pairs[0]
    print("\nSample pair:\nPartial:", first_partial, "\nEntire:", first_entire)

Number of partial->entire pairs: 29

Sample pair:
Partial: Hello, I need help with my bank account.
Sure, I can help you with that. Can you please provide your account number?
My account number is 12345678.
Thank you. Can you also provide your PIN?
My PIN is 1234.
Thank you. Please hold on while I verify your information.
Okay, I'll wait. 
Entire: Hello, I need help with my bank account.
Sure, I can help you with that. Can you please provide your account number?
My account number is 12345678.
Thank you. Can you also provide your PIN?
My PIN is 1234.
Thank you. Please hold on while I verify your information.
Okay, I'll wait.
Wait, how do I know this is really my bank?
I assure you, this is a legitimate call. You can check the number we are calling from.
Alright, the number does match my bank's number.
Great, now please provide your account number and PIN.
Here is my account number: 12345678 and my PIN: 1234.
Thank you. Your account is now secure.
I'm glad to hear that. Thank you for you

In [13]:
# =========================================
# 4) Load Fine-Tuned XLNet
# =========================================
# Restore the tokenizer and LM-head model from the fine-tuned checkpoint directory,
# placing the model on the selected device.
tokenizer = XLNetTokenizer.from_pretrained(model_path)
model = XLNetLMHeadModel.from_pretrained(model_path).to(device)
# Evaluation mode for inference; kept as the last expression so the cell
# still displays the module summary.
model.eval()


XLNetLMHeadModel(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (lm_loss): Linear(in_features=768, out_features=32000, bias=True)
)

In [15]:
# =========================================
# 5) Generate Entire Conversation from Partial
# =========================================
def generate_entire_conversation(model, tokenizer, partial_text, max_length=512, num_beams=4, **generate_kwargs):
    """Generate a continuation of ``partial_text`` with beam search and decode it.

    Args:
        model: Model exposing ``generate(...)`` and a ``device`` attribute.
        tokenizer: Tokenizer matching ``model``; it is called to encode the
            prompt and its ``decode`` turns the generated ids back into text.
        partial_text: Prompt text (the partial conversation).
        max_length: Used both as the truncation limit for the encoded prompt
            and as the ``max_length`` passed to ``generate``.
        num_beams: Number of beams passed to ``generate``.
        **generate_kwargs: Extra keyword arguments forwarded unchanged to
            ``model.generate`` (e.g. ``no_repeat_ngram_size``). The output
            recorded in this notebook ends in long runs of "." -- such options
            may help; nothing is changed by default.

    Returns:
        The first returned sequence decoded with special tokens removed.
    """
    # Calling the tokenizer directly replaces the deprecated encode_plus;
    # the arguments are passed through unchanged.
    inputs = tokenizer(
        partial_text,
        add_special_tokens=True,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
    )
    # Follow the model's own device instead of relying on a notebook-level
    # global, so the function works with any model it is given.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=num_beams,
            **generate_kwargs,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)



In [16]:
# Generate one continuation per pair, using only the partial text as the prompt.
predicted_texts = []
for partial_prompt, _reference in pairs:
    generated = generate_entire_conversation(model, tokenizer, partial_prompt)
    print(generated)
    predicted_texts.append(generated)

Hello, I need help with my bank account. Sure, I can help you with that. Can you please provide your account number? My account number is 12345678. Thank you. Can you also provide your PIN? My PIN is 1234. Thank you. Please hold on while I verify your information. Okay, I'll wait....................................................................................................................................................................................................................................................................................................................................................................................................................................................
Hi, I received a message about a package delivery. Yes, we have a package for you. Can you confirm your address?, I have been waiting for you for a long time..............................................................................................................................

In [None]:
# =========================================
# 6) Evaluate with BLEU
# =========================================
# zip() stops at the shorter input, so an interrupted generation run would be
# scored on only the first few pairs without any warning. Fail loudly instead.
if len(predicted_texts) != len(pairs):
    raise ValueError(
        f"Expected {len(pairs)} predictions but found {len(predicted_texts)}; "
        "re-run the generation cell to completion before scoring."
    )

# Build the smoothing function once rather than once per sample.
bleu_smoothing = SmoothingFunction().method1

# Sentence-level BLEU of each generated text against the full reference conversation.
bleu_scores = [
    sentence_bleu(
        [nltk.word_tokenize(entire_text)],
        nltk.word_tokenize(predicted),
        smoothing_function=bleu_smoothing,
    )
    for (_, entire_text), predicted in zip(pairs, predicted_texts)
]

# np.mean([]) warns and returns nan; make the empty case explicit.
average_bleu = np.mean(bleu_scores) if bleu_scores else float("nan")
print(f"Average BLEU score: {average_bleu}")

In [None]:
# =========================================
# 7) Visualization
# =========================================
# Histogram of the per-sample BLEU scores, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(bleu_scores, bins=30, color="skyblue", edgecolor="black")
ax.set_title("Distribution of BLEU Scores")
ax.set_xlabel("BLEU Score")
ax.set_ylabel("Number of Samples")
plt.show()

# Optional: Print sample predictions with their BLEU scores
sample_rows = list(zip(pairs, predicted_texts))[:5]  # Show only first 5 predictions
for idx, (sample_pair, sample_pred) in enumerate(sample_rows):
    print(f"Partial Text: {sample_pair[0]}")
    print(f"Generated Full Text: {sample_pred}")
    print(f"Actual Full Text: {sample_pair[1]}")
    print(f"BLEU Score: {bleu_scores[idx]}\n")