In [8]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
import pickle  # for saving tokenizer or other objects

In [10]:
# ========================================
# 8) Reusable Inference Code
# ========================================
# Suppose we want to load the model + tokenizer in a new environment or script 
# and generate the remainder from a partial snippet.

def load_model_and_tokenizer(
    model_dir,
    d_model=128,
    nhead=4,
    num_encoder_layers=2,
    num_decoder_layers=2,
):
    """Load the pickled tokenizer and the saved model weights from ``model_dir``.

    Two files are read from ``model_dir``: ``tokenizer.pkl`` (a pickled
    tokenizer object exposing ``vocab_size``) and ``model_state.pt`` (a
    ``state_dict`` for ``TransformerEncoderDecoder``).

    Args:
        model_dir: Directory containing ``tokenizer.pkl`` and ``model_state.pt``.
        d_model, nhead, num_encoder_layers, num_decoder_layers: Architecture
            hyperparameters forwarded to ``TransformerEncoderDecoder``. They
            must match the values used when the checkpoint was saved, otherwise
            ``load_state_dict`` will reject the weights. The defaults are the
            values this notebook previously hard-coded.

    Returns:
        ``(model, tokenizer)``. The weights are mapped to CPU and the model is
        put in eval mode; move it with ``model.to(...)`` if needed.

    Raises:
        AttributeError: If the pickle refers to a tokenizer class that is not
            defined in the current session (e.g. a fresh kernel where the
            class-definition cell has not been run).
    """
    tokenizer_path = os.path.join(model_dir, "tokenizer.pkl")

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load tokenizer files that you created yourself or otherwise trust.
    try:
        with open(tokenizer_path, "rb") as f:
            loaded_tokenizer = pickle.load(f)
    except AttributeError as exc:
        # pickle stores only a reference (module + class name) to the
        # tokenizer's class, so that class must be importable under the same
        # name at load time. Keep the exception type, but explain the remedy.
        raise AttributeError(
            f"{exc}. The tokenizer pickle at {tokenizer_path!r} refers to a "
            "class that is not defined in this session. Define or import the "
            "tokenizer class (e.g. SimpleTokenizer) under the same module name "
            "it had when it was pickled, then call load_model_and_tokenizer() "
            "again."
        ) from exc

    # Re-create the architecture; vocab size comes from the loaded tokenizer.
    model = TransformerEncoderDecoder(
        vocab_size=loaded_tokenizer.vocab_size,
        d_model=d_model,
        nhead=nhead,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
    )

    # Load the weights onto CPU regardless of the device they were saved from.
    model_path = os.path.join(model_dir, "model_state.pt")
    state_dict = torch.load(model_path, map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    model.eval()
    return model, loaded_tokenizer

def generate_remainder(model, tokenizer, partial_text, max_len=50):
    """Greedily decode a continuation for ``partial_text``.

    The partial text is encoded (with ``<BOS>`` prepended) and used as the
    source sequence. The decoder input starts as a single ``<BOS>`` token and
    grows by one arg-max token per step until ``<EOS>`` is produced or
    ``max_len`` tokens have been generated.

    Args:
        model: Callable as ``model(src, dec_in)``; the result is indexed as
            ``logits[0, -1, :]``, so it is presumably shaped
            (batch, seq, vocab) -- confirm against the model definition.
        tokenizer: Object providing ``encode(text, add_bos=True)``,
            ``decode(ids)`` and a ``word2idx`` mapping that contains
            ``"<BOS>"`` and ``"<EOS>"``.
        partial_text: The text whose continuation should be predicted.
        max_len: Maximum number of tokens to generate.

    Returns:
        ``tokenizer.decode(...)`` of the generated ids (everything after the
        initial ``<BOS>``). If ``<EOS>`` was generated, its id is included in
        the list handed to ``decode``.
    """
    model.eval()

    # Create every tensor on the device the model lives on. Building them on
    # CPU (the default) fails as soon as the model has been moved elsewhere,
    # e.g. by `model.to(device)` with a CUDA device.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    bos_id = tokenizer.word2idx["<BOS>"]
    eos_id = tokenizer.word2idx["<EOS>"]

    # Source sequence: the encoded partial text, as a batch of one.
    partial_ids = tokenizer.encode(partial_text, add_bos=True)  # add <BOS>
    src = torch.tensor([partial_ids], dtype=torch.long, device=model_device)

    # Decoder input starts with <BOS> only.
    dec_in = torch.tensor([[bos_id]], dtype=torch.long, device=model_device)

    with torch.no_grad():
        for _ in range(max_len):
            logits = model(src, dec_in)
            # Greedy choice from the last time-step of the single batch item.
            next_id = torch.argmax(logits[0, -1, :]).item()

            next_token = torch.tensor([[next_id]], dtype=torch.long, device=model_device)
            dec_in = torch.cat([dec_in, next_token], dim=1)

            # Stop once <EOS> has been appended.
            if next_id == eos_id:
                break

    # Decode the generated tokens after the leading <BOS>.
    generated_ids = dec_in[0, 1:].tolist()
    return tokenizer.decode(generated_ids)


In [11]:
# Example of usage:

# (A) In the same script, we can simply reuse the 'model' and 'tokenizer' we already have.
# Or let's simulate how we'd do it from a fresh environment:

# NOTE: the tokenizer is stored with pickle, so the tokenizer class
# (SimpleTokenizer) and TransformerEncoderDecoder must already be defined or
# imported in this kernel before running this cell. Otherwise unpickling fails
# with "AttributeError: Can't get attribute 'SimpleTokenizer' on <module '__main__'>".

# Directory holding tokenizer.pkl and model_state.pt. Set the CONVO_MODEL_DIR
# environment variable to point elsewhere; the default is the original
# machine-specific location.
MODEL_DIR = os.environ.get(
    "CONVO_MODEL_DIR",
    "C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/research/3. MidEvaluation/encoder-decoder/gpt2_saved_conversation_model",
)

loaded_model, loaded_tokenizer = load_model_and_tokenizer(MODEL_DIR)
# NOTE(review): `device` is not defined in the cells shown here -- presumably
# it is set in an earlier cell; confirm it exists on a fresh kernel.
loaded_model.to(device)

# Now let's pick a partial snippet from the dataset or custom:
test_partial = "Hello, I'm Sam. I saw an ad about a photography workshop."
print("=== PARTIAL ===")
print(test_partial)

remainder_pred = generate_remainder(loaded_model, loaded_tokenizer, test_partial)
print("\n=== PREDICTED REMAINDER ===")
print(remainder_pred)


AttributeError: Can't get attribute 'SimpleTokenizer' on <module '__main__'>