In [5]:
!pip install pandas torch





In [6]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import re
import math
from collections import Counter

In [7]:
########################################
# 1) Load and Aggregate Conversations
########################################

def load_and_aggregate(csv_path):
    """
    Reads a CSV file with columns:
      - CONVERSATION_ID
      - CONVERSATION_STEP
      - TEXT
      - (other columns ignored)

    Returns:
      A list of strings, where each string represents one
      entire conversation joined by newlines.
    """
    df = pd.read_csv(csv_path)
    grouped = df.groupby("CONVERSATION_ID")

    conversation_texts = []
    for convo_id, group in grouped:
        group_sorted = group.sort_values("CONVERSATION_STEP")
        texts = group_sorted["TEXT"].tolist()
        # Join all steps with line breaks (or another delimiter)
        full_convo = "\n".join(texts)
        conversation_texts.append(full_convo)

    return conversation_texts

In [8]:
########################################
# 2) Basic Tokenization & Vocab Building
########################################

def simple_tokenize(text):
    """
    Splits on whitespace and basic punctuation.
    Feel free to customize or replace with a more advanced tokenizer.
    """
    # Lowercase
    text = text.lower()
    # Remove non-alphanumeric (keep basic punctuation if you prefer)
    text = re.sub(r"[^a-z0-9\s.,!?']", "", text)
    # Split on whitespace
    tokens = text.split()
    return tokens

def build_vocab(all_texts, min_freq=1):
    """
    Builds a vocabulary from a list of text strings.
    Args:
      - all_texts: list of conversation strings
      - min_freq: minimum frequency a token must have to appear in vocab
    Returns:
      - stoi (dict): string-to-index mapping
      - itos (list): index-to-string list
    """
    counter = Counter()
    
    for txt in all_texts:
        tokens = simple_tokenize(txt)
        counter.update(tokens)

    # Sort by frequency
    sorted_tokens = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    
    # Filter tokens below min_freq
    filtered_tokens = [t for t, c in sorted_tokens if c >= min_freq]
    
    # Special tokens
    # For a language model, at least need PAD and UNK, ideally BOS/EOS as well
    special_tokens = ["<pad>", "<unk>"]
    
    itos = special_tokens + filtered_tokens
    stoi = {token: idx for idx, token in enumerate(itos)}
    
    return stoi, itos


In [9]:
########################################
# 3) Dataset Class
########################################

class ConversationLSTMDataset(Dataset):
    """
    Splits each conversation text into tokens, then divides them into
    fixed-length sequences for language modeling.

    For example, if sequence length = 5,
    tokens: [w1, w2, w3, w4, w5, w6, w7, w8]
    We get samples: 
      input = [w1, w2, w3, w4, w5], target = [w2, w3, w4, w5, w6]
      input = [w2, w3, w4, w5, w6], target = [w3, w4, w5, w6, w7]
      ...
    This is a sliding window approach.
    """

    def __init__(self, conversation_texts, stoi, seq_len=10):
        self.stoi = stoi
        self.seq_len = seq_len
        self.samples = []
        
        for text in conversation_texts:
            tokens = simple_tokenize(text)
            # Convert tokens -> IDs
            token_ids = [self.stoi.get(t, self.stoi["<unk>"]) for t in tokens]
            
            # Create (input, target) pairs in a sliding window
            for i in range(len(token_ids) - seq_len):
                input_seq = token_ids[i : i + seq_len]
                target_seq = token_ids[i + 1 : i + seq_len + 1]
                self.samples.append((input_seq, target_seq))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        input_seq, target_seq = self.samples[idx]
        return torch.tensor(input_seq, dtype=torch.long), torch.tensor(target_seq, dtype=torch.long)



In [10]:
#######################################
# 4) Collate Function
########################################

def collate_fn(batch):
    """
    batch is a list of (input_seq, target_seq) tuples.
    We'll stack them into tensors of shape (batch_size, seq_len).
    """
    inputs = [item[0] for item in batch]
    targets = [item[1] for item in batch]

    inputs = torch.stack(inputs)   # (batch_size, seq_len)
    targets = torch.stack(targets) # (batch_size, seq_len)
    
    return inputs, targets

In [11]:
########################################
# 5) Define the LSTM Model
########################################

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=1):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """
        x: (batch_size, seq_len)
        hidden: (h_0, c_0) if provided
        """
        emb = self.embedding(x)  # (batch_size, seq_len, embed_dim)
        out, hidden = self.lstm(emb, hidden)  # (batch_size, seq_len, hidden_dim)
        logits = self.fc(out)   # (batch_size, seq_len, vocab_size)
        return logits, hidden

In [12]:
########################################
# 6) Training Loop
########################################

def train_lstm_model(model, dataloader, optimizer, criterion, device="cpu", num_epochs=5):
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass
            logits, _ = model(inputs)
            # logits shape: (batch_size, seq_len, vocab_size)
            # targets shape: (batch_size, seq_len)

            # We need to reshape for cross-entropy:
            logits_reshaped = logits.view(-1, logits.size(-1))   # (batch_size * seq_len, vocab_size)
            targets_reshaped = targets.view(-1)                  # (batch_size * seq_len)

            loss = criterion(logits_reshaped, targets_reshaped)

            # Backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {avg_loss:.4f}")



In [13]:
########################################
# 7) Text Generation (Inference)
########################################

def generate_text(model, stoi, itos, prompt, max_tokens=20, device="cpu"):
    """
    Given a prompt (string), tokenizes it, feeds into the model, and
    repeatedly generates the next token in a greedy way.
    """
    model.eval()
    tokens = simple_tokenize(prompt)
    input_ids = [stoi.get(t, stoi["<unk>"]) for t in tokens]

    # Convert to tensor of shape (1, length)
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)

    # We'll keep track of the hidden state
    hidden = None

    generated_tokens = tokens[:]  # Copy of the original prompt tokens

    with torch.no_grad():
        for _ in range(max_tokens):
            # Forward pass
            logits, hidden = model(input_tensor, hidden)
            # Get the last time step's logits
            last_logits = logits[:, -1, :]  # shape: (1, vocab_size)

            # Greedy pick
            next_token_id = torch.argmax(last_logits, dim=-1).item()
            next_token_word = itos[next_token_id]

            # Append to generated sequence
            generated_tokens.append(next_token_word)

            # Prepare next input
            input_tensor = torch.tensor([[next_token_id]], dtype=torch.long).to(device)

    # Join tokens into string
    return " ".join(generated_tokens)



In [20]:
def main():
    # 1. Load & aggregate your CSV dataset
    df = pd.read_csv("/Users/ashansubodha/Desktop/VOIP Vishing/conversation-prediction/FINAL_DATASET2.csv")
    conversation_texts = load_and_aggregate(csv_path)
    print("Loaded conversations:", len(conversation_texts))

    # 2. Build vocab
    stoi, itos = build_vocab(conversation_texts, min_freq=1)
    print("Vocab size:", len(stoi))

    # 3. Create Dataset & DataLoader
    seq_len = 10   # sliding window size
    dataset = ConversationLSTMDataset(conversation_texts, stoi, seq_len=seq_len)
    print("Number of samples (input-target pairs):", len(dataset))

    batch_size = 16
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    # 4. Instantiate the LSTM Model
    vocab_size = len(stoi)
    embed_dim = 100
    hidden_dim = 128
    num_layers = 1

    model = LSTMModel(vocab_size, embed_dim, hidden_dim, num_layers=num_layers)

    # 5. Define optimizer & loss
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    # 6. Train
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_lstm_model(model, dataloader, optimizer, criterion, device=device, num_epochs=5)

    # 7. Text Generation Demo
    prompt = "Hello, this is the personal assistant of your name"
    generated = generate_text(model, stoi, itos, prompt, max_tokens=20, device=device)
    print("\n=== Generated Text ===")
    print(generated)


In [19]:
if __name__ == "__main__":
    main()

NameError: name 'csv_path' is not defined