In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import glob

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [3]:
# Transcript CSV inside the Kaggle input mount.
data_path = "/kaggle/input/new-york-stories-dataset/himym_season7.csv"
all_files = [data_path]  # single-file case; list more CSVs here to read several

# Gather every non-null dialogue line from every file, in file order.
# NOTE(review): only the dialogue column is kept; the speaker in the 'name'
# column is dropped, so prompts such as "Ted:" do not correspond to a pattern
# in the training text -- confirm this is intended.
all_texts = []
for f in all_files:
    df = pd.read_csv(f)
    print(f"Loaded {f}, shape={df.shape}, columns={df.columns.tolist()}")

    # Column preference: 'line', then 'Line', otherwise the first column.
    text_col = next(
        (candidate for candidate in ('line', 'Line') if candidate in df.columns),
        df.columns[0],
    )
    lines = df[text_col].dropna().astype(str).tolist()
    all_texts += lines

# One newline-separated string is the training corpus.
script_text = "\n".join(all_texts)
print("Total lines of dialogue:", len(all_texts))
print("Characters in text:", len(script_text))


Loaded /kaggle/input/new-york-stories-dataset/himym_season7.csv, shape=(9768, 3), columns=['episode', 'name', 'line']
Total lines of dialogue: 9768
Characters in text: 368905


In [4]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
# (A word-level vocabulary could be substituted here.)
chars = sorted(set(script_text))
vocab_size = len(chars)

# Lookup tables between integer ids and characters, in both directions.
idx2char = dict(enumerate(chars))
char2idx = {ch: i for i, ch in idx2char.items()}

print("Vocab size:", vocab_size)

Vocab size: 88


In [5]:
# Encode the whole corpus as an array of vocabulary ids, one per character.
encoded_text = np.array(list(map(char2idx.__getitem__, script_text)))

# Hyperparameters
seq_length = 100   # characters per training window
batch_size = 64    # windows per mini-batch

In [6]:
class ScriptDataset(torch.utils.data.Dataset):
    """Sliding-window dataset for next-token prediction.

    Item ``i`` is the pair ``(x, y)`` where ``x`` is ``data[i : i + seq_length]``
    and ``y`` is the same window shifted one position to the right, so ``y[t]``
    is the token that follows ``x[t]``.

    Args:
        data: Sliceable sequence of integer token ids (a NumPy array in this
            notebook).
        seq_length: Number of tokens in each window; must be at least 1.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """

    def __init__(self, data, seq_length):
        if seq_length < 1:
            raise ValueError(f"seq_length must be at least 1, got {seq_length}")
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        # Clamp at zero: a corpus no longer than one window would otherwise give
        # a negative length, which makes len() itself raise.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` LongTensor pair for window ``idx``.

        Negative indices count from the end. An out-of-range index raises
        ``IndexError`` instead of silently returning truncated tensors; this is
        also what lets plain ``for x, y in dataset`` iteration terminate.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if idx < 0 or idx >= size:
            raise IndexError(f"index out of range for dataset of length {size}")

        stop = idx + self.seq_length
        x = self.data[idx:stop]
        y = self.data[idx + 1:stop + 1]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

# Cut the encoded corpus into overlapping windows and serve them in shuffled mini-batches.
dataset = ScriptDataset(data=encoded_text, seq_length=seq_length)
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
)


In [7]:
class ScriptRNN(nn.Module):
    """Token-level language model: embedding -> stacked LSTM -> linear layer producing vocabulary logits.

    Args:
        vocab_size: Number of distinct tokens (embedding rows and output logits).
        embed_dim: Width of the token embeddings fed to the LSTM.
        hidden_dim: Width of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=2):
        super().__init__()
        # Sizes are kept as attributes; init_hidden reads num_layers and hidden_dim.
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Map token ids ``(batch, seq_length)`` to logits ``(batch, seq_length, vocab_size)``.

        ``hidden`` is an optional ``(h, c)`` LSTM state; the state after the
        last time step is returned alongside the logits.
        """
        embedded = self.embed(x)                       # (batch, seq_length, embed_dim)
        features, hidden = self.rnn(embedded, hidden)
        logits = self.fc(features)                     # (batch, seq_length, vocab_size)
        return logits, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero ``(h, c)`` state matching the dtype and device of the model's parameters."""
        reference = next(self.parameters())
        state_shape = (self.num_layers, batch_size, self.hidden_dim)
        # One tensor for the hidden state, one for the cell state.
        return tuple(reference.new_zeros(state_shape) for _ in range(2))


In [8]:
# Two-layer LSTM language model, placed on the selected device.
model = ScriptRNN(vocab_size, embed_dim=256, hidden_dim=512, num_layers=2)
model = model.to(device)

# Adam optimiser over all model parameters, with a cross-entropy objective.
optimizer = optim.Adam(model.parameters(), lr=0.002)
criterion = nn.CrossEntropyLoss()


In [9]:
epochs = 20
log_every = 50  # print the current batch loss once every this many batches

for epoch in range(epochs):
    # Make sure the model is in training mode even if an earlier cell left it in
    # eval mode; cuDNN refuses to run RNN backward passes in eval mode.
    model.train()
    total_loss = 0.0

    for batch, (x, y) in enumerate(dataloader):
        x, y = x.to(device), y.to(device)

        # The last batch of an epoch can be smaller than the rest. The size is
        # kept under its own name so the notebook-level `batch_size`
        # hyperparameter used to build the DataLoader is not overwritten.
        current_batch_size = x.size(0)
        hidden = model.init_hidden(current_batch_size)  # fresh zero state for every batch

        optimizer.zero_grad()
        # The returned state is discarded: windows are shuffled, so no state is
        # carried from one batch to the next.
        out, _ = model(x, hidden)

        # CrossEntropyLoss wants the class dimension second: (batch, vocab, seq).
        loss = criterion(out.transpose(1, 2), y)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        if batch % log_every == 0:
            print(f"Epoch {epoch+1}/{epochs}, Batch {batch}, Loss: {batch_loss:.4f}")

    # Guard the division so an empty dataloader reports 0 instead of raising.
    avg_loss = total_loss / max(len(dataloader), 1)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")



Epoch 1/20, Batch 0, Loss: 4.4718
Epoch 1/20, Batch 50, Loss: 2.0880
Epoch 1/20, Batch 100, Loss: 1.7567
Epoch 1/20, Batch 150, Loss: 1.5908
Epoch 1/20, Batch 200, Loss: 1.4865
Epoch 1/20, Batch 250, Loss: 1.4683
Epoch 1/20, Batch 300, Loss: 1.4066
Epoch 1/20, Batch 350, Loss: 1.3467
Epoch 1/20, Batch 400, Loss: 1.3176
Epoch 1/20, Batch 450, Loss: 1.2995
Epoch 1/20, Batch 500, Loss: 1.2245
Epoch 1/20, Batch 550, Loss: 1.1928
Epoch 1/20, Batch 600, Loss: 1.1259
Epoch 1/20, Batch 650, Loss: 1.1404
Epoch 1/20, Batch 700, Loss: 1.1253
Epoch 1/20, Batch 750, Loss: 1.0882
Epoch 1/20, Batch 800, Loss: 1.0551
Epoch 1/20, Batch 850, Loss: 1.0471
Epoch 1/20, Batch 900, Loss: 0.9514
Epoch 1/20, Batch 950, Loss: 0.9610
Epoch 1/20, Batch 1000, Loss: 0.9299
Epoch 1/20, Batch 1050, Loss: 0.9063
Epoch 1/20, Batch 1100, Loss: 0.8713
Epoch 1/20, Batch 1150, Loss: 0.8511
Epoch 1/20, Batch 1200, Loss: 0.8224
Epoch 1/20, Batch 1250, Loss: 0.7730
Epoch 1/20, Batch 1300, Loss: 0.7793
Epoch 1/20, Batch 1350, 

In [10]:
def generate_text(model, start_str="Ted:", length=500, temperature=1.0):
    """Sample text from ``model`` one character at a time.

    The prompt is fed through the model in one pass to build up the recurrent
    state; after that each sampled character is fed back as the next input.
    Characters are drawn with ``np.random.choice`` from the softmax of the
    final-step logits.

    Args:
        model: Module called as ``model(ids, hidden)`` and returning
            ``(logits, hidden)`` with logits shaped (batch, seq, vocab).
        start_str: Non-empty prompt; every character must be a key of the
            notebook-level ``char2idx`` table.
        length: Number of characters to sample after the prompt.
        temperature: Positive divisor applied to the logits before the softmax.
            ``1.0`` leaves the distribution unchanged, smaller values make
            sampling greedier, larger values make it more random.

    Returns:
        The prompt followed by ``length`` sampled characters.

    Raises:
        ValueError: If ``start_str`` is empty or ``temperature`` is not positive.
        KeyError: If ``start_str`` contains a character missing from ``char2idx``.
    """
    if not start_str:
        raise ValueError("start_str must contain at least one character")
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")
    unknown = sorted({ch for ch in start_str if ch not in char2idx})
    if unknown:
        raise KeyError(f"start_str contains characters outside the vocabulary: {unknown}")

    # Run on whatever device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    # Remember the current mode so sampling does not leave the model in eval
    # mode as a side effect (which would break a later training run).
    was_training = model.training
    model.eval()

    result = list(start_str)
    try:
        input_eval = torch.tensor(
            [[char2idx[ch] for ch in start_str]], dtype=torch.long, device=run_device
        )
        hidden = None  # the model starts from its default (zero) state

        with torch.no_grad():
            for _ in range(length):
                out, hidden = model(input_eval, hidden)
                # Only the logits of the last time step predict the next character.
                probs = torch.softmax(out[0, -1] / temperature, dim=0).cpu().numpy()
                idx = int(np.random.choice(len(probs), p=probs))
                result.append(idx2char[idx])

                # Feed the sampled character back in; `hidden` carries the context.
                input_eval = torch.tensor([[idx]], dtype=torch.long, device=run_device)
    finally:
        model.train(was_training)

    return "".join(result)

In [11]:
# Sample a passage from the trained model, seeded with the prompt "Barney:".
sample = generate_text(model, start_str="Barney:")
print(sample)

Barney: swick of "To Mey?
His timing up high.
Why doing him--
not two shoors and I velit?
He's a raid backly and... yaket together!
Whoa!
TV Crissed!
Party purples!
I'm having jutt gabor.
Lartmas, is, and you find my drubing?
Oh, I've got anything; I wenld leat being.
(chuck straid.
Yeah.
Don't go.
Um, I.N
No sais, "Marshay, gulast.
Techissignon theathers do that sole-- nigh.
It's know. Just something you lat some read.
I nained. Why? It's never the okay.
Come on, Marv?
Hi, I just tanked into fine. For
