In [None]:
# -----------------------------
# Extract Conversations
# -----------------------------
# Load the whole transcript into memory; the handle is closed on leaving the block.
with open("input.txt", mode="r", encoding="utf-8") as transcript_file:
    raw_text = transcript_file.read()

def extract_conversations(text, user_prefix="First Citizen", bot_prefix="All"):
    """Collect (user, bot) utterance pairs from adjacent speaker-prefixed lines.

    A pair is produced whenever a line that starts with ``user_prefix`` is
    immediately followed by a line that starts with ``bot_prefix``. The text
    after the prefix is kept with surrounding whitespace stripped; any
    punctuation that follows the prefix (for example a colon) is not removed
    and stays part of the returned text.

    Args:
        text: Full transcript, one utterance per newline-separated line.
        user_prefix: Speaker label that opens a user line.
        bot_prefix: Speaker label that opens a reply line.

    Returns:
        List of ``(user_line, bot_line)`` string tuples in order of
        appearance. Pairs in which either side is empty after stripping are
        dropped.
    """
    lines = text.split('\n')
    conversations = []
    # Walk consecutive line pairs directly instead of indexing with i / i + 1.
    for current, following in zip(lines, lines[1:]):
        if not (current.startswith(user_prefix) and following.startswith(bot_prefix)):
            continue
        # The prefix is known to sit at position 0, so slicing it off is
        # equivalent to splitting on its first occurrence.
        user_line = current[len(user_prefix):].strip()
        bot_line = following[len(bot_prefix):].strip()
        if user_line and bot_line:
            conversations.append((user_line, bot_line))
    return conversations

# Build the (user, bot) pairs from the transcript read above and report how many were found.
conversations = extract_conversations(raw_text)
print(f"Extracted {len(conversations)} user-bot pairs.")


In [None]:
class ConversationDataset(Dataset):
    """Next-token training pairs built from (user, bot) conversation turns.

    Every conversation is encoded as ``user ids + [<sep> id] + bot ids`` and
    capped at ``block_size + 1`` ids, so the input/target views returned by
    ``__getitem__`` are each at most ``block_size`` long.
    """

    def __init__(self, conversations, stoi, block_size=30):
        """Encode and store the conversations.

        Args:
            conversations: Iterable of ``(user, bot)`` string pairs.
            stoi: Mapping from lower-cased word to integer id; must contain
                the ``'<sep>'`` separator token.
            block_size: Maximum length of the input (and target) sequence.

        Raises:
            ValueError: If ``block_size`` is smaller than 1.
            KeyError: If ``'<sep>'`` is missing from ``stoi``.
        """
        if block_size < 1:
            raise ValueError(f"block_size must be >= 1, got {block_size}")

        self.stoi = stoi
        self.block_size = block_size
        self.data = []

        sep_id = stoi['<sep>']  # looked up once; fails fast when the token is absent
        for user, bot in conversations:
            tokens = self._encode(user) + [sep_id] + self._encode(bot)
            # One extra id is kept because x and y are the sequence shifted by one.
            tokens = tokens[:block_size + 1]
            # At least two ids are needed to form one (input, target) step.
            if len(tokens) >= 2:
                self.data.append(torch.tensor(tokens, dtype=torch.long))

    def _encode(self, text):
        """Map lower-cased whitespace tokens to ids, dropping unknown words."""
        return [self.stoi[w] for w in text.lower().split() if w in self.stoi]

    def __len__(self):
        """Number of stored sequences."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(x, y)`` where ``y`` is ``x`` shifted left by one token."""
        sequence = self.data[idx]
        return sequence[:-1], sequence[1:]

# Register the '<sep>' separator token in the vocabulary when it is absent.
# stoi, itos and vocab_size must already be defined when this cell runs.
# NOTE(review): any layer sized from vocab_size before this point will not
# cover sep_idx -- confirm such layers are created afterwards.
if '<sep>' not in stoi:
    sep_idx = len(stoi)
    stoi.update({'<sep>': sep_idx})
    itos.update({sep_idx: '<sep>'})
    vocab_size = vocab_size + 1

# One conversation per batch: sequences differ in length and no collate_fn is given.
conversation_dataset = ConversationDataset(conversations=conversations, stoi=stoi)
conversation_loader = DataLoader(dataset=conversation_dataset, shuffle=True, batch_size=1)


In [None]:
# -----------------------------
# Training loop
# -----------------------------
def train(model, loader, optimizer, device='cpu', epochs=5):
    """Run a basic training loop and report the loss as it goes.

    ``model`` is called as ``model(x, targets=y)`` and must return a
    ``(logits, loss)`` pair; the returned loss is back-propagated directly.

    Args:
        model: Module to optimise; it is switched to training mode here.
        loader: Iterable yielding ``(x, y)`` tensor batches.
        optimizer: Optimizer already bound to the parameters to update.
        device: Device the batches are moved to. The model itself is not
            moved by this function.
        epochs: Number of passes over ``loader``.

    Returns:
        List holding the mean batch loss of each epoch (``nan`` for an epoch
        in which ``loader`` produced no batches).
    """
    model.train()
    epoch_losses = []

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for step, (x, y) in enumerate(loader):
            x, y = x.to(device), y.to(device)
            _, loss = model(x, targets=y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1

            if step % 50 == 0:
                print(f"Epoch {epoch} Step {step} Loss: {batch_loss:.4f}")

        # Average over the batches actually seen: avoids a ZeroDivisionError on
        # an empty loader and works for iterables that do not define len().
        average_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Epoch {epoch} Average Loss: {average_loss:.4f}")
        epoch_losses.append(average_loss)

    return epoch_losses


In [None]:
# -----------------------------
# 1. Load data & build vocab
# -----------------------------

# Read the corpus once, lower-cased so vocabulary lookups ignore case.
with open("input.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()

words = text.split()          # whitespace-delimited word tokens
sentences = text.split('\n')  # one entry per line of the file

# Ids follow the sorted order of the distinct words, so the mapping is deterministic.
unique_words = sorted(set(words))
itos = dict(enumerate(unique_words))
stoi = {word: idx for idx, word in itos.items()}
vocab_size = len(stoi)
embedding_dim = 16

print(f"Vocabulary size: {vocab_size}, Total sentences: {len(sentences)}")

# -----------------------------
# 2. Positional encoding
# -----------------------------

def positional_encodings(sequence_length, embedding_size):
    """Build the sinusoidal positional-encoding table.

    Columns are grouped in (sin, cos) pairs that share one frequency:

        pe[pos, 2k]     = sin(pos / 10000 ** (2k / embedding_size))
        pe[pos, 2k + 1] = cos(pos / 10000 ** (2k / embedding_size))

    The exponent is derived from the pair index ``k = column // 2``; using the
    raw column index there would apply the factor 2 twice and give the sin and
    cos halves of a pair different frequencies.

    Args:
        sequence_length: Number of positions (rows).
        embedding_size: Size of each encoding vector (columns).

    Returns:
        Tensor of shape ``(sequence_length, embedding_size)``.
    """
    # One exponent per column; both columns of a (sin, cos) pair get the same value.
    exponents = torch.tensor(
        [2 * (column // 2) / embedding_size for column in range(embedding_size)],
        dtype=torch.float64,
    )
    positions = torch.arange(sequence_length, dtype=torch.float64).unsqueeze(1)
    # Broadcasts (rows, 1) against (columns,); evaluated in float64 and cast
    # on assignment below.
    angles = positions / torch.pow(10000.0, exponents)

    pe = torch.zeros(sequence_length, embedding_size)
    pe[:, 0::2] = torch.sin(angles[:, 0::2]).to(pe.dtype)
    pe[:, 1::2] = torch.cos(angles[:, 1::2]).to(pe.dtype)
    return pe

# -----------------------------
# 3. Dataset
# -----------------------------

class WordLevelDataset(Dataset):
    """Dataset of raw sentence strings that contain at least two words."""

    def __init__(self, sentences):
        """Keep only the sentences with two or more whitespace-separated words."""
        kept = []
        for sentence in sentences:
            if len(sentence.split()) < 2:
                continue
            kept.append(sentence)
        self.sentences = kept

    def __len__(self):
        """Number of retained sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the raw sentence string stored at ``idx``."""
        return self.sentences[idx]

# One sentence per batch, visited in shuffled order.
dataset = WordLevelDataset(sentences=sentences)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=1)

# -----------------------------
# 4. Shared embedding
# -----------------------------

# Module-level embedding table; embedding_gen looks tokens up in it.
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

def embedding_gen(sentence):
    """Tokenise a sentence and look up its word embeddings.

    The sentence is lower-cased and split on whitespace; words that are not
    in the module-level ``stoi`` vocabulary are dropped.

    Args:
        sentence: Raw sentence string.

    Returns:
        Tuple ``(input_embeddings, input_tensor)``: the rows of the
        module-level ``embedding_layer`` for the known words, and the int64
        tensor of their ids. Both are empty along the first dimension when no
        word is known.
    """
    tokens = sentence.lower().split()
    indices = [stoi[token] for token in tokens if token in stoi]
    # Create the ids on the embedding table's own device: a CPU index tensor
    # fails with a device mismatch once the table has been moved to CUDA.
    input_tensor = torch.tensor(
        indices, dtype=torch.long, device=embedding_layer.weight.device
    )
    input_embeddings = embedding_layer(input_tensor)
    return input_embeddings, input_tensor

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Model().to(device)
# Move the shared embedding table once, before the optimizer is built, rather
# than on every training step.
# NOTE(review): embedding_gen must create its index tensor on this same
# device, otherwise the embedding lookup fails on CUDA.
embedding_layer.to(device)
optimizer = torch.optim.Adam(
    list(model.parameters()) + list(embedding_layer.parameters()), lr=1e-3
)
loss_fn = nn.CrossEntropyLoss()
loss_fn.to(device)

for epoch in range(5):
    model.train()
    total_loss = 0.0
    num_steps = 0  # sentences actually trained on; skipped ones are not counted

    for batch in loader:
        sentence = batch[0]  # batch_size is 1, so the batch holds a single string
        input_embeddings, input_tensor = embedding_gen(sentence)
        input_embeddings = input_embeddings.to(device)
        input_tensor = input_tensor.to(device)

        if len(input_tensor) < 2:
            continue  # fewer than two known words: nothing to predict

        pos_enc = positional_encodings(input_embeddings.size(0), embedding_dim).to(device)
        input_combined = input_embeddings + pos_enc

        # Forward
        logits = model(input_combined, input_combined, input_combined, input_combined)

        # Next-word prediction: the output at position t is scored against token t + 1.
        target = input_tensor[1:]
        pred = logits[0, :-1]

        # CrossEntropyLoss needs per-class scores and integer class ids, so the
        # target is the token ids themselves, not their embedding vectors.
        # NOTE(review): Model is not visible here. If it emits embedding-sized
        # vectors instead of vocabulary logits, they are projected onto the
        # vocabulary with the tied embedding weights -- confirm which applies.
        if pred.size(-1) != embedding_layer.num_embeddings:
            pred = pred @ embedding_layer.weight.T
        loss = loss_fn(pred, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_steps += 1

    # Divide by the steps actually taken; len(loader) would also count skipped sentences.
    print(f"Epoch {epoch+1} | Avg Loss: {total_loss / max(num_steps, 1):.4f}")