In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time

# 1. SAMPLE DATA
# Toy corpus; the model below is trained to continue this text word by word.
data = """
What is Python? Python is a popular programming language used for data science.
Data science involves statistics, programming, and machine learning.
Machine learning is a subset of artificial intelligence.
Artificial intelligence aims to create smart machines.
How to learn data science? You should start by learning Python and libraries like Numpy.
Numpy is used for numerical computing in Python.
Pandas is a great tool for data manipulation and analysis.
Machine learning models require clean data for training.
Deep learning uses neural networks like LSTM for sequence prediction.
LSTMs are great for natural language processing tasks.
Natural language processing helps computers understand human text.
The course covers deep learning and neural networks in detail.
You will build projects using PyTorch and Scikit-Learn.
PyTorch is a flexible deep learning framework developed by Meta.
Keep practicing to master the art of data science and AI.
"""

# 2. PREPROCESSING
# Id 0 is reserved for padding so that the zeros used for left-padding never
# alias a real word of the corpus.
PAD_TOKEN = "<pad>"
PAD_IDX = 0

text = data.lower()
words = text.split()  # whitespace tokens; punctuation stays attached to words
vocab = [PAD_TOKEN] + sorted(set(words))
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for i, word in enumerate(vocab)}
vocab_size = len(vocab)  # counts the padding entry as well

# Create input sequences (N-Grams)
# Every prefix of the corpus holding at least two words is one training
# example. Words are mapped to ids once and the prefixes are sliced from that.
token_ids = [word_to_idx[w] for w in words]
input_sequences = [token_ids[:end] for end in range(2, len(token_ids) + 1)]

# Padding
# Left-pad with PAD_IDX so every row has max_len entries and the newest word
# of each prefix always sits in the last column.
max_len = max(len(seq) for seq in input_sequences)
padded_sequences = np.array(
    [[PAD_IDX] * (max_len - len(seq)) + seq for seq in input_sequences]
)

# Split into Features (X) and Label (y)
# X: all columns but the last (the context); y: the last column (next word).
X = torch.tensor(padded_sequences[:, :-1], dtype=torch.long)
y = torch.tensor(padded_sequences[:, -1], dtype=torch.long)

# 3. MODEL ARCHITECTURE
class LSTMModel(nn.Module):
    """Next-word predictor: embedding -> single LSTM -> linear output layer.

    Args:
        vocab_size: number of token ids; sizes both the embedding table and
            the output logits.
        embed_dim: width of each token embedding.
        hidden_dim: width of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim, hidden_size=hidden_dim, batch_first=True
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return vocabulary logits for the token that follows each sequence.

        Args:
            x: integer token ids laid out as (batch, sequence), matching the
                LSTM's ``batch_first=True`` setting.

        Returns:
            One row of ``vocab_size`` logits per sequence in the batch.
        """
        token_vectors = self.embedding(x)
        per_step_output, _ = self.lstm(token_vectors)
        # Only the output at the final time step feeds the classifier.
        final_step = per_step_output[:, -1, :]
        return self.fc(final_step)

# Initialize Model
EMBED_DIM = 100   # width of each word embedding
HIDDEN_DIM = 150  # width of the LSTM hidden state
model = LSTMModel(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
)
# Cross-entropy over the vocabulary logits; Adam updates every model parameter.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# 4. TRAINING LOOP
# Full-batch training: each epoch is a single forward/backward pass over all
# of X and y followed by one optimizer step.
NUM_EPOCHS = 100  # single source of truth for the loop bound and the log line
LOG_EVERY = 20    # print the loss once every this many epochs

print("--- Starting Training ---")
model.train()  # nothing inside the loop changes the mode, so set it once
for epoch in range(NUM_EPOCHS):
    optimizer.zero_grad()  # clear gradients left over from the previous step

    output = model(X)
    loss = criterion(output, y)

    loss.backward()
    optimizer.step()

    if (epoch + 1) % LOG_EVERY == 0:
        print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Loss: {loss.item():.4f}")

# 5. PREDICTION FUNCTION
def predict_next_words(seed_text: str, next_words: int = 5) -> str:
    """Greedily extend ``seed_text`` with the model's most likely next words.

    Reads the module-level ``model``, ``word_to_idx``, ``idx_to_word`` and
    ``max_len``, and switches ``model`` to evaluation mode.

    Args:
        seed_text: starting text. It is lower-cased and split on whitespace;
            words missing from ``word_to_idx`` are ignored.
        next_words: number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. ``seed_text`` is returned unchanged when it contains no known
        word or when ``next_words`` is not positive.
    """
    model.eval()
    context_len = max_len - 1  # width of the inputs the model was trained on

    # Tokenize the seed once; generated ids are appended as we go, so the
    # growing string never has to be re-split.
    tokens = [
        word_to_idx[w] for w in seed_text.lower().split() if w in word_to_idx
    ]
    if not tokens:
        # Handle unknown words: there is no context to predict from.
        return seed_text

    generated = []
    with torch.no_grad():
        for _ in range(next_words):
            # Keep only the most recent context_len tokens so the input never
            # grows beyond the training width, then left-pad with zeros.
            window = tokens[-context_len:]
            padded_tokens = [0] * (context_len - len(window)) + window
            input_tensor = torch.tensor([padded_tokens], dtype=torch.long)

            # Greedy decoding: take the highest-scoring id.
            next_word_idx = torch.argmax(model(input_tensor), dim=1).item()
            tokens.append(next_word_idx)
            generated.append(idx_to_word[next_word_idx])

    return " ".join([seed_text, *generated])

# 6. TEST IT
# Generate a 20-word continuation of a short prompt and show it next to the
# prompt itself.
print("\n--- Testing Model ---")
test_sentence = "What is"
prediction = predict_next_words(seed_text=test_sentence, next_words=20)
print(f"Input: '{test_sentence}'\nOutput: '{prediction}'")

--- Starting Training ---
Epoch 20/100 | Loss: 0.0135
Epoch 40/100 | Loss: 0.0009
Epoch 60/100 | Loss: 0.0005
Epoch 80/100 | Loss: 0.0004
Epoch 100/100 | Loss: 0.0003

--- Testing Model ---
Input: 'What is'
Output: 'What is python? python is a popular programming language used for data science. data science involves statistics, programming, and machine learning. machine'
