# A Tiny Version of a Mini LLM Built with PyTorch

In [43]:
# Use %pip (not !pip) so the packages are installed into the environment of the
# running kernel. Versions are pinned to the ones this notebook was executed with.
%pip install -q torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0

Collecting torch
  Using cached torch-2.8.0-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting torchvision
  Using cached torchvision-0.23.0-cp313-cp313-win_amd64.whl.metadata (6.1 kB)
Collecting torchaudio
  Using cached torchaudio-2.8.0-cp313-cp313-win_amd64.whl.metadata (7.2 kB)
Collecting filelock (from torch)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting setuptools (from torch)
  Using cached setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Using cached torch-2.8.0-cp313-cp313-win_amd64.whl (241.3 MB)
Using cached torchvision-0.23.0-cp313-


[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [45]:
import torch
import torch.nn as nn   #Neural network modules and layers
import torch.optim as optim  #Optimization algorithms
import torch.nn.functional as F #Functional interface for activation functions, loss functions, etc.
from torch.utils.data import DataLoader, Dataset  #Data loading and batching
import torchvision.transforms as transforms  #Image transformations and augmentations
import torchvision.datasets as datasets  #Standard datasets like MNIST, CIFAR-10, etc
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


In [58]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# 1. Synthetic training set (torch): Gaussian-noise inputs paired with random
#    class ids in 0..9. The labels are random, so the model can only memorize them.
num_samples, num_features, num_classes = 100, 784, 10
x_train = torch.randn(num_samples, num_features)
y_train = torch.randint(low=0, high=num_classes, size=(num_samples,))

# 2. Model definition (nn)
class Net(nn.Module):
    """Two-layer perceptron: 784 inputs -> 128 hidden units (ReLU) -> 10 logits."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=784, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return raw (un-normalized) class logits for ``x``."""
        hidden = F.relu(self.fc1(x))  # F supplies the activation function
        return self.fc2(hidden)

model = Net()

# 3. Loss and optimizer (nn + optim)
criterion = nn.CrossEntropyLoss()  # functional alternative: F.cross_entropy
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. Full-batch training (torch + optim): each epoch is a single gradient step
#    computed over all of x_train at once.
num_epochs = 20
for epoch in range(num_epochs):
    optimizer.zero_grad()               # clear gradients from the previous step
    outputs = model(x_train)
    loss = criterion(outputs, y_train)  # cross-entropy between logits and labels
    loss.backward()                     # autograd populates the gradients
    optimizer.step()                    # Adam applies the parameter update

    print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

Epoch 0, Loss: 2.3428
Epoch 1, Loss: 2.0677
Epoch 2, Loss: 1.8238
Epoch 3, Loss: 1.6055
Epoch 4, Loss: 1.4086
Epoch 5, Loss: 1.2322
Epoch 6, Loss: 1.0709
Epoch 7, Loss: 0.9263
Epoch 8, Loss: 0.7960
Epoch 9, Loss: 0.6784
Epoch 10, Loss: 0.5735
Epoch 11, Loss: 0.4813
Epoch 12, Loss: 0.4011
Epoch 13, Loss: 0.3320
Epoch 14, Loss: 0.2733
Epoch 15, Loss: 0.2240
Epoch 16, Loss: 0.1829
Epoch 17, Loss: 0.1492
Epoch 18, Loss: 0.1216
Epoch 19, Loss: 0.0993


# Here we start building the tiny LLM

## Step 1. Import Libraries

In [60]:
# import libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


## Step 2. Prepare a Small Text Dataset

In [61]:
# Toy corpus: a single sentence whose nine distinct words form the whole vocabulary.
text = "Artificial intelligence is the future of technology and innovation"


## Step 3. Tokenize the Text

In [62]:
# Whitespace tokenization: str.split() leaves case and punctuation untouched,
# so "Artificial" and "artificial" would be two different tokens.
words = text.split()
words

['Artificial',
 'intelligence',
 'is',
 'the',
 'future',
 'of',
 'technology',
 'and',
 'innovation']

In [67]:
# Unique words, sorted so that every word gets the same index on every run.
vocab = sorted(set(words))
vocab

['Artificial',
 'and',
 'future',
 'innovation',
 'intelligence',
 'is',
 'of',
 'technology',
 'the']

In [81]:
# Number of distinct tokens; passed to TinyLLM to size its embedding table and output layer.
vocab_size=len(vocab)

In [69]:
# Token -> integer id lookup; ids follow the order of `vocab`.
word_to_ix = dict(zip(vocab, range(len(vocab))))
print(word_to_ix)

{'Artificial': 0, 'and': 1, 'future': 2, 'innovation': 3, 'intelligence': 4, 'is': 5, 'of': 6, 'technology': 7, 'the': 8}


In [71]:
# Reverse lookup (integer id -> token), built by swapping the keys and values of word_to_ix.
ix_to_word = dict(zip(word_to_ix.values(), word_to_ix.keys()))
print(ix_to_word)

{0: 'Artificial', 1: 'and', 2: 'future', 3: 'innovation', 4: 'intelligence', 5: 'is', 6: 'of', 7: 'technology', 8: 'the'}


In [72]:
# Labelled view of the word -> index mapping.
print("Vocabulary:", word_to_ix)

Vocabulary: {'Artificial': 0, 'and': 1, 'future': 2, 'innovation': 3, 'intelligence': 4, 'is': 5, 'of': 6, 'technology': 7, 'the': 8}


## Step 4. Create Training Data (Next-Word Prediction)

In [79]:
def make_data(words, context_size=3):
    """Build (context, target) pairs for next-word prediction.

    A window of ``context_size`` consecutive words slides over ``words``; each
    window becomes a context and the word immediately after it is the target.

    Args:
        words: Sequence of tokens.
        context_size: Number of words in each context window.

    Returns:
        A list of ``(context, target)`` tuples, where ``context`` is a slice of
        ``words`` and ``target`` is the word that follows it. The list is empty
        when ``words`` has no more than ``context_size`` items.
    """
    return [
        (words[start:start + context_size], words[start + context_size])
        for start in range(len(words) - context_size)
    ]

# Build the training pairs with the default 3-word context and preview the first three.
data = make_data(words)
print(data[:3])

    

[(['Artificial', 'intelligence', 'is'], 'the'), (['intelligence', 'is', 'the'], 'future'), (['is', 'the', 'future'], 'of')]


## Step 5. Define a Tiny Language Model (a feed-forward stand-in for a GPT-style core)

In [80]:
class TinyLLM(nn.Module):
    """Fixed-window feed-forward next-word predictor.

    Each context word is embedded, the embeddings are concatenated, and a
    two-layer MLP maps the result to one logit per vocabulary entry.

    Args:
        vocab_size: Number of distinct tokens (embedding rows and output logits).
        embed_dim: Size of each word embedding.
        hidden_dim: Width of the hidden layer.
        context_size: Number of context words per example. Defaults to 3.
    """

    def __init__(self, vocab_size, embed_dim=16, hidden_dim=32, context_size=3):
        super().__init__()
        self.context_size = context_size
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # fc1 consumes the concatenation of all context embeddings.
        self.fc1 = nn.Linear(embed_dim * context_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Compute next-word logits.

        Args:
            inputs: Long tensor of token ids, either ``(context_size,)`` for a
                single context or ``(batch, context_size)`` for a batch.

        Returns:
            Float tensor of shape ``(batch, vocab_size)``; a single context
            yields ``(1, vocab_size)``.
        """
        if inputs.dim() == 1:
            # Promote a lone context to a batch of one so both call styles
            # share the same code path.
            inputs = inputs.unsqueeze(0)
        # (batch, context, embed) -> (batch, context * embed). Keeping the batch
        # axis intact (instead of view(1, -1)) is what makes batching work; a
        # wrong context length still fails loudly inside fc1.
        embeds = self.embedding(inputs).flatten(start_dim=1)
        out = F.relu(self.fc1(embeds))
        return self.fc2(out)


## Step 6. Train the Model

In [82]:
# Seed the global RNG so weight initialization, and therefore the loss curve,
# is the same on every Restart & Run All.
torch.manual_seed(0)

model = TinyLLM(vocab_size)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# The (context, target) index tensors never change between epochs, so build
# them once up front instead of re-creating them on every pass.
examples = [
    (
        torch.tensor([word_to_ix[w] for w in context], dtype=torch.long),
        torch.tensor([word_to_ix[target]], dtype=torch.long),
    )
    for context, target in data
]

for epoch in range(300):
    total_loss = 0  # sum of per-example losses for this epoch
    for context_idxs, target_idx in examples:
        # Forward + Backward: one optimizer step per training example
        optimizer.zero_grad()
        logits = model(context_idxs)
        loss = loss_fn(logits, target_idx)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")


Epoch 0, Loss: 14.1975
Epoch 50, Loss: 0.0022
Epoch 100, Loss: 0.0007
Epoch 150, Loss: 0.0004
Epoch 200, Loss: 0.0002
Epoch 250, Loss: 0.0001


## Step 7. Generate Text

In [83]:
def predict_next(context_words):
    """Return the most likely next word for ``context_words``.

    Each word is looked up in the global ``word_to_ix``, the global ``model``
    is run without gradient tracking, and the highest-scoring index is decoded
    through ``ix_to_word``. A word missing from ``word_to_ix`` raises
    ``KeyError``.
    """
    token_ids = [word_to_ix[word] for word in context_words]
    with torch.no_grad():  # inference only, no autograd graph needed
        scores = model(torch.tensor(token_ids, dtype=torch.long))
    best = torch.argmax(scores, dim=1).item()
    return ix_to_word[best]

# Query with one of the training contexts; words must match the vocabulary's casing exactly.
context = ["intelligence", "is", "the"]
next_word = predict_next(context)
print(f"Input: {' '.join(context)} → Predicted next word: {next_word}")


Input: intelligence is the → Predicted next word: future
