In [6]:
!pip install numpy
!pip install --upgrade datasets



In [10]:
import numpy as np
from datasets import load_dataset
import re
from collections import Counter
import math
dataset=load_dataset("imdb")
train_data=dataset["train"]
test_data=dataset["test"]
def clean_text(text):
  text=text.lower()
  text=re.sub(r'<[^>]+>','',text)
  text=re.sub(r'[^\w\s]','',text)
  text=re.sub(r'\d+','',text)
  return text.strip()
train_texts = [clean_text(x['text']) for x in train_data]
train_labels = [x['label'] for x in train_data]
test_texts = [clean_text(x['text']) for x in test_data]
test_labels = [x['label'] for x in test_data]
class Tokenizer:
    def __init__(self, max_words=20000):
        self.vocab = {"[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3}
        self.max_words = max_words
    def fit(self, texts):
        word_counts = Counter()
        for text in texts:
            word_counts.update(text.split())
        for word, _ in word_counts.most_common(self.max_words - 4):
            self.vocab[word] = len(self.vocab)
    def encode(self, text):
        return [self.vocab.get(word, 1) for word in text.split()]
tokenizer = Tokenizer()
tokenizer.fit(train_texts)
vocab_size = len(tokenizer.vocab)
print(f"Vocabulary size: {vocab_size}")
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=-1, keepdims=True)

class TransformerModel:
    def __init__(self, vocab_size, embed_size=128):
        self.embeddings = np.random.randn(vocab_size, embed_size) * 0.01
        self.Wq = np.random.randn(embed_size, embed_size) * 0.01
        self.Wk = np.random.randn(embed_size, embed_size) * 0.01
        self.Wv = np.random.randn(embed_size, embed_size) * 0.01
        self.fc = np.random.randn(embed_size, 2) * 0.01

    def attention(self, x):
        Q = np.dot(x, self.Wq)
        K = np.dot(x, self.Wk)
        V = np.dot(x, self.Wv)

        scores = np.dot(Q, K.T) / np.sqrt(x.shape[-1])
        attention = softmax(scores)
        return np.dot(attention, V)

    def forward(self, token_ids):
        x = self.embeddings[token_ids]
        x = x.mean(axis=0)
        return softmax(np.dot(x, self.fc))
    def train(self, texts, labels, epochs=3, lr=0.01):
        for epoch in range(epochs):
            correct = 0
            for i, (text, label) in enumerate(zip(texts, labels)):
                tokens = tokenizer.encode(text)
                probs = self.forward(tokens)
                pred = np.argmax(probs)
                if pred != label:
                    grad = probs - np.array([1-label, label])
                    self.fc -= lr * np.outer(self.embeddings[tokens].mean(0), grad)

                correct += (pred == label)

                if i % 1000 == 0:
                    print(f"Batch {i}, Accuracy: {correct/(i+1):.2%}")

            print(f"Epoch {epoch+1}, Final Accuracy: {correct/len(texts):.2%}")


model = TransformerModel(vocab_size)
model.train(train_texts[:2000], train_labels[:2000])
def evaluate(model, texts, labels):
    correct = 0
    for text, label in zip(texts, labels):
        tokens = tokenizer.encode(text)
        probs = model.forward(tokens)
        correct += (np.argmax(probs) == label)
    print(f"Test Accuracy: {correct/len(texts):.2%}")

evaluate(model, test_texts[:500], test_labels[:500])

def predict(text):
    tokens = tokenizer.encode(clean_text(text))
    probs = model.forward(tokens)
    sentiment = "Positive" if np.argmax(probs) == 1 else "Negative"
    confidence = np.max(probs)
    print(f"Text: {text[:50]}...")
    print(f"Sentiment: {sentiment} ({confidence:.2%} confidence)")

predict("This movie was absolutely wonderful!")
predict("I hated every minute of this terrible film.")



Vocabulary size: 20000
Batch 0, Accuracy: 100.00%
Batch 1000, Accuracy: 96.70%
Epoch 1, Final Accuracy: 97.10%
Batch 0, Accuracy: 100.00%
Batch 1000, Accuracy: 98.70%
Epoch 2, Final Accuracy: 98.65%
Batch 0, Accuracy: 100.00%
Batch 1000, Accuracy: 98.90%
Epoch 3, Final Accuracy: 98.85%
Test Accuracy: 98.80%
Text: This movie was absolutely wonderful!...
Sentiment: Positive (50.01% confidence)
Text: I hated every minute of this terrible film....
Sentiment: Negative (50.01% confidence)
