In [1]:
!pip uninstall -y torch torchtext
!pip install torch==2.3.0 torchtext==0.18

Found existing installation: torch 2.5.1+cu121
Uninstalling torch-2.5.1+cu121:
  Successfully uninstalled torch-2.5.1+cu121
[0mCollecting torch==2.3.0
  Downloading torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchtext==0.18
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.3.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==

In [2]:
import numpy as np

# Toy corpus: ten short phrases used to demonstrate tokenisation,
# vocabulary building and padding.
docs = [
    'go nepal',
    'india nepal',
    'hip hip hurray',
    'jeetega bhai jeetega nepal jeetega',
    'jai nepal',
    'kohli kohli',
    'sachin sachin',
    'dhoni dhoni',
    'tmkoc ji ki jai',
    'aayo gorkhali',
]

In [3]:
import torch
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence



In [4]:
def tokenize(text: str) -> list[str]:
    """Split ``text`` into tokens on runs of whitespace.

    Leading and trailing whitespace is ignored, so an empty or blank string
    yields an empty list.
    """
    tokens = text.split()
    return tokens

# One token list per document, in corpus order.
tokenized_docs = list(map(tokenize, docs))

In [5]:
tokenized_docs

[['go', 'nepal'],
 ['india', 'nepal'],
 ['hip', 'hip', 'hurray'],
 ['jeetega', 'bhai', 'jeetega', 'nepal', 'jeetega'],
 ['jai', 'nepal'],
 ['kohli', 'kohli'],
 ['sachin', 'sachin'],
 ['dhoni', 'dhoni'],
 ['tmkoc', 'ji', 'ki', 'jai'],
 ['aayo', 'gorkhali']]

In [6]:
# Build the vocabulary with one special token that stands in for
# out-of-vocabulary (OOV) words.
oov_token = "<nothing>"
vocab = build_vocab_from_iterator(
    tokenized_docs,
    specials=[oov_token],
)
# Any token missing from the vocabulary is looked up as the OOV token's index
# instead of raising an error.
vocab.set_default_index(vocab[oov_token])

In [7]:
# Token -> index mapping, the counterpart of `tokenizer.word_index`.
# Iterating the index-to-string list keeps the mapping ordered by index.
word_index = {word: vocab[word] for word in vocab.get_itos()}
print(f"Word Index : {word_index}")

Word Index : {'<nothing>': 0, 'nepal': 1, 'jeetega': 2, 'dhoni': 3, 'hip': 4, 'jai': 5, 'kohli': 6, 'sachin': 7, 'aayo': 8, 'bhai': 9, 'go': 10, 'gorkhali': 11, 'hurray': 12, 'india': 13, 'ji': 14, 'ki': 15, 'tmkoc': 16}


In [8]:
# Corpus-wide occurrence count per vocabulary entry, the counterpart of
# `tokenizer.word_counts`. The OOV token never occurs in the corpus, so it
# gets a count of 0.
word_counts = dict(
    (word, sum(tokens.count(word) for tokens in tokenized_docs))
    for word in word_index
)
word_counts

{'<nothing>': 0,
 'nepal': 4,
 'jeetega': 3,
 'dhoni': 2,
 'hip': 2,
 'jai': 2,
 'kohli': 2,
 'sachin': 2,
 'aayo': 1,
 'bhai': 1,
 'go': 1,
 'gorkhali': 1,
 'hurray': 1,
 'india': 1,
 'ji': 1,
 'ki': 1,
 'tmkoc': 1}

In [9]:
# Number of documents in the corpus, the counterpart of
# `tokenizer.document_count`. Counted from the raw `docs` list, so it is
# independent of tokenisation.
document_count = len(docs)
print("Document Count:", document_count)

Document Count: 10


In [10]:
# Replace every token by its vocabulary index, one id list per document.
sequences = [
    [vocab[token] for token in doc_tokens]
    for doc_tokens in tokenized_docs
]
sequences

[[10, 1],
 [13, 1],
 [4, 4, 12],
 [2, 9, 2, 1, 2],
 [5, 1],
 [6, 6],
 [7, 7],
 [3, 3],
 [16, 14, 15, 5],
 [8, 11]]

In [11]:
# Stack the variable-length id lists into one (num_docs, max_len) tensor.
# NOTE: the fill value is the OOV token's index, so after padding an
# unknown word and a padding slot are indistinguishable.
padded_sequences = pad_sequence(
    list(map(torch.tensor, sequences)),
    batch_first=True,
    padding_value=vocab[oov_token],
)

print(f"Padded sequences :\n {padded_sequences}")

Padded sequences :
 tensor([[10,  1,  0,  0,  0],
        [13,  1,  0,  0,  0],
        [ 4,  4, 12,  0,  0],
        [ 2,  9,  2,  1,  2],
        [ 5,  1,  0,  0,  0],
        [ 6,  6,  0,  0,  0],
        [ 7,  7,  0,  0,  0],
        [ 3,  3,  0,  0,  0],
        [16, 14, 15,  5,  0],
        [ 8, 11,  0,  0,  0]])


In [12]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchtext.datasets import IMDB



In [13]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
# Load the Keras IMDB review dataset as (features, labels) pairs for the
# train and test splits. `num_words = 10000` must stay in sync with the
# `vocab_size` passed to the model's embedding layer further down.
(X_train, y_train), (X_test,y_test) = imdb.load_data(num_words = 10000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [21]:
# Bring every review to exactly 50 token ids so each split becomes a
# rectangular (num_reviews, 50) array.
X_train, X_test = (
    pad_sequences(split, maxlen=50) for split in (X_train, X_test)
)

In [22]:
X_train.dtype

dtype('int32')

In [23]:
X_train.shape

(25000, 50)

In [24]:
# Convert the loaded arrays to PyTorch tensors: token ids as int64 and
# labels as float32 (BCELoss needs floating-point targets).
# torch.as_tensor is used instead of torch.tensor so that re-running this
# cell on values that are already tensors does not trigger torch's
# copy-construct warning; for the initial arrays it yields the same tensors.
X_train = torch.as_tensor(X_train, dtype=torch.long)
y_train = torch.as_tensor(y_train, dtype=torch.float32)
X_test = torch.as_tensor(X_test, dtype=torch.long)
y_test = torch.as_tensor(y_test, dtype=torch.float32)

In [25]:
# Mini-batch iterators over (token ids, label) pairs. Only the training
# split is shuffled; the test split keeps its original order.
batch_size = 32
train_data = DataLoader(
    TensorDataset(X_train, y_train),
    batch_size=batch_size,
    shuffle=True,
)
test_data = DataLoader(
    TensorDataset(X_test, y_test),
    batch_size=batch_size,
)

In [26]:
class SimpleRNNModel(nn.Module):
    """Sequence classifier: embedding -> RNN -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the RNN hidden state.
        output_dim: Number of output units of the final linear layer.
        maxlen: Accepted by the constructor but not used by the model.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, maxlen):
        super().__init__()
        # Index 0 is declared as the padding index of the embedding table.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map token ids of shape (batch, seq_len) to sigmoid outputs of shape (batch, output_dim)."""
        embedded = self.embedding(x)
        sequence_out, _ = self.rnn(embedded)
        # Only the RNN output at the final time step feeds the classifier.
        final_step = sequence_out[:, -1, :]
        return self.sigmoid(self.fc(final_step))

In [27]:
# Model hyperparameters and instantiation.
# vocab_size matches the num_words limit used when loading the dataset and
# maxlen matches the padded review length.
embedding_dim = 32
hidden_dim = 32
output_dim = 1

model = SimpleRNNModel(
    vocab_size=10000,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    maxlen=50,
)

In [28]:
# Binary cross-entropy on probabilities (the model already applies a
# sigmoid), optimised with Adam.
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [29]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Moves the parameters in place; as the cell's last expression it also
# displays the model summary.
model.to(device)

SimpleRNNModel(
  (embedding): Embedding(10000, 32, padding_idx=0)
  (rnn): RNN(32, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [30]:
# Number of full passes over the training data made by the training loop.
num_epochs = 5

In [31]:
# Training loop: one pass over `train_data` per epoch, reporting the
# accumulated loss and the training accuracy after each epoch.
for epoch in range(num_epochs):
    model.train()
    train_loss = 0  # sum of the per-batch losses returned by `criterion`
    correct = 0     # number of correctly classified samples this epoch
    total = 0       # number of samples seen this epoch

    for batch_X, batch_y in train_data:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        # Forward pass. Drop only the trailing output dimension: a bare
        # squeeze() would also remove the batch dimension when a batch holds
        # a single sample, leaving the output shape different from the
        # target's and making the loss computation fail.
        outputs = model(batch_X).squeeze(-1)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Calculate accuracy: outputs are probabilities, thresholded at 0.5
        predictions = (outputs >= 0.5).float()
        correct += (predictions == batch_y).sum().item()
        total += batch_y.size(0)
        train_loss += loss.item()

    accuracy = correct / total
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {train_loss:.4f}, Accuracy: {accuracy:.4f}")

Epoch 1/5, Loss: 542.3435, Accuracy: 0.4932
Epoch 2/5, Loss: 542.1752, Accuracy: 0.4993
Epoch 3/5, Loss: 542.1127, Accuracy: 0.4996
Epoch 4/5, Loss: 542.0609, Accuracy: 0.5027
Epoch 5/5, Loss: 542.1029, Accuracy: 0.4980


In [32]:
# Evaluation on `test_data`: no gradient tracking and the model in eval mode.
model.eval()
with torch.no_grad():
    test_loss = 0  # sum of the per-batch losses returned by `criterion`
    correct = 0    # number of correctly classified samples
    total = 0      # number of samples seen

    for batch_X, batch_y in test_data:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        # Drop only the trailing output dimension: a bare squeeze() would
        # also remove the batch dimension when a batch holds a single sample,
        # leaving the output shape different from the target's.
        outputs = model(batch_X).squeeze(-1)
        loss = criterion(outputs, batch_y)

        # Outputs are probabilities, thresholded at 0.5
        predictions = (outputs >= 0.5).float()
        correct += (predictions == batch_y).sum().item()
        total += batch_y.size(0)
        test_loss += loss.item()

    accuracy = correct / total
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}")

Test Loss: 542.0594, Test Accuracy: 0.5000
