In [3]:
import torch
import torch.nn.functional as F
import string
# Define the LSTM class
class SimpleLSTM:
    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # Weights for LSTM gates
        self.Wi = torch.randn(hidden_size, input_size, requires_grad=True)  # Input gate
        self.Ui = torch.randn(hidden_size, hidden_size, requires_grad=True)
        self.bi = torch.zeros(hidden_size, requires_grad=True)

        self.Wf = torch.randn(hidden_size, input_size, requires_grad=True)  # Forget gate
        self.Uf = torch.randn(hidden_size, hidden_size, requires_grad=True)
        self.bf = torch.zeros(hidden_size, requires_grad=True)

        self.Wo = torch.randn(hidden_size, input_size, requires_grad=True)  # Output gate
        self.Uo = torch.randn(hidden_size, hidden_size, requires_grad=True)
        self.bo = torch.zeros(hidden_size, requires_grad=True)

        self.Wc = torch.randn(hidden_size, input_size, requires_grad=True)  # Cell state candidate
        self.Uc = torch.randn(hidden_size, hidden_size, requires_grad=True)
        self.bc = torch.zeros(hidden_size, requires_grad=True)

        # Output layer weights
        self.Why = torch.randn(output_size, hidden_size, requires_grad=True)
        self.by = torch.zeros(output_size, requires_grad=True)

    def forward(self, x):
        batch_size = x.size(1)
        seq_len = x.size(0)
        h = torch.zeros(self.hidden_size, batch_size)  # Hidden state
        c = torch.zeros(self.hidden_size, batch_size)  # Cell state

        for t in range(seq_len):
            input_t = x[t]
            
            i = torch.sigmoid(torch.matmul(self.Wi, input_t) + torch.matmul(self.Ui, h) + self.bi)
            f = torch.sigmoid(torch.matmul(self.Wf, input_t) + torch.matmul(self.Uf, h) + self.bf)
            o = torch.sigmoid(torch.matmul(self.Wo, input_t) + torch.matmul(self.Uo, h) + self.bo)
            c_hat = torch.tanh(torch.matmul(self.Wc, input_t) + torch.matmul(self.Uc, h) + self.bc)
            
            c = f * c + i * c_hat
            h = o * torch.tanh(c)
        
        output = torch.matmul(self.Why, h) + self.by
        return output

    def zero_grad(self):
        for param in self.parameters():
            if param.grad is not None:
                param.grad.zero_()

    def parameters(self):
        return [self.Wi, self.Ui, self.bi, self.Wf, self.Uf, self.bf,
                self.Wo, self.Uo, self.bo, self.Wc, self.Uc, self.bc,
                self.Why, self.by]

    def predict(self, x):
        output = self.forward(x)
        return torch.argmax(output, dim=0).item()

# Encode strings as one-hot vectors
def encode_string(s, input_size):
    char_to_idx = {char: idx for idx, char in enumerate(string.ascii_lowercase[:input_size])}
    tensor = torch.zeros(input_size)
    for char in s.lower():
        if char in char_to_idx:
            idx = char_to_idx[char]
            if idx < input_size:
                tensor[idx] = 1.0
    return tensor

# Encode a list of words
def encode_words(words, input_size, sequence_length):
    encoded_words = [encode_string(word, input_size) for word in words]
    if len(encoded_words) < sequence_length:
        encoded_words.extend([torch.zeros(input_size)] * (sequence_length - len(encoded_words)))
    elif len(encoded_words) > sequence_length:
        encoded_words = encoded_words[:sequence_length]
    return torch.stack(encoded_words)

# Define parameters
input_size = 10  # Number of unique characters
hidden_size = 128
output_size = 2  # Number of classes
sequence_length = 5  # Define sequence length

# Example sentences
sentences = [
    ["my", "name", "is", "rithin", "a"],
    ["i", "am", "a", "developer", "in"],
    ["this", "is", "a", "sample", "sentence"]
]

# Encode sentences
encoded_sentences = [encode_words(sentence, input_size, sequence_length) for sentence in sentences]

# Define targets for each sentence
targets = torch.tensor([0, 1, 1], dtype=torch.long)

# Initialize the model
model = SimpleLSTM(input_size, hidden_size, output_size)

# Training loop
learning_rate = 0.01
num_epochs = 3
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}')
    
    total_loss = 0
    for i in range(len(encoded_sentences)):  # Iterate over each sentence
        input_seq = encoded_sentences[i].unsqueeze(1)  # Add batch dimension
        target = targets[i]
        
        # Zero the gradients
        model.zero_grad()
        
        # Forward pass
        output = model.forward(input_seq)
        
        # Compute loss (negative log likelihood loss)
        log_probs = F.log_softmax(output, dim=0)
        loss = -log_probs[target]
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        
        # Update parameters using gradient descent
        with torch.no_grad():
            for param in model.parameters():
                param -= learning_rate * param.grad
            model.zero_grad()  # Manually zero the gradients after updating weights
    
    # Print average loss for the epoch
    print(f'Average Loss: {total_loss / len(encoded_sentences)}')
    print('-' * 50)

# Predicting a new sentence
new_sentence = ["hello", "my", "name", "is", "rithin"]
encoded_new_sentence = encode_words(new_sentence, input_size, sequence_length)
prediction = model.predict(encoded_new_sentence.unsqueeze(1))  # Add batch dimension
print(f'Prediction for the new sentence: {prediction}')


Epoch 1


RuntimeError: mat1 and mat2 shapes cannot be multiplied (128x10 and 1x10)

In [4]:
import torch
import torch.nn.functional as F

# Define the input size, hidden size, and output size
input_size = 10
hidden_size = 128
output_size = 2
sequence_length = 5 

In [5]:
# Initialize weights and biases
Wi = torch.randn(hidden_size, input_size, requires_grad=True)  # Input gate
Ui = torch.randn(hidden_size, hidden_size, requires_grad=True)
bi = torch.zeros(hidden_size, requires_grad=True)

Wf = torch.randn(hidden_size, input_size, requires_grad=True)  # Forget gate
Uf = torch.randn(hidden_size, hidden_size, requires_grad=True)
bf = torch.zeros(hidden_size, requires_grad=True)

Wo = torch.randn(hidden_size, input_size, requires_grad=True)  # Output gate
Uo = torch.randn(hidden_size, hidden_size, requires_grad=True)
bo = torch.zeros(hidden_size, requires_grad=True)

Wc = torch.randn(hidden_size, input_size, requires_grad=True)  # Cell state candidate
Uc = torch.randn(hidden_size, hidden_size, requires_grad=True)
bc = torch.zeros(hidden_size, requires_grad=True)

In [7]:
Wi

tensor([[-0.2881, -0.9919, -0.3783,  ..., -0.7103,  0.8392, -0.6060],
        [ 0.1656, -0.7472, -0.8907,  ..., -0.4476,  0.9162, -0.5639],
        [ 0.2608, -0.7969, -1.4423,  ..., -1.6837,  0.0633,  1.1479],
        ...,
        [ 0.5247, -0.1232,  0.3933,  ...,  0.7917, -1.4827,  1.9516],
        [ 0.4388, -1.0877, -1.3212,  ...,  1.3887, -0.7424,  0.3653],
        [-0.7982, -0.2358,  0.6281,  ...,  0.6494,  0.7852, -0.9230]],
       requires_grad=True)

In [8]:
Why = torch.randn(output_size, hidden_size, requires_grad=True)
by = torch.zeros(output_size, requires_grad=True)

In [9]:
inputs = torch.randn(sequence_length, input_size)
targets = torch.tensor([1], dtype=torch.long)

# Initialize hidden state and cell state
h = torch.zeros(hidden_size, 1)
c = torch.zeros(hidden_size, 1)

In [11]:
h,c

(tensor([[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [

In [10]:
inputs

tensor([[ 0.5292,  0.0667,  1.3729, -0.7066, -0.1482, -1.2327,  0.9183,  0.7727,
          1.1748, -0.8674],
        [ 0.3653, -1.0230,  0.5819, -0.0475,  1.1893,  1.7140, -1.4997,  1.8099,
         -1.2016,  1.1302],
        [ 0.1289, -0.5043,  1.1278,  0.5382, -0.7277,  0.3761, -1.3886, -1.7325,
          0.2822, -1.6098],
        [-0.4679, -0.9395, -1.5223,  0.7699, -0.7546,  1.0621,  0.1705, -1.2919,
         -0.0026,  1.4833],
        [ 0.3669,  0.1718,  1.2400, -1.2826,  1.7289, -2.1316,  0.4384, -1.1724,
          2.0249,  0.4389]])

In [None]:
# Forward pass
for i in range(inputs.size(0)):
    input = inputs[i]
    hidden = torch.tanh(torch.matmul(Wxh, input) + torch.matmul(Whh, hidden) + bh)
output = torch.matmul(Why, hidden) + by

In [89]:
import torch

# Define the input size, hidden size, and output size
input_size = 1
hidden_size = 1
output_size = 2

# Initialize weights and biases
Wi = torch.randn(hidden_size, input_size, requires_grad=True)
Ui = torch.randn(hidden_size, hidden_size, requires_grad=True)
bi = torch.zeros(hidden_size, requires_grad=True)

Wf = torch.randn(hidden_size, input_size, requires_grad=True)
Uf = torch.randn(hidden_size, hidden_size, requires_grad=True)
bf = torch.zeros(hidden_size, requires_grad=True)

Wo = torch.randn(hidden_size, input_size, requires_grad=True)
Uo = torch.randn(hidden_size, hidden_size, requires_grad=True)
bo = torch.zeros(hidden_size, requires_grad=True)

Wc = torch.randn(hidden_size, input_size, requires_grad=True)
Uc = torch.randn(hidden_size, hidden_size, requires_grad=True)
bc = torch.zeros(hidden_size, requires_grad=True)

Why = torch.randn(output_size, hidden_size, requires_grad=True)
by = torch.zeros(output_size, requires_grad=True)

# Sample input (batch size = 1, sequence length = 3, input size = 10)
inputs = torch.randn(3, input_size)
print("inputs:", inputs.shape)
targets = torch.tensor([1], dtype=torch.long)
print("targets:", targets.shape)

# Initialize hidden state and cell state
h = torch.zeros(hidden_size, requires_grad=True)
c = torch.zeros(hidden_size, requires_grad=True)
print("hini:", h.shape)
print("cini:", c.shape)

# Forward pass
for i in range(inputs.size(0)):
    input_t = inputs[i].unsqueeze(1)  # Add batch dimension for the first input in the sequence
    print("*********")
    print("t",input_t.shape)
    print("wf",Wf.shape) 
    print("uf",Uf.shape)
    print("bf",bf.shape)
    
    print("++++++++++")
    f = torch.sigmoid(torch.matmul(Wf, input_t) + torch.matmul(Uf, h) + bf)  # Forget gate
    
    i = torch.sigmoid(torch.matmul(Wi, input_t) + torch.matmul(Ui, h) + bi)  # Input gate
    o = torch.sigmoid(torch.matmul(Wo, input_t) + torch.matmul(Uo, h) + bo)  # Output gate
    c_hat = torch.tanh(torch.matmul(Wc, input_t) + torch.matmul(Uc, h) + bc)  # Candidate cell state
    
    c = f * c + i * c_hat  # Update cell state
    h = o * torch.tanh(c)  # Update hidden state
    print("h:", h.shape)
    print("c:", c.shape)
    print("f",f.shape)
    print("i", i.shape)
    print("o", o.shape)
    print("c_hat", c_hat.shape)

# Compute output
output = torch.matmul(Why, h) + by

# Compute loss (negative log likelihood loss)
log_probs = torch.nn.functional.log_softmax(output, dim=0)
loss = -log_probs[targets].sum()  # Summing the loss to ensure it is a scalar

# Backward pass
loss.backward()

# Print loss and gradients
print('Loss:', loss.item())
print('Gradients:')
print('Wi.grad:', Wi.grad)
print('Ui.grad:', Ui.grad)
print('bi.grad:', bi.grad)
print('Wf.grad:', Wf.grad)
print('Uf.grad:', Uf.grad)
print('bf.grad:', bf.grad)
print('Wo.grad:', Wo.grad)
print('Uo.grad:', Uo.grad)
print('bo.grad:', bo.grad)
print('Wc.grad:', Wc.grad)
print('Uc.grad:', Uc.grad)
print('bc.grad:', bc.grad)
print('Why.grad:', Why.grad)
print('by.grad:', by.grad)

# Update parameters using gradient descent
learning_rate = 0.01
with torch.no_grad():
    Wi -= learning_rate * Wi.grad
    Ui -= learning_rate * Ui.grad
    bi -= learning_rate * bi.grad
    Wf -= learning_rate * Wf.grad
    Uf -= learning_rate * Uf.grad
    bf -= learning_rate * bf.grad
    Wo -= learning_rate * Wo.grad
    Uo -= learning_rate * Uo.grad
    bo -= learning_rate * bo.grad
    Wc -= learning_rate * Wc.grad
    Uc -= learning_rate * Uc.grad
    bc -= learning_rate * bc.grad
    Why -= learning_rate * Why.grad
    by -= learning_rate * by.grad

    # Manually zero the gradients after updating weights
    Wi.grad.zero_()
    Ui.grad.zero_()
    bi.grad.zero_()
    Wf.grad.zero_()
    Uf.grad.zero_()
    bf.grad.zero_()
    Wo.grad.zero_()
    Uo.grad.zero_()
    bo.grad.zero_()
    Wc.grad.zero_()
    Uc.grad.zero_()
    bc.grad.zero_()
    Why.grad.zero_()
    by.grad.zero_()

print("predicted :", output)
softmax = torch.nn.functional.softmax(output, dim=0)
print('Output (softmax probabilities):', softmax)
print("target :", targets)


inputs: torch.Size([3, 1])
targets: torch.Size([1])
hini: torch.Size([1])
cini: torch.Size([1])
*********
t torch.Size([1, 1])
wf torch.Size([1, 1])
uf torch.Size([1, 1])
bf torch.Size([1])
++++++++++
h: torch.Size([1, 1])
c: torch.Size([1, 1])
f torch.Size([1, 1])
i torch.Size([1, 1])
o torch.Size([1, 1])
c_hat torch.Size([1, 1])
*********
t torch.Size([1, 1])
wf torch.Size([1, 1])
uf torch.Size([1, 1])
bf torch.Size([1])
++++++++++
h: torch.Size([1, 1])
c: torch.Size([1, 1])
f torch.Size([1, 1])
i torch.Size([1, 1])
o torch.Size([1, 1])
c_hat torch.Size([1, 1])
*********
t torch.Size([1, 1])
wf torch.Size([1, 1])
uf torch.Size([1, 1])
bf torch.Size([1])
++++++++++
h: torch.Size([1, 1])
c: torch.Size([1, 1])
f torch.Size([1, 1])
i torch.Size([1, 1])
o torch.Size([1, 1])
c_hat torch.Size([1, 1])
Loss: 1.2640379667282104
Gradients:
Wi.grad: tensor([[0.0372]])
Ui.grad: tensor([[0.0027]])
bi.grad: tensor([0.0054])
Wf.grad: tensor([[-0.0057]])
Uf.grad: tensor([[0.0387]])
bf.grad: tensor([-

In [114]:
import torch

# Define the input size, hidden size, and output size
input_size = 3
hidden_size = 2
output_size = 3

# Initialize weights and biases
Wi = torch.randn(hidden_size, input_size, requires_grad=True)
Ui = torch.randn(hidden_size, hidden_size, requires_grad=True)
bi = torch.zeros(hidden_size, requires_grad=True)  # Shape: [128]

Wf = torch.randn(hidden_size, input_size, requires_grad=True)
Uf = torch.randn(hidden_size, hidden_size, requires_grad=True)
bf = torch.zeros(hidden_size, requires_grad=True)  # Shape: [128]

Wo = torch.randn(hidden_size, input_size, requires_grad=True)
Uo = torch.randn(hidden_size, hidden_size, requires_grad=True)
bo = torch.zeros(hidden_size, requires_grad=True)  # Shape: [128]

Wc = torch.randn(hidden_size, input_size, requires_grad=True)
Uc = torch.randn(hidden_size, hidden_size, requires_grad=True)
bc = torch.zeros(hidden_size, requires_grad=True)  # Shape: [128]

Why = torch.randn(output_size, hidden_size, requires_grad=True)
by = torch.zeros(output_size, requires_grad=True)

# Sample input (sequence length = 3, batch size = 1, input size = 10)
inputs = torch.randn(3, input_size)
print("inputs:",inputs, inputs.shape)

# Define targets for classification
targets = torch.tensor([1], dtype=torch.long)
print("targets:",targets, targets.shape)

# Initialize hidden state and cell state
h = torch.zeros(hidden_size, 1, requires_grad=True)  # Shape: [128, 1]
c = torch.zeros(hidden_size, 1, requires_grad=True)  # Shape: [128, 1]
print("h:",h, h.shape)
print("c:",c, c.shape)
print("inputsshape:", inputs.shape)

# Forward pass
for i in range(inputs.size(0)):
    input_t = inputs[i].unsqueeze(1)  # Add batch dimension for the first input in the sequence
    print("*********")
    print("input_t",{i},input_t, input_t.shape)
    a1=torch.matmul(Wf, input_t)
    print("a1:",{i},a1, a1.shape)
    print("Wf",{i},Wf,Wf.shape)
    a2=torch.matmul(Uf, h)
    print("a2:", {i}, a2, a2.shape)
    print("Uf", {i}, Uf, Uf.shape)
    print("h", {i}, h, h.shape)
    print("bfor", {i}, bf.unsqueeze)
    print("bf", {i}, bf.unsqueeze(1), bf.unsqueeze(1).shape)
    a3=(torch.matmul(Wf, input_t) + torch.matmul(Uf, h) + bf.unsqueeze(1))  # Forget gate
    print("a3:", {i}, a3, a3.shape)
    # Compute gate values
    f = torch.sigmoid(a3)  # Forget gate
    print("f", {i}, f, f.shape)
    a4=torch.matmul(Wi, input_t)
    a5=torch.matmul(Ui, h)
    a6=(torch.matmul(Wi, input_t) + torch.matmul(Ui, h) + bi.unsqueeze(1))
  
    i_gate = torch.sigmoid(a6)  # Input gate
    print("Wi",{i},Wi,Wi.shape)
    print("Ui", {i}, Ui, Ui.shape)
    print("bi", {i}, bi, bi.shape)
    print("bi", {i}, bi.unsqueeze(1), bi.unsqueeze(1).shape)
    print("h", {i}, h, h.shape)
    print("i_gate", {i}, i_gate, i_gate.shape)
    o = torch.sigmoid(torch.matmul(Wo, input_t) + torch.matmul(Uo, h) + bo.unsqueeze(1))  # Output gate
    c_hat = torch.tanh(torch.matmul(Wc, input_t) + torch.matmul(Uc, h) + bc.unsqueeze(1))  # Candidate cell state
    
    # Update cell and hidden states
    c = f * c + i_gate * c_hat
    h = o * torch.tanh(c)  # Update hidden state
    print("h:", h.shape)
    print("c:", c.shape)
    print("f", f.shape)
    print("i", i_gate.shape)
    print("o", o.shape)
    print("c_hat", c_hat.shape)

# Compute output
output = torch.matmul(Why, h.squeeze(1)) + by  # Reshape h to [hidden_size]
output = output.squeeze(0)  # Ensure output is [output_size]

# Compute loss (negative log likelihood loss)
log_probs = torch.nn.functional.log_softmax(output, dim=0)
loss = -log_probs[targets].sum()  # Summing the loss to ensure it is a scalar

# Backward pass
loss.backward()

# Print loss and gradients
print('Loss:', loss.item())
print('Gradients:')
print('Wi.grad:', Wi.grad)
print('Ui.grad:', Ui.grad)
print('bi.grad:', bi.grad)
print('Wf.grad:', Wf.grad)
print('Uf.grad:', Uf.grad)
print('bf.grad:', bf.grad)
print('Wo.grad:', Wo.grad)
print('Uo.grad:', Uo.grad)
print('bo.grad:', bo.grad)
print('Wc.grad:', Wc.grad)
print('Uc.grad:', Uc.grad)
print('bc.grad:', bc.grad)
print('Why.grad:', Why.grad)
print('by.grad:', by.grad)

# Update parameters using gradient descent
learning_rate = 0.01
with torch.no_grad():
    Wi -= learning_rate * Wi.grad
    Ui -= learning_rate * Ui.grad
    bi -= learning_rate * bi.grad
    Wf -= learning_rate * Wf.grad
    Uf -= learning_rate * Uf.grad
    bf -= learning_rate * bf.grad
    Wo -= learning_rate * Wo.grad
    Uo -= learning_rate * Uo.grad
    bo -= learning_rate * bo.grad
    Wc -= learning_rate * Wc.grad
    Uc -= learning_rate * Uc.grad
    bc -= learning_rate * bc.grad
    Why -= learning_rate * Why.grad
    by -= learning_rate * by.grad

    # Manually zero the gradients after updating weights
    Wi.grad.zero_()
    Ui.grad.zero_()
    bi.grad.zero_()
    Wf.grad.zero_()
    Uf.grad.zero_()
    bf.grad.zero_()
    Wo.grad.zero_()
    Uo.grad.zero_()
    bo.grad.zero_()
    Wc.grad.zero_()
    Uc.grad.zero_()
    bc.grad.zero_()
    Why.grad.zero_()
    by.grad.zero_()

print("predicted :", output)
softmax = torch.nn.functional.softmax(output, dim=0)
print('Output (softmax probabilities):', softmax)
print("target :", targets)


inputs: tensor([[-0.1721, -0.3410, -0.2648],
        [-0.5807,  0.2318,  0.3885],
        [ 0.1579,  1.0525,  0.6670]]) torch.Size([3, 3])
targets: tensor([1]) torch.Size([1])
h: tensor([[0.],
        [0.]], requires_grad=True) torch.Size([2, 1])
c: tensor([[0.],
        [0.]], requires_grad=True) torch.Size([2, 1])
inputsshape: torch.Size([3, 3])
*********
input_t {0} tensor([[-0.1721],
        [-0.3410],
        [-0.2648]]) torch.Size([3, 1])
a1: {0} tensor([[0.1080],
        [0.9703]], grad_fn=<MmBackward0>) torch.Size([2, 1])
Wf {0} tensor([[ 1.3342,  0.3895, -1.7767],
        [-1.2919, -2.1004, -0.1195]], requires_grad=True) torch.Size([2, 3])
a2: {0} tensor([[0.],
        [0.]], grad_fn=<MmBackward0>) torch.Size([2, 1])
Uf {0} tensor([[ 1.0640,  1.9649],
        [-0.5378,  0.4844]], requires_grad=True) torch.Size([2, 2])
h {0} tensor([[0.],
        [0.]], requires_grad=True) torch.Size([2, 1])
bfor {0} <built-in method unsqueeze of Tensor object at 0x000001D9D0C38720>
bf {0} tens

In [108]:
import torch
import string

# Function to encode a string into a tensor
def encode_string(s, input_size):
    char_to_idx = {char: idx for idx, char in enumerate(string.ascii_lowercase[:input_size])}
    tensor = torch.zeros(input_size)
    for char in s.lower():
        if char in char_to_idx:
            idx = char_to_idx[char]
            if idx < input_size:
                tensor[idx] = 1.0
    return tensor

def encode_words(words, input_size, sequence_length):
    encoded_words = [encode_string(word, input_size) for word in words]
    if len(encoded_words) < sequence_length:
        encoded_words.extend([torch.zeros(input_size)] * (sequence_length - len(encoded_words)))
    elif len(encoded_words) > sequence_length:
        encoded_words = encoded_words[:sequence_length]
    return torch.stack(encoded_words)

class SimpleRNN:
    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.Wi = torch.randn(self.hidden_size, self.input_size, requires_grad=True)
        self.Ui = torch.randn(self.hidden_size, self.hidden_size, requires_grad=True)
        self.bi = torch.zeros(self.hidden_size, requires_grad=True)

        self.Wf = torch.randn(self.hidden_size, self.input_size, requires_grad=True)
        self.Uf = torch.randn(self.hidden_size, self.hidden_size, requires_grad=True)
        self.bf = torch.zeros(self.hidden_size, requires_grad=True)

        self.Wo = torch.randn(self.hidden_size, self.input_size, requires_grad=True)
        self.Uo = torch.randn(self.hidden_size, self.hidden_size, requires_grad=True)
        self.bo = torch.zeros(self.hidden_size, requires_grad=True)

        self.Wc = torch.randn(self.hidden_size, self.input_size, requires_grad=True)
        self.Uc = torch.randn(self.hidden_size, self.hidden_size, requires_grad=True)
        self.bc = torch.zeros(self.hidden_size, requires_grad=True)

        self.Why = torch.randn(self.output_size, self.hidden_size, requires_grad=True)
        self.by = torch.zeros(self.output_size, requires_grad=True)

    def forward(self, x):
        h = torch.zeros(self.hidden_size, 1, requires_grad=True)
        c = torch.zeros(self.hidden_size, 1, requires_grad=True)
        for t in range(x.size(0)):
            input_t = x[t].unsqueeze(1)
            f = torch.sigmoid(torch.matmul(self.Wf, input_t) + torch.matmul(self.Uf, h) + self.bf.unsqueeze(1))
            i = torch.sigmoid(torch.matmul(self.Wi, input_t) + torch.matmul(self.Ui, h) + self.bi.unsqueeze(1))
            o = torch.sigmoid(torch.matmul(self.Wo, input_t) + torch.matmul(self.Uo, h) + self.bo.unsqueeze(1))
            c_hat = torch.tanh(torch.matmul(self.Wc, input_t) + torch.matmul(self.Uc, h) + self.bc.unsqueeze(1))
            c = f * c + i * c_hat
            h = o * torch.tanh(c)
        output = torch.matmul(self.Why, h.squeeze(1)) + self.by
        return output

    def predict(self, x):
        output = self.forward(x)
        return torch.argmax(output, dim=0).item()

    def zero_grad(self):
        if self.Wi.grad is not None:
            self.Wi.grad.zero_()
        if self.Ui.grad is not None:
            self.Ui.grad.zero_()
        if self.bi.grad is not None:
            self.bi.grad.zero_()
        if self.Wf.grad is not None:
            self.Wf.grad.zero_()
        if self.Uf.grad is not None:
            self.Uf.grad.zero_()
        if self.bf.grad is not None:
            self.bf.grad.zero_()
        if self.Wo.grad is not None:
            self.Wo.grad.zero_()
        if self.Uo.grad is not None:
            self.Uo.grad.zero_()
        if self.bo.grad is not None:
            self.bo.grad.zero_()
        if self.Wc.grad is not None:
            self.Wc.grad.zero_()
        if self.Uc.grad is not None:
            self.Uc.grad.zero_()
        if self.bc.grad is not None:
            self.bc.grad.zero_()
        if self.Why.grad is not None:
            self.Why.grad.zero_()
        if self.by.grad is not None:
            self.by.grad.zero_()

    def parameters(self):
        return [self.Wi, self.Ui, self.bi, self.Wf, self.Uf, self.bf, self.Wo, self.Uo, self.bo, self.Wc, self.Uc, self.bc, self.Why, self.by]

# Define parameters
input_size = 4
hidden_size = 5
output_size = 3  # Number of classes
sequence_length = 5  # Define the sequence length

# Example sentences
sentences = [
    ["my", "name", "is", "rithin", "a"],
    ["i", "am", "a", "developer", "in"],
    ["this", "is", "a", "sample", "sentence"]
]

# Initialize the model
model = SimpleRNN(input_size, hidden_size, output_size)

# Encode sentences
encoded_sentences = [encode_words(sentence, input_size, sequence_length) for sentence in sentences]
print("encodedsentence:",encoded_sentences)

# Define targets for each sentence
targets = torch.tensor([0, 1,2], dtype=torch.long)

# Training loop
learning_rate = 0.01
num_epochs = 3
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}')
    
    total_loss = 0
    for i in range(len(encoded_sentences)):  # Iterate over each sentence
        input_seq = encoded_sentences[i]
        print("input_seq:",{i}, input_seq)
        target = targets[i]
        print("target:", {i}, target)
        
        # Zero the gradients
        model.zero_grad()
        
        # Forward pass
        output = model.forward(input_seq)
        
        # Compute loss (negative log likelihood loss)
        log_probs = torch.nn.functional.log_softmax(output, dim=0)
        loss = -log_probs[target]
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        
        # Update parameters using gradient descent
        with torch.no_grad():
            for param in model.parameters():
                param -= learning_rate * param.grad
            model.zero_grad()  # Manually zero the gradients after updating weights
    
    # Print average loss for the epoch
    print(f'Average Loss: {total_loss / len(encoded_sentences)}')
    print('-' * 50)

# Predicting new sentence
new_sentence = ["my", "name", "is", "rithin", "a"]
encoded_new_sentence = encode_words(new_sentence, input_size, sequence_length)
prediction = model.predict(encoded_new_sentence)
print(f'Prediction for the new sentence: {prediction}')


Epoch 1
Average Loss: 1.2108193238576253
--------------------------------------------------
Epoch 2
Average Loss: 1.2085041205088298
--------------------------------------------------
Epoch 3
Average Loss: 1.2062173287073772
--------------------------------------------------
Prediction for the new sentence: 2


In [105]:
import torch
import string

# Function to encode a string into a tensor
def encode_string(s, input_size):
    char_to_idx = {char: idx for idx, char in enumerate(string.ascii_lowercase[:input_size])}
    tensor = torch.zeros(input_size)
    for char in s.lower():
        if char in char_to_idx:
            idx = char_to_idx[char]
            if idx < input_size:
                tensor[idx] = 1.0
    return tensor

def encode_words(words, input_size, sequence_length):
    encoded_words = [encode_string(word, input_size) for word in words]
    if len(encoded_words) < sequence_length:
        encoded_words.extend([torch.zeros(input_size)] * (sequence_length - len(encoded_words)))
    elif len(encoded_words) > sequence_length:
        encoded_words = encoded_words[:sequence_length]
    return torch.stack(encoded_words)

class SimpleRNN:
    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.Wi = torch.randn(self.hidden_size, self.input_size, requires_grad=True)
        self.Ui = torch.randn(self.hidden_size, self.hidden_size, requires_grad=True)
        self.bi = torch.zeros(self.hidden_size, requires_grad=True)

        self.Wf = torch.randn(self.hidden_size, self.input_size, requires_grad=True)
        self.Uf = torch.randn(self.hidden_size, self.hidden_size, requires_grad=True)
        self.bf = torch.zeros(self.hidden_size, requires_grad=True)

        self.Wo = torch.randn(self.hidden_size, self.input_size, requires_grad=True)
        self.Uo = torch.randn(self.hidden_size, self.hidden_size, requires_grad=True)
        self.bo = torch.zeros(self.hidden_size, requires_grad=True)

        self.Wc = torch.randn(self.hidden_size, self.input_size, requires_grad=True)
        self.Uc = torch.randn(self.hidden_size, self.hidden_size, requires_grad=True)
        self.bc = torch.zeros(self.hidden_size, requires_grad=True)

        self.Why = torch.randn(self.output_size, self.hidden_size, requires_grad=True)
        self.by = torch.zeros(self.output_size, requires_grad=True)

    def forward(self, x):
        h = torch.zeros(self.hidden_size, 1, requires_grad=True)
        c = torch.zeros(self.hidden_size, 1, requires_grad=True)
        for t in range(x.size(0)):
            input_t = x[t].unsqueeze(1)
            f = torch.sigmoid(torch.matmul(self.Wf, input_t) + torch.matmul(self.Uf, h) + self.bf.unsqueeze(1))
            i = torch.sigmoid(torch.matmul(self.Wi, input_t) + torch.matmul(self.Ui, h) + self.bi.unsqueeze(1))
            o = torch.sigmoid(torch.matmul(self.Wo, input_t) + torch.matmul(self.Uo, h) + self.bo.unsqueeze(1))
            c_hat = torch.tanh(torch.matmul(self.Wc, input_t) + torch.matmul(self.Uc, h) + self.bc.unsqueeze(1))
            c = f * c + i * c_hat
            h = o * torch.tanh(c)
        output = torch.matmul(self.Why, h.squeeze(1)) + self.by
        return output

    def predict(self, x):
        output = self.forward(x)
        return torch.sigmoid(output).item()

    def zero_grad(self):
        if self.Wi.grad is not None:
            self.Wi.grad.zero_()
        if self.Ui.grad is not None:
            self.Ui.grad.zero_()
        if self.bi.grad is not None:
            self.bi.grad.zero_()
        if self.Wf.grad is not None:
            self.Wf.grad.zero_()
        if self.Uf.grad is not None:
            self.Uf.grad.zero_()
        if self.bf.grad is not None:
            self.bf.grad.zero_()
        if self.Wo.grad is not None:
            self.Wo.grad.zero_()
        if self.Uo.grad is not None:
            self.Uo.grad.zero_()
        if self.bo.grad is not None:
            self.bo.grad.zero_()
        if self.Wc.grad is not None:
            self.Wc.grad.zero_()
        if self.Uc.grad is not None:
            self.Uc.grad.zero_()
        if self.bc.grad is not None:
            self.bc.grad.zero_()
        if self.Why.grad is not None:
            self.Why.grad.zero_()
        if self.by.grad is not None:
            self.by.grad.zero_()

    def parameters(self):
        return [self.Wi, self.Ui, self.bi, self.Wf, self.Uf, self.bf, self.Wo, self.Uo, self.bo, self.Wc, self.Uc, self.bc, self.Why, self.by]

# Define parameters
input_size = 10
hidden_size = 128
output_size = 1  # Binary classification output
sequence_length = 5  # Define the sequence length

# Example sentences
sentences = [
    ["my", "name", "is", "rithin", "a"],
    ["i", "am", "a", "developer", "in"],
    ["this", "is", "a", "sample", "sentence"]
]

# Initialize the model
model = SimpleRNN(input_size, hidden_size, output_size)

# Encode sentences
encoded_sentences = [encode_words(sentence, input_size, sequence_length) for sentence in sentences]

# Define targets for each sentence
targets = torch.tensor([0.0, 1.0, 0.0], dtype=torch.float)

# Training loop
learning_rate = 0.01
num_epochs = 3
loss_function = torch.nn.BCEWithLogitsLoss()

for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}')
    
    total_loss = 0
    for i in range(len(encoded_sentences)):  # Iterate over each sentence
        input_seq = encoded_sentences[i]
        target = targets[i]
        
        # Zero the gradients
        model.zero_grad()
        
        # Forward pass
        output = model.forward(input_seq)
        
        # Compute loss (binary cross-entropy loss with logits)
        loss = loss_function(output, target.unsqueeze(0))
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        
        # Update parameters using gradient descent
        with torch.no_grad():
            for param in model.parameters():
                param -= learning_rate * param.grad
            model.zero_grad()  # Manually zero the gradients after updating weights
    
    # Print average loss for the epoch
    print(f'Average Loss: {total_loss / len(encoded_sentences)}')
    print('-' * 50)

# Predicting new sentence
new_sentence = ["my", "name", "is", "rithin", "a"]
encoded_new_sentence = encode_words(new_sentence, input_size, sequence_length)
prediction = model.predict(encoded_new_sentence)
#print(f'Prediction for the new sentence: {prediction:.4f}')
binary_prediction = 1 if prediction >= 0.5 else 0
print(f'Binary Prediction for the new sentence: {binary_prediction}')



Epoch 1
Average Loss: 3.6251625219980874
--------------------------------------------------
Epoch 2
Average Loss: 4.167870998382568
--------------------------------------------------
Epoch 3
Average Loss: 0.5411695639292399
--------------------------------------------------
Binary Prediction for the new sentence: 0


In [107]:
import torch

# Define the input size, hidden size, and output size
input_size = 10
hidden_size = 128
output_size = 3

# Initialize weights and biases for forward LSTM
Wi_f = torch.randn(hidden_size, input_size, requires_grad=True)
Ui_f = torch.randn(hidden_size, hidden_size, requires_grad=True)
bi_f = torch.zeros(hidden_size, requires_grad=True)

Wf_f = torch.randn(hidden_size, input_size, requires_grad=True)
Uf_f = torch.randn(hidden_size, hidden_size, requires_grad=True)
bf_f = torch.zeros(hidden_size, requires_grad=True)

Wo_f = torch.randn(hidden_size, input_size, requires_grad=True)
Uo_f = torch.randn(hidden_size, hidden_size, requires_grad=True)
bo_f = torch.zeros(hidden_size, requires_grad=True)

Wc_f = torch.randn(hidden_size, input_size, requires_grad=True)
Uc_f = torch.randn(hidden_size, hidden_size, requires_grad=True)
bc_f = torch.zeros(hidden_size, requires_grad=True)

# Initialize weights and biases for backward LSTM
Wi_b = torch.randn(hidden_size, input_size, requires_grad=True)
Ui_b = torch.randn(hidden_size, hidden_size, requires_grad=True)
bi_b = torch.zeros(hidden_size, requires_grad=True)

Wf_b = torch.randn(hidden_size, input_size, requires_grad=True)
Uf_b = torch.randn(hidden_size, hidden_size, requires_grad=True)
bf_b = torch.zeros(hidden_size, requires_grad=True)

Wo_b = torch.randn(hidden_size, input_size, requires_grad=True)
Uo_b = torch.randn(hidden_size, hidden_size, requires_grad=True)
bo_b = torch.zeros(hidden_size, requires_grad=True)

Wc_b = torch.randn(hidden_size, input_size, requires_grad=True)
Uc_b = torch.randn(hidden_size, hidden_size, requires_grad=True)
bc_b = torch.zeros(hidden_size, requires_grad=True)

# Initialize weights for the output layer
Why = torch.randn(output_size, 2 * hidden_size, requires_grad=True)
by = torch.zeros(output_size, requires_grad=True)

# Sample input (sequence length = 3, batch size = 1, input size = 10)
inputs = torch.randn(3, input_size)

# Define targets for classification
targets = torch.tensor([1], dtype=torch.long)

# Initialize hidden state and cell state for forward LSTM
h_f = torch.zeros(hidden_size, 1, requires_grad=True)
c_f = torch.zeros(hidden_size, 1, requires_grad=True)

# Initialize hidden state and cell state for backward LSTM
h_b = torch.zeros(hidden_size, 1, requires_grad=True)
c_b = torch.zeros(hidden_size, 1, requires_grad=True)

# Forward pass for forward LSTM
outputs_f = []
for i in range(inputs.size(0)):
    input_t = inputs[i].unsqueeze(1)
    print("inputt", input_t)
    f = torch.sigmoid(torch.matmul(Wf_f, input_t) + torch.matmul(Uf_f, h_f) + bf_f.unsqueeze(1))
    i_gate = torch.sigmoid(torch.matmul(Wi_f, input_t) + torch.matmul(Ui_f, h_f) + bi_f.unsqueeze(1))
    o = torch.sigmoid(torch.matmul(Wo_f, input_t) + torch.matmul(Uo_f, h_f) + bo_f.unsqueeze(1))
    c_hat = torch.tanh(torch.matmul(Wc_f, input_t) + torch.matmul(Uc_f, h_f) + bc_f.unsqueeze(1))
    c_f = f * c_f + i_gate * c_hat
    h_f = o * torch.tanh(c_f)
    outputs_f.append(h_f)

# Forward pass for backward LSTM
outputs_b = []
for i in reversed(range(inputs.size(0))):
    input_t = inputs[i].unsqueeze(1)
    print("rinputt", input_t)
    f = torch.sigmoid(torch.matmul(Wf_b, input_t) + torch.matmul(Uf_b, h_b) + bf_b.unsqueeze(1))
    i_gate = torch.sigmoid(torch.matmul(Wi_b, input_t) + torch.matmul(Ui_b, h_b) + bi_b.unsqueeze(1))
    o = torch.sigmoid(torch.matmul(Wo_b, input_t) + torch.matmul(Uo_b, h_b) + bo_b.unsqueeze(1))
    c_hat = torch.tanh(torch.matmul(Wc_b, input_t) + torch.matmul(Uc_b, h_b) + bc_b.unsqueeze(1))
    c_b = f * c_b + i_gate * c_hat
    h_b = o * torch.tanh(c_b)
    outputs_b.insert(0, h_b)

# Concatenate the outputs from forward and backward LSTMs
outputs = [torch.cat((h_f, h_b), dim=0) for h_f, h_b in zip(outputs_f, outputs_b)]
outputs = torch.stack(outputs, dim=0).squeeze(1)

# Compute output for the last time step
output = torch.matmul(Why, outputs[-1]) + by

# Compute loss (negative log likelihood loss)
log_probs = torch.nn.functional.log_softmax(output, dim=0)
loss = -log_probs[targets].sum()

# Backward pass
loss.backward()

# Print loss and gradients
print('Loss:', loss.item())
print('Gradients:')
print('Wi_f.grad:', Wi_f.grad)
print('Ui_f.grad:', Ui_f.grad)
print('bi_f.grad:', bi_f.grad)
print('Wf_f.grad:', Wf_f.grad)
print('Uf_f.grad:', Uf_f.grad)
print('bf_f.grad:', bf_f.grad)
print('Wo_f.grad:', Wo_f.grad)
print('Uo_f.grad:', Uo_f.grad)
print('bo_f.grad:', bo_f.grad)
print('Wc_f.grad:', Wc_f.grad)
print('Uc_f.grad:', Uc_f.grad)
print('bc_f.grad:', bc_f.grad)
print('Wi_b.grad:', Wi_b.grad)
print('Ui_b.grad:', Ui_b.grad)
print('bi_b.grad:', bi_b.grad)
print('Wf_b.grad:', Wf_b.grad)
print('Uf_b.grad:', Uf_b.grad)
print('bf_b.grad:', bf_b.grad)
print('Wo_b.grad:', Wo_b.grad)
print('Uo_b.grad:', Uo_b.grad)
print('bo_b.grad:', bo_b.grad)
print('Wc_b.grad:', Wc_b.grad)
print('Uc_b.grad:', Uc_b.grad)
print('bc_b.grad:', bc_b.grad)
print('Why.grad:', Why.grad)
print('by.grad:', by.grad)

# Update parameters using gradient descent
learning_rate = 0.01
with torch.no_grad():
    Wi_f -= learning_rate * Wi_f.grad
    Ui_f -= learning_rate * Ui_f.grad
    bi_f -= learning_rate * bi_f.grad
    Wf_f -= learning_rate * Wf_f.grad
    Uf_f -= learning_rate * Uf_f.grad
    bf_f -= learning_rate * bf_f.grad
    Wo_f -= learning_rate * Wo_f.grad
    Uo_f -= learning_rate * Uo_f.grad
    bo_f -= learning_rate * bo_f.grad
    Wc_f -= learning_rate * Wc_f.grad
    Uc_f -= learning_rate * Uc_f.grad
    bc_f -= learning_rate * bc_f.grad
    Wi_b -= learning_rate * Wi_b.grad
    Ui_b -= learning_rate * Ui_b.grad
    bi_b -= learning_rate * bi_b.grad
    Wf_b -= learning_rate * Wf_b.grad
    Uf_b -= learning_rate * Uf_b.grad
    bf_b -= learning_rate * bf_b.grad
    Wo_b -= learning_rate * Wo_b.grad
    Uo_b -= learning_rate * Uo_b.grad
    bo_b -= learning_rate * bo_b.grad
    Wc_b -= learning_rate * Wc_b.grad
    Uc_b -= learning_rate * Uc_b.grad
    bc_b -= learning_rate * bc_b.grad
    Why -= learning_rate * Why.grad
    by -= learning_rate * by.grad

    # Manually zero the gradients after updating weights
    Wi_f.grad.zero_()
    Ui_f.grad.zero_()
    bi_f.grad.zero_()
    Wf_f.grad.zero_()
    Uf_f.grad.zero_()
    bf_f.grad.zero_()
    Wo_f.grad.zero_()
    Uo_f.grad.zero_()
    bo_f.grad.zero_()
    Wc_f.grad.zero_()
    Uc_f.grad.zero_()
    bc_f.grad.zero_()
    Wi_b.grad.zero_()
    Ui_b.grad.zero_()
    bi_b.grad.zero_()
    Wf_b.grad.zero_()
    Uf_b.grad.zero_()
    bf_b.grad.zero_()
    Wo_b.grad.zero_()
    Uo_b.grad.zero_()
    bo_b.grad.zero_()
    Wc_b.grad.zero_()
    Uc_b.grad.zero_()
    bc_b.grad.zero_()
    Why.grad.zero_()
    by.grad.zero_()

print("predicted :", output)
softmax = torch.nn.functional.softmax(output, dim=0)
print('Output (softmax probabilities):', softmax)
print("target :", targets)


inputt tensor([[ 1.0750],
        [-0.6875],
        [ 0.6863],
        [-1.2301],
        [ 1.2975],
        [ 0.3874],
        [ 0.3056],
        [ 0.8377],
        [ 0.5743],
        [-0.5956]])
inputt tensor([[-0.0610],
        [-0.2946],
        [-1.3526],
        [ 0.5550],
        [-2.1487],
        [-0.2415],
        [ 0.3168],
        [-0.0286],
        [ 0.9466],
        [ 0.4472]])
inputt tensor([[ 1.0562],
        [ 0.1735],
        [-0.6131],
        [-0.8838],
        [-1.8751],
        [ 1.9588],
        [ 0.3802],
        [ 0.5827],
        [-0.3728],
        [ 0.1304]])
rinputt tensor([[ 1.0562],
        [ 0.1735],
        [-0.6131],
        [-0.8838],
        [-1.8751],
        [ 1.9588],
        [ 0.3802],
        [ 0.5827],
        [-0.3728],
        [ 0.1304]])
rinputt tensor([[-0.0610],
        [-0.2946],
        [-1.3526],
        [ 0.5550],
        [-2.1487],
        [-0.2415],
        [ 0.3168],
        [-0.0286],
        [ 0.9466],
        [ 0.4472]])
rinputt 