<a href="https://colab.research.google.com/github/Sreejith-nair511/Summer_course_Ai/blob/main/LOSS_FUNCTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn

# Step 1: Define the RNN cell manually, without separate bias terms.
# The bias will be included in the weight matrix.
class SimpleRNNCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(SimpleRNNCell, self).__init__() # Defined as a perceptron
        self.input_size = input_size # Input dimension
        self.hidden_size = hidden_size # hidden dimension

        # Weight matrix W_ih and W_hh are combined into one matrix.
        # The bias is included by adding an extra dimension to the input.
        # This means the weight matrix will be (input_size + hidden_size + 1) x hidden_size
        # where the +1 is for the bias term.
        # input dim = 100
        # hidden dim = 10 (110 X 10)
        self.combined_weights = nn.Parameter(torch.Tensor(input_size + hidden_size + 1, hidden_size))

        self.reset_parameters()

    def reset_parameters(self):
        # Initialize weights and include bias in the initialization scheme.
        # A common initialization is Xavier uniform or Kaiming uniform.
        # We need to manually handle the initialization for the combined weight matrix.
        # Let's use a uniform distribution based on input and hidden sizes.
        stdv = 1.0 / torch.sqrt(torch.tensor(self.hidden_size, dtype=torch.float32))
        self.combined_weights.data.uniform_(-stdv, stdv)

    def forward(self, input, hidden):
        # Concatenate input, hidden state, and a bias term (vector of ones)
        batch_size = input.size(0)
        bias_term = torch.ones(batch_size, 1, device=input.device)
        combined_input = torch.cat((input, hidden, bias_term), dim=1)

        # Perform the linear transformation: (input, hidden, bias) @ combined_weights^T
        # The activation function is typically tanh or ReLU. Let's use tanh.
        next_hidden = torch.tanh(torch.matmul(combined_input, self.combined_weights))

        return next_hidden, next_hidden # Return both for consistency with standard RNN API

# Step 2: Create a sequence model using the custom RNN cell.
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super(SimpleRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Although we are building a simple one-layer example, the structure
        # allows for multiple layers if needed in a more complex scenario.
        if num_layers != 1:
            raise NotImplementedError("This simple example only supports num_layers=1")

        self.rnn_cell = SimpleRNNCell(input_size, hidden_size)

        # A simple output layer (e.g., for classification or regression on the last hidden state)
        # For this example, let's just output the last hidden state directly.
        # If we were doing classification, we might add a linear layer here:
        # self.fc = nn.Linear(hidden_size, output_size, bias=False) # bias in FC layer is okay if needed elsewhere

    def forward(self, input_seq):
        # input_seq shape: (seq_len, batch_size, input_size)
        seq_len, batch_size, _ = input_seq.size()

        # Initialize hidden state (usually with zeros)
        hidden = torch.zeros(batch_size, self.hidden_size, device=input_seq.device)

        outputs = []
        for t in range(seq_len):
            # Process one time step
            input_t = input_seq[t, :, :]
            hidden, _ = self.rnn_cell(input_t, hidden)
            outputs.append(hidden.unsqueeze(0)) # Collect hidden states (optional)

        # Stack outputs if needed, or just return the last hidden state
        # For this simple example, let's just return the last hidden state
        last_hidden_state = hidden
        # If we had an FC layer for classification:
        # output = self.fc(last_hidden_state)
        # return output
        return last_hidden_state

# Step 3: Create a simple synthetic dataset
# Let's create sequences where the last element is determined by the sum of the first few elements.
# We will predict the last element.
def create_sequence_data(num_sequences, seq_len, input_size):
    sequences = torch.randn(num_sequences, seq_len, input_size)
    targets = torch.sum(sequences[:, :seq_len-1, :], dim=(1, 2)) # Sum of first seq_len-1 elements

    # Add some noise to the target to make it slightly more realistic
    targets += 0.1 * torch.randn(num_sequences)
    return sequences, targets.unsqueeze(1) # Target shape (num_sequences, 1)

# Step 4: Setup model, loss function, optimizer, and training loop
input_size = 5
hidden_size = 10
seq_len = 10
num_sequences = 1000
learning_rate = 0.01
epochs = 10

# Create data
all_sequences, all_targets = create_sequence_data(num_sequences, seq_len, input_size)

# Split data into training and testing sets
train_ratio = 0.8
train_size = int(num_sequences * train_ratio)
train_sequences, train_targets = all_sequences[:train_size], all_targets[:train_size]
test_sequences, test_targets = all_sequences[train_size:], all_targets[train_size:]

# Transpose sequences to (seq_len, batch_size, input_size) for RNN input
train_sequences = train_sequences.transpose(0, 1)
test_sequences = test_sequences.transpose(0, 1)

model = SimpleRNN(input_size, hidden_size)

# Loss function (Mean Squared Error for regression)
criterion = nn.MSELoss()

# Optimizer
# We need to optimize the single combined_weights parameter
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Step 5: Training loop
print("Starting training...")
for epoch in range(epochs):
    model.train() # Set model to training mode
    optimizer.zero_grad() # Clear gradients

    # Forward pass
    outputs = model(train_sequences)

    # Calculate loss. The target is a single value per sequence, corresponding
    # to the prediction from the last hidden state.
    loss = criterion(outputs, train_targets)

    # Backward pass and optimize
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

print("Training finished.")

# Step 6: Evaluate the model on the test set
print("Evaluating on test set...")
model.eval() # Set model to evaluation mode
with torch.no_grad(): # No gradient calculation needed for evaluation
    test_outputs = model(test_sequences)
    test_loss = criterion(test_outputs, test_targets)
    print(f'Test Loss: {test_loss.item():.4f}')

# Example prediction for a single sequence from the test set
single_test_sequence = test_sequences[:, 0:1, :] # Select the first sequence, keep batch dim 1
single_test_target = test_targets[0]

print(single_test_target)
print(single_test_sequence)

model.eval()
with torch.no_grad():
    single_output = model(single_test_sequence)

#print(f"Example Prediction: {single_output.item():.4f}, Actual Target: {single_test_target.item():.4f}")

# To verify that bias is included in weights, we can inspect the combined_weights
print("\nInspecting weights:")
print("Shape of combined_weights:", model.rnn_cell.combined_weights.shape)
# The last row of combined_weights corresponds to the bias weights.
print("Weights corresponding to bias:", model.rnn_cell.combined_weights[-1, :])


Starting training...


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/10], Loss: 42.2448
Training finished.
Evaluating on test set...
Test Loss: 41.9113
tensor([-2.4928])
tensor([[[-0.3900, -2.2210,  0.9790, -0.8636,  0.1752]],

        [[-0.2346,  1.3425, -1.0400,  0.4113, -0.3523]],

        [[ 2.1857, -2.4097,  0.0097,  0.5941,  0.4830]],

        [[-0.8461, -0.8216, -1.5799, -0.0788, -0.6433]],

        [[ 0.4778, -0.3263, -1.0281,  0.8318, -0.8801]],

        [[ 0.5234,  0.5376, -0.3792,  0.3737,  0.8033]],

        [[-0.0587, -0.4895,  1.7985,  1.1254, -0.3656]],

        [[-0.3812,  0.3773,  0.1778,  0.8022, -0.6116]],

        [[-0.1221, -0.3603, -0.3296,  0.0173,  0.4604]],

        [[-0.0216,  0.9195,  0.3155, -0.5358,  0.7905]]])

Inspecting weights:
Shape of combined_weights: torch.Size([16, 10])
Weights corresponding to bias: tensor([-0.0415,  0.0938,  0.0699,  0.0235,  0.0404,  0.2944,  0.1683,  0.0757,
        -0.0260, -0.0896], grad_fn=<SliceBackward0>)


  return F.mse_loss(input, target, reduction=self.reduction)


#Execution

In [2]:
# Define a simple vocabulary
vocabulary = ["hello", "world", "this", "is", "a", "test", ".", "<pad>"] # Add a padding token

# Create a word-to-index mapping
word_to_index = {word: i for i, word in enumerate(vocabulary)}
index_to_word = {i: word for word, i in word_to_index.items()}

vocab_size = len(vocabulary)

# Our RNN's input_size must now match the vocabulary size for one-hot encoding.
# Let's redefine and retrain the model with the correct input_size.
# Note: This is a simple demonstration. For real NLP, you'd use embeddings.

# Redefine model parameters based on the new input size
nlp_input_size = vocab_size
nlp_hidden_size = 10 # Keep the same hidden size

# Create a new model instance with the correct input size
nlp_model = SimpleRNN(nlp_input_size, nlp_hidden_size)

# For this demonstration, we won't retrain the model extensively on NLP data.
# We'll just demonstrate how to pass NLP data through the existing architecture.
# A real NLP task would require a suitable dataset and training.

# Function to convert a sentence to a sequence of one-hot vectors
def sentence_to_one_hot_sequence(sentence, word_to_index, vocab_size, max_len=None):
    words = sentence.lower().split()
    indices = [word_to_index.get(word, word_to_index["<pad>"]) for word in words] # Use pad for unknown words

    if max_len is not None:
        # Pad or truncate the sequence
        if len(indices) < max_len:
            indices += [word_to_index["<pad>"]] * (max_len - len(indices))
        elif len(indices) > max_len:
            indices = indices[:max_len]

    seq_len = len(indices) if max_len is None else max_len
    batch_size = 1 # Processing one sentence at a time for testing

    # Create one-hot vectors
    one_hot_sequence = torch.zeros(seq_len, batch_size, vocab_size)
    for t, index in enumerate(indices):
        one_hot_sequence[t, 0, index] = 1

    return one_hot_sequence

# Example sentence for testing
test_sentence = "this is a test ."
max_sequence_length = 6 # Based on the example sentence length + pad if needed

# Convert the sentence to a one-hot sequence
test_sequence_one_hot = sentence_to_one_hot_sequence(
    test_sentence,
    word_to_index,
    vocab_size,
    max_len=max_sequence_length
)

print("\nTesting with natural language sentence:")
print("Sentence:", test_sentence)
print("One-hot sequence shape:", test_sequence_one_hot.shape) # Should be (seq_len, batch_size, vocab_size)

# Pass the one-hot sequence through the NLP model (which is untrained on NLP data)
# Since this model was trained on random numerical data, its output for a sentence
# will not be meaningful in an NLP sense (like sentiment or next word prediction).
# This is just to show the forward pass mechanism.

nlp_model.eval() # Set model to evaluation mode
with torch.no_grad():
    nlp_output = nlp_model(test_sequence_one_hot)

print("Output (last hidden state) shape:", nlp_output.shape) # Should be (batch_size, hidden_size)
print("Output (last hidden state):", nlp_output)

# Interpretation of the output depends on the downstream task.
# If it were a classifier, we'd pass nlp_output through the final FC layer.
# Since it's just returning the hidden state, this is the context vector learned by the RNN
# at the end of processing the sentence (based on the weights learned from the previous numerical task).
# For a real NLP task, you would add a final linear layer to predict the task output
# (e.g., sentiment score, next word probabilities, etc.).



Testing with natural language sentence:
Sentence: this is a test .
One-hot sequence shape: torch.Size([6, 1, 8])
Output (last hidden state) shape: torch.Size([1, 10])
Output (last hidden state): tensor([[-0.1949, -0.5694, -0.5766,  0.2128, -0.1309,  0.3553,  0.2497, -0.1184,
          0.2587,  0.3615]])
