# Spoticore *Stage 2*

Evolution from the shallow bigram nueral network architecture to a complete Multiple Layer Perceptron that handles character level context lengths greater than 2.

Based on [Bengio et al., 2003](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)

## Contents

1. **Setup & Data Processing** - Load constants, data readers, and vocabulary builders
2. **Model Architecture** - MLP implementation with embedding, hidden, and output layers
3. **Training** - Forward/backward passes and optimization
4. **Evaluation** - Development set validation
5. **Text Generation** - Sample from the trained model
6. **Experiments** - Learning rate tuning and architecture exploration

## Model Overview

- **Input**: Fixed-length character sequences (context windows)
- **Architecture**: Embedding → Hidden Layer (tanh) → Output Layer (softmax)
- **Task**: Predict the next character given context
- **Training**: Cross-entropy loss with mini-batch SGD


## 1. Constants


In [29]:
from typing import Final

# Random Seed.
SAMPLE_SEED: Final[int] = 534150593

# Data Processing.
LYRICS_COLUMN: Final[str] = "text"
DEFAULT_CSV_PATH: Final[str] = "spotify_lyrics.csv"

# Model Architecture - MLP.
BLOCK_SIZE: int = 3  # Context window size (number of characters).

# Training Hyperparameters.
LEARNING_RATE: float = 0.1
REGULARIZATION_FACTOR: float = 0.001
EMBEDDING_DIM: int = 10  # Dimension of character embeddings.
HIDDEN_LAYER_SIZE: int = 100  # Number of neurons in the hidden layer.
BATCH_SIZE: int = 32  # Number of inputs per training iteration

## 2. Data Reader


In [30]:
import csv
import string


def read_all_lyrics(csv_path: str = DEFAULT_CSV_PATH) -> list[str]:
    """
    Process the Spotify lyrics CSV file and extract all lyrics text.

    Args:
        csv_path: Path to the CSV file. Defaults to "spotify_lyrics.csv".

    Returns:
        List of lyrics text strings, one per song.
    """
    lyrics_list = []

    with open(csv_path, "r", encoding="utf-8") as csvfile:
        reader = csv.DictReader(csvfile)

        for row in reader:
            # Extract the lyrics text from the 'text' column.
            lyrics = row.get(LYRICS_COLUMN, "").strip().lower()
            if lyrics:  # Only add non-empty lyrics.
                lyrics_list.append(lyrics)

    return lyrics_list


def read_all_unique_words(csv_path: str = DEFAULT_CSV_PATH) -> list[str]:
    """
    Process the Spotify lyrics CSV file and extract all unique individual words.

    Punctuation is removed from words before adding them to the vocabulary.

    Args:
        csv_path: Path to the CSV file. Defaults to "spotify_lyrics.csv".

    Returns:
        Sorted list of unique words from all lyrics, with punctuation removed.
    """
    words_list = []
    # Create a translation table that replaces punctuation with spaces.
    translator = str.maketrans({punct: " " for punct in string.punctuation})

    with open(csv_path, "r", encoding="utf-8") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            lyrics = row.get(LYRICS_COLUMN, "").strip().lower()
            if lyrics:
                # Replace punctuation with spaces before splitting.
                sanitized_lyrics = lyrics.translate(translator)
                # Filter out empty strings created by consecutive spaces.
                words_list.extend([word for word in sanitized_lyrics.split() if word])

    return sorted(set(words_list))

## 3. MLP Model


In [31]:
import torch
import torch.nn.functional as F
from dataclasses import dataclass


@dataclass(frozen=True)
class Parameters:
    C: torch.Tensor
    W1: torch.Tensor
    W2: torch.Tensor
    b1: torch.Tensor
    b2: torch.Tensor

    @property
    def parameters(self) -> tuple[torch.Tensor, ...]:
        return (self.C, self.W1, self.W2, self.b1, self.b2)

In [32]:
def build_vocab_from_words():
    """
    Build vocabulary mappings from unique words.

    Reads all unique words from the CSV file and creates string-to-index (stoi)
    and index-to-string (itos) mappings for all characters found in the words.
    The period character (.) is assigned index 0.

    Returns:
        tuple: A tuple containing:
            - stoi (dict[str, int]): Mapping from character to index.
            - itos (dict[int, str]): Mapping from index to character.
            - words (list[str]): List of all unique words.
    """
    words = read_all_unique_words()
    chars = sorted(list(set(".".join(words))))
    stoi = {char: i for i, char in enumerate(chars)}
    stoi["."] = 0
    itos = {i: char for i, char in enumerate(stoi)}

    return stoi, itos, words

In [None]:
def build_dataset(
    words: list[str],
    stoi: dict[str, int],
    block_size: int = BLOCK_SIZE,
) -> tuple[torch.Tensor, torch.Tensor, int]:
    """
    Build training dataset from words using character-level context windows.

    Creates input-output pairs where each input is a context window of characters
    and the output is the next character to predict. Uses a sliding window approach
    with a fixed context length (block_size).

    Args:
        words: List of words to build the dataset from.
        stoi: Mapping from character to index for converting characters to tensor inputs.
        block_size: Number of characters in the context window. Defaults to 3.

    Returns:
        tuple: A tuple containing:
            - X (torch.Tensor): Input tensor of shape (n_samples, block_size) with context windows.
            - Y (torch.Tensor): Output tensor of shape (n_samples,) with target character indices.
            - block_size (int): The context window size used.
    """
    # Input and corresponding output (labels) matrices.
    X, Y = [], []

    for word in words:
        prev_char_idxs = [0] * block_size
        for char in word + ".":
            next_char_idx = stoi[char]
            X.append(prev_char_idxs)
            Y.append(next_char_idx)
            prev_char_idxs = prev_char_idxs[1:] + [next_char_idx]

    return torch.tensor(X), torch.tensor(Y), block_size

### Dataset Splitting

Split the data into three disjoint sets:
- **Training (80%)**: Used to update model weights via gradient descent
- **Development/Validation (10%)**: Used to tune hyperparameters and monitor overfitting
- **Testing (10%)**: Held out for final performance evaluation (used only once)

In [34]:
import random


@dataclass(frozen=True)
class SplitData:
    vocab_size: int = 0

    X_train: torch.Tensor = torch.tensor(0)
    X_dev: torch.Tensor = torch.tensor(0)
    X_test: torch.Tensor = torch.tensor(0)

    Y_train: torch.Tensor = torch.tensor(0)
    Y_dev: torch.Tensor = torch.tensor(0)
    Y_test: torch.Tensor = torch.tensor(0)

    ctxlen_train: int = 0
    ctxlen_dev: int = 0
    ctxlen_test: int = 0

    @property
    def train_data(self) -> tuple[torch.Tensor, torch.Tensor, int]:
        return (self.X_train, self.Y_train, self.ctxlen_train)

    @property
    def dev_data(self) -> tuple[torch.Tensor, torch.Tensor, int]:
        return (self.X_dev, self.Y_dev, self.ctxlen_dev)

    @property
    def test_data(self) -> tuple[torch.Tensor, torch.Tensor, int]:
        return (self.X_test, self.Y_test, self.ctxlen_test)


def split_training_dataset():
    random.seed(42)

    stoi, itos, words = build_vocab_from_words()

    random.shuffle(words)

    n1 = int(0.8 * len(words))
    n2 = int(0.9 * len(words))

    X_train, Y_train, train_ctxlen = build_dataset(words[:n1], stoi)
    X_dev, Y_dev, dev_ctxlen = build_dataset(words[n1:n2], stoi)
    X_test, Y_test, test_ctxlen = build_dataset(words[n2:], stoi)

    return SplitData(
        len(stoi),
        X_train,
        X_dev,
        X_test,
        Y_train,
        Y_dev,
        Y_test,
        train_ctxlen,
        dev_ctxlen,
        test_ctxlen,
    )

In [None]:
def initialize_parameters(
    vocab_size: int, block_size: int, generator: torch.Generator | None = None
) -> Parameters:
    """
    Initialize network parameters with random values.

    Note: Uses standard normal initialization. For better performance with larger
    hidden layers, consider using Kaiming/He initialization.

    Args:
        vocab_size: Size of the vocabulary (number of unique characters).
        block_size: Number of characters in the context window.
        generator: Optional random number generator for reproducible initialization.

    Returns:
        Parameters: Container with initialized embedding, weight, and bias tensors.
    """
    if generator is None:
        generator = torch.Generator().manual_seed(SAMPLE_SEED)

    emb_dims = EMBEDDING_DIM
    hidden_layer_size = HIDDEN_LAYER_SIZE

    # Embedding layer weights.
    C = torch.randn((vocab_size, emb_dims), generator=generator, requires_grad=True)

    # Hidden layer weights and bias.
    W1 = torch.randn(
        (block_size * emb_dims, hidden_layer_size),
        generator=generator,
        requires_grad=True,
    )
    b1 = torch.randn(hidden_layer_size, generator=generator, requires_grad=True)

    # Output layer weights and bias.
    W2 = torch.randn(
        (hidden_layer_size, vocab_size), generator=generator, requires_grad=True
    )
    b2 = torch.randn(vocab_size, generator=generator, requires_grad=True)

    return Parameters(C, W1, W2, b1, b2)

In [36]:
def forward_pass(
    X: torch.Tensor,
    Y: torch.Tensor,
    parameters: Parameters,
    block_size: int,
) -> tuple[Parameters, torch.Tensor]:
    """
    Perform forward pass through the MLP network for character-level language modeling.

    The network consists of:
    1. Embedding layer: Converts character indices to dense embeddings.
    2. Hidden layer: Fully connected layer with tanh activation.
    3. Output layer: Produces logits for next character prediction.

    Args:
        X: Input tensor of shape (n_samples, block_size) containing character indices.
        Y: Target tensor of shape (n_samples,) containing the true next character indices.
        parameters: Container with all network parameters (embedding weights, W1, W2, b1, b2).
        block_size: Number of characters in the context window.

    Returns:
        Tuple containing:
            - Parameters: Container with the parameter tensors (same reference as input).
            - loss: The cross entropy loss tensor.
    """
    emb_dims = EMBEDDING_DIM

    emb = parameters.C[X]  # [num_samples, block_size, emb_dims]
    num_samples = emb.shape[0]

    # Flatten emb tensor to 2d for matrix multiplication with weight matrix.
    # One input sample contains 3 characters and each character is embedded as a vector of size 10.
    # Each sample becomes a single 30 element vector containing all the context information.
    emb = emb.view(num_samples, block_size * emb_dims)

    # Each row of W1 corresponds to one of the 30 input features (after flattening) of an input sample.
    # Each hidden neuron receives a weighted sum of all 30 features.
    h = torch.tanh(emb @ parameters.W1 + parameters.b1)  # [num_samples, hidden_size]

    # Output layer contains a node for each character that comes next; i.e. vocab_size neurons.
    logits = h @ parameters.W2 + parameters.b2  # [num_samples, vocab_size]
    loss = F.cross_entropy(logits, Y)

    return parameters, loss

In [37]:
def backward_pass(
    parameters: list[torch.Tensor],
    loss: torch.Tensor,
    lr: torch.Tensor | float = LEARNING_RATE,
):
    """
    Perform backward pass and update parameters using gradient descent.

    Args:
        parameters: List of parameters to update (must have requires_grad=True).
        loss: The computed loss tensor.
    Returns:
        None.

    Note:
    The parameters are updated in place.
    """

    # Just in case.
    for p in parameters:
        p.requires_grad = True

    # Compute gradients via backpropagation.
    for p in parameters:
        p.grad = None

    loss.backward()

    # Update parameters using gradient descent.
    for p in parameters:
        if p.grad is not None:
            p.data -= lr * p.grad
        else:
            # This shouldn't happen.
            print(
                f"Warning: Parameter with shape {p.shape} has no gradient (requires_grad={p.requires_grad})"
            )

In [38]:
LEARNING_RATE = 0.2  # from experiment 1 below


def train(
    num_iterations: int = 1000,
    print_interval: int = 10,
    generator: torch.Generator | None = None,
) -> tuple[Parameters, SplitData]:
    """
    Train the MLP network for character-level language modeling.

    Performs the complete training process: data preparation, parameter initialization,
    and training loop with forward and backward passes with input batches.

    Args:
        num_iterations: Number of training iterations (default: 1000).
        print_interval: Print loss every N iterations (default: 10).
        generator: Optional random number generator. If None, creates one with SAMPLE_SEED.

    Returns:
        params: The updated parameters after training.
        data:  The dataset split used for training/dev/test.
    """
    # Initialize random number generator with fixed seed for reproducibility.
    if generator is None:
        generator = torch.Generator().manual_seed(SAMPLE_SEED)

    # Build split training dataset from all words.
    data = split_training_dataset()

    # extract training input and outputs
    X, Y, block_size = data.train_data

    # Initialize network parameters based on the training dataset
    params = initialize_parameters(data.vocab_size, block_size, generator=generator)
    print("Parameters initialized")

    n_params = sum(p.nelement() for p in params.parameters)
    print("Total paramters:", n_params)

    # Training loop.
    for i in range(num_iterations):
        # mini batch construct
        ix = torch.randint(0, X.shape[0], (BATCH_SIZE,), generator=generator)

        # extract the batches
        X_batch, Y_batch = X[ix], Y[ix]

        # Forward pass: compute predictions and loss with the mini batch of inputs
        params, loss = forward_pass(X_batch, Y_batch, params, block_size)

        # Print loss at specified intervals.
        if i % print_interval == 0:
            print(f"Iteration {i}: loss = {loss.item():.4f}")

        # Backward pass: compute gradients and update parameters.
        backward_pass(list(params.parameters), loss)

    print(f"Training complete. Final loss: {loss.item():.4f}")

    return params, data

## 4. Train the model

In [39]:
g = torch.Generator().manual_seed(SAMPLE_SEED)

params, data = train(200_000, generator=g)

Parameters initialized
Total paramters: 7207
Iteration 0: loss = 19.9033
Iteration 10: loss = 17.7575
Iteration 20: loss = 12.9249
Iteration 30: loss = 14.2880
Iteration 40: loss = 10.6687
Iteration 50: loss = 13.3272
Iteration 60: loss = 7.6084
Iteration 70: loss = 11.6514
Iteration 80: loss = 8.0104
Iteration 90: loss = 7.3388
Iteration 100: loss = 8.0937
Iteration 110: loss = 7.4386
Iteration 120: loss = 8.1203
Iteration 130: loss = 9.0665
Iteration 140: loss = 10.2083
Iteration 150: loss = 6.6066
Iteration 160: loss = 8.0120
Iteration 170: loss = 7.4533
Iteration 180: loss = 7.1760
Iteration 190: loss = 5.5004
Iteration 200: loss = 7.4069
Iteration 210: loss = 6.3406
Iteration 220: loss = 6.9937
Iteration 230: loss = 4.6287
Iteration 240: loss = 7.3365
Iteration 250: loss = 3.7385
Iteration 260: loss = 5.6149
Iteration 270: loss = 5.7309
Iteration 280: loss = 5.7022
Iteration 290: loss = 5.0667
Iteration 300: loss = 6.1035
Iteration 310: loss = 6.3093
Iteration 320: loss = 7.3275
I

### Validation (Dev) Loss

After each training run or at checkpoint intervals, we measure the loss ***without updating the weights*** on **unseen** data inputs to see how well the model generalizes to unseen data.

We use this loss to fine-tune the ``hyperparameters`` of the model: learning rate, number and size of hidden layers, regularization rate, embedding dimension size etc.

We can use dev loss to decide when to stop training based on the following observations:

1. A significant gap between training loss and dev loss indicates **overfitting**, meaning the model has memorized the training data but does not generalize well to new data. 

2. If both losses are high, the model is **underfitting** and needs more capacity or training.

3. If both losses are close and small, the model is **generalizing well** and we have found a good balance between fitting the training data and generalizing to unseen data. At this point, we can stop training.


## 5. Compare dev loss with trained loss


In [44]:
def compute_dev_loss(
    split_data: SplitData,
    trained_params: Parameters,
) -> float:
    """Compute the development loss without touching gradients."""
    X_dev, Y_dev, block_size = split_data.dev_data

    with torch.no_grad():
        _, loss = forward_pass(X_dev, Y_dev, trained_params, block_size)

    print(f"Dev loss: {loss.item()}")
    return loss.item()

In [45]:
dev_loss = compute_dev_loss(data, params)

Dev loss: 2.432872772216797


## 6. Text Generation (Sampling)

Generate new text by sampling from the trained model one character at a time.

### Process:
1. Start with a context of padding characters (index 0)
2. Feed context through the network to get probability distribution over next characters
3. Sample a character from this distribution
4. Update context by appending the sampled character and removing the oldest
5. Repeat until we sample the end-of-word token (.)

The quality of generated text reflects how well the model learned character-level patterns.

In [40]:
def sample_from_model(
    trained_params: Parameters, count: int, block_size: int, generator: torch.Generator
):
    C, W1, b1, W2, b2 = (
        trained_params.C,
        trained_params.W1,
        trained_params.b1,
        trained_params.W2,
        trained_params.b2,
    )

    _, itos, _ = build_vocab_from_words()
    for _ in range(count):
        out = []
        context = [0] * block_size
        while True:
            emb = C[torch.tensor([context])]  # [1, block_size, emb_dims]
            h = torch.tanh(emb.view(1, -1) @ W1 + b1)
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=1)
            next_idx = torch.multinomial(probs, 1, True, generator=generator).item()
            context = context[1:] + [next_idx]
            out.append(next_idx)
            if next_idx == 0:
                break

        print("".join(itos[i] for i in out))

In [46]:
g = torch.Generator().manual_seed(SAMPLE_SEED + 10)

examples = 50

sample_from_model(params, examples, BLOCK_SIZE, g)

aded.
gened.
nan.
uls.
bekders.
squaring.
exion.
reny.
dillong.
iruking.
ohhhhhhhhhhn.
ile.
fayh.
halls.
vandradows.
ferego.
bankers.
feliskin.
soker.
etrikayendy.
fuctions.
bermikakiercation.
regerde.
heblime.
ooeymecreomenteet.
onamake.
leys.
jand.
focevoulawlo.
shits.
doyeced.
gaide.
hattizin.
swailetapling.
locked.
woochedlowed.
somed.
skin.
eccoppund.
excing.
coll.
usumm.
dicangund.
perterinahahahardia.
tehotopic.
blans.
crights.
bried.
fick.
scalonosedidan.


## Experiment 1: Finding the Optimal Learning Rate

This experiment uses a **learning rate range test** to identify the best initial learning rate for training.

### Method:
- Start with a small learning rate (10⁻³ or 10⁻²) and gradually increase it to 1.0
- Train for a fixed number of iterations, updating the learning rate at each step
- Plot loss vs learning rate to find the region where loss decreases fastest

### Goal:
Find the learning rate that:
1. Decreases loss quickly (steep negative slope)
2. Remains stable (doesn't cause divergence)

In [None]:
iterations = 30_000

# Build vocabulary and dataset.
stoi, itos, words = build_vocab_from_words()
vocab_size = len(stoi)
print(f"Vocabulary size: {vocab_size}")

# Build training dataset from all words.
X, Y, block_size = build_dataset(words, stoi)
print(f"Dataset shape: X={X.shape}, Y={Y.shape}")

### Loss vs learning rate


In [None]:
generator = torch.Generator().manual_seed(SAMPLE_SEED)

# Initialize network parameters.
params = initialize_parameters(vocab_size, block_size, generator=generator)

n_params = sum(p.nelement() for p in params.parameters)
print("Total paramters:", n_params)

In [None]:
# Create learning rate exponents from -2 to 0 with 1000 points
lre = torch.linspace(-2, 0, iterations)
# Convert exponents to actual learning rates (10^lre)
lrs = 10**lre

# Initialize lists to track learning rates and losses during training
lr_x = []
loss_y = []

batch_generator = torch.Generator().manual_seed(SAMPLE_SEED)

# Training loop.
for i in range(iterations):
    # mini batch construct
    ix = torch.randint(0, X.shape[0], (BATCH_SIZE,), generator=batch_generator)

    # extract the batches
    X_batch, Y_batch = X[ix], Y[ix]

    # Forward pass: compute predictions and loss with the mini batch of inputs
    params, loss = forward_pass(X_batch, Y_batch, params, block_size)

    lr = lrs[i]

    # Print loss at specified intervals.
    if i % 100 == 0:
        print(f"Iteration {i}: loss = {loss.item():.4f}, lr = {lr:.6f}")

    # Backward pass: compute gradients and update parameters.
    backward_pass(list(params.parameters), loss, lr)

    # track stats
    lr_x.append(lr.item())
    loss_y.append(loss.item())


print(f"Training complete. Final loss: {loss.item()}")

In [None]:
import matplotlib.pyplot as plt

plt.plot(lr_x, loss_y)
plt.xlabel("Learning Rate")
plt.ylabel("Loss")
plt.title("Loss vs Learning Rate")

### Loss vs learning rate exponent


In [None]:
# Re-initialize weights so Experiment 1b starts from the same point.
generator = torch.Generator().manual_seed(SAMPLE_SEED)

# Initialize network parameters.
params = initialize_parameters(vocab_size, block_size, generator=generator)

n_params = sum(p.nelement() for p in params.parameters)
print("Total paramters:", n_params)

In [None]:
# Create learning rate exponents from -3 to 0
lre = torch.linspace(-3, 0, iterations)
# Convert exponents to actual learning rates (10^exponent)
lrs = 10**lre


# Initialize lists to track learning rates and losses for plotting
lr_x = []
loss_y = []

batch_generator = torch.Generator().manual_seed(SAMPLE_SEED)

# Training loop.
for i in range(iterations):
    # mini batch construct
    ix = torch.randint(0, X.shape[0], (BATCH_SIZE,), generator=batch_generator)

    # extract the batches
    X_batch, Y_batch = X[ix], Y[ix]

    # Forward pass: compute predictions and loss with the mini batch of inputs
    params, loss = forward_pass(X_batch, Y_batch, params, block_size)

    lr = lrs[i]

    # Print loss at specified intervals.
    if i % 100 == 0:
        print(f"Iteration {i}: loss = {loss.item():.4f}, lr = {lr:.6f}")

    # Backward pass: compute gradients and update parameters.
    backward_pass(list(params.parameters), loss, lr)

    # track stats
    lr_x.append(lre[i])
    loss_y.append(loss.item())


print(f"Training complete. Final loss: {loss.item()}")

In [None]:
plt.plot(lr_x, loss_y)
plt.xlabel("Learning Rate Exponent")
plt.ylabel("Loss")
plt.title("Loss vs Learning Rate Exponent")

## Experiment 2: Impact of Hidden Layer Size on Training

This experiment investigates how the number of neurons in the hidden layer affects model performance.

### Method:
- Train multiple models with different `HIDDEN_LAYER_SIZE` values (e.g., 100, 200, 300, 500)
- Use the optimal learning rate from Experiment 1
- Compare training loss curves to see convergence speed and final loss


In [None]:
# Experiment configuration.
HIDDEN_LAYER_SIZE = 500  # Test different values: 100, 200, 300, 500.
LEARNING_RATE = 0.2  # Optimal LR from Experiment 1.

# Build vocabulary and dataset.
stoi, itos, words = build_vocab_from_words()
vocab_size = len(stoi)
print(f"Vocabulary size: {vocab_size}")

# Build training dataset from all words.
X, Y, block_size = build_dataset(words, stoi)
print(f"Dataset shape: X={X.shape}, Y={Y.shape}")

In [None]:
generator = torch.Generator().manual_seed(SAMPLE_SEED)

# Initialize network parameters.
params = initialize_parameters(vocab_size, block_size, generator=generator)

n_params = sum(p.nelement() for p in params.parameters)
print("Total paramters:", n_params)

In [None]:
step_x = []
loss_y = []

batch_generator = torch.Generator().manual_seed(SAMPLE_SEED)

# Training loop.
for i in range(30_000):
    # construct mini batches of size BATCH_SIZE
    ix = torch.randint(0, X.shape[0], (BATCH_SIZE,), generator=batch_generator)

    # extract the batches
    X_batch, Y_batch = X[ix], Y[ix]

    # Forward pass: compute predictions and loss with the mini batch of inputs
    params, loss = forward_pass(X_batch, Y_batch, params, block_size)

    # Print loss at specified intervals.
    if i % 1000 == 0:
        print(f"Iteration {i:5d}: loss = {loss.item():.4f}")

    # Backward pass: compute gradients and update parameters.
    backward_pass(list(params.parameters), loss, LEARNING_RATE)

    # track stats
    step_x.append(i)
    loss_y.append(loss.item())


print(f"Training complete. Final loss: {loss.item()}")

In [None]:
plt.plot(step_x, loss_y)
plt.xlabel("Iteration Number")
plt.ylabel("Loss")
plt.title("Loss vs Iterations")