## Q.1

#### 1: Create the Dataset

In [4]:
import string
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader

# All uppercase letters
alphabet = list(string.ascii_uppercase)
char2idx = {ch: idx for idx, ch in enumerate(alphabet)}
idx2char = {idx: ch for ch, idx in char2idx.items()}

# Create sequences with missing values
def create_sequence_data(seq_len=7, num_samples=1000):
    data = []
    for _ in range(num_samples):
        start = np.random.randint(0, 26 - seq_len)
        full_seq = alphabet[start:start+seq_len]
        missing_idx = np.random.randint(1, seq_len-1)  # avoid first and last
        target = full_seq[missing_idx]
        input_seq = full_seq[:]
        input_seq[missing_idx] = '_'
        data.append(("".join(input_seq), target, missing_idx))
    return data


#### 2: Preprocess the Data

In [2]:
class AlphabetDataset(Dataset):
    def __init__(self, sequence_data):
        self.data = sequence_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        seq, target, missing_idx = self.data[idx]
        x = [char2idx[ch] if ch != '_' else 0 for ch in seq]  # placeholder
        mask = [1 if ch == '_' else 0 for ch in seq]  # mask the missing one
        y = char2idx[target]
        return torch.tensor(x), torch.tensor(mask), torch.tensor(y)


#### 3: Build the RNN Model

In [3]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=16, hidden_dim=32):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, mask):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        masked_out = out[torch.arange(x.size(0)), mask.argmax(dim=1)]
        return self.fc(masked_out)


#### 4: Build the Bidirectional RNN Model

In [5]:
class BiRNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=16, hidden_dim=32):
        super(BiRNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, vocab_size)

    def forward(self, x, mask):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        masked_out = out[torch.arange(x.size(0)), mask.argmax(dim=1)]
        return self.fc(masked_out)


####  5: Train and Evaluate

In [7]:
def train(model, dataloader, epochs=5):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    for epoch in range(epochs):
        for x, mask, y in dataloader:
            logits = model(x, mask)
            loss = criterion(logits, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


# Prepare dataset
data = create_sequence_data(num_samples=1000)
dataset = AlphabetDataset(data)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Train RNN
rnn_model = RNNModel(vocab_size=len(alphabet))
train(rnn_model, loader)

# Train Bi-RNN
birnn_model = BiRNNModel(vocab_size=len(alphabet))
train(birnn_model, loader)



Epoch 1, Loss: 0.5606
Epoch 2, Loss: 0.0343
Epoch 3, Loss: 0.0110
Epoch 4, Loss: 0.0079
Epoch 5, Loss: 0.0160
Epoch 1, Loss: 0.0374
Epoch 2, Loss: 0.0060
Epoch 3, Loss: 0.0028
Epoch 4, Loss: 0.0030
Epoch 5, Loss: 0.0017


In [9]:
def predict_missing(model, seq):
    x = [char2idx[ch] if ch != '_' else 0 for ch in seq]
    mask = [1 if ch == '_' else 0 for ch in seq]
    x_tensor = torch.tensor([x])
    mask_tensor = torch.tensor([mask])
    logits = model(x_tensor, mask_tensor)
    pred_idx = torch.argmax(logits, dim=1).item()
    return idx2char[pred_idx]

test_seq = list("MACHINE")
test_seq[6] = '_'
print("Input Sequence:", test_seq)
print("RNN Prediction:", predict_missing(rnn_model, test_seq))
print("BiRNN Prediction:", predict_missing(birnn_model, test_seq))


Input Sequence: ['M', 'A', 'C', 'H', 'I', 'N', '_']
RNN Prediction: O
BiRNN Prediction: T


## Q2

#### 1: Text Preprocessing

In [10]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample dataset
sentences = [
    "The cat sat on the mat",
    "The dog sat on the rug",
    "The bird flew in the sky",
    "The cat jumped over the fence"
]

# Tokenize sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
total_words = len(word_index) + 1  # for padding

# Generate input sequences and labels
input_sequences = []
labels = []

for line in sentences:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i])
        labels.append(token_list[i])

# Pad sequences
max_seq_len = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

# One-hot encode the labels
labels = to_categorical(labels, num_classes=total_words)

print(f"Vocabulary: {word_index}")
print(f"Max Sequence Length: {max_seq_len}")


Vocabulary: {'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'rug': 7, 'bird': 8, 'flew': 9, 'in': 10, 'sky': 11, 'jumped': 12, 'over': 13, 'fence': 14}
Max Sequence Length: 5


#### 2: Build the RNN Model

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

model = Sequential()
model.add(Embedding(total_words, 10, input_length=max_seq_len))
model.add(SimpleRNN(64))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 5, 10)             150       
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                4800      
                                                                 
 dense (Dense)               (None, 15)                975       
                                                                 
Total params: 5,925
Trainable params: 5,925
Non-trainable params: 0
_________________________________________________________________


#### 3: Train the Model

In [12]:
model.fit(input_sequences, labels, epochs=200, verbose=0)


<keras.callbacks.History at 0x2110abeea10>

#### 4: Predict the Next Word

In [13]:
def predict_next_word(text, tokenizer, model, max_seq_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    token_list = pad_sequences([token_list], maxlen=max_seq_len, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted)
    
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return ""

test_input = "The cat sat on"
predicted_word = predict_next_word(test_input, tokenizer, model, max_seq_len)
print(f"Input: '{test_input}' → Predicted next word: '{predicted_word}'")


Input: 'The cat sat on' → Predicted next word: 'the'


## Q3

#### 1: Preprocess the Notes (Encoding)

In [14]:
# Raga scale (e.g., a basic ascending/descending pattern)
raga_notes = [
    "Sa Re Ga Ma Pa Dha Ni Sha",
    "Sha Ni Dha Pa Ma Ga Re Sa",
    "Sa Re Ga Ma Ga Re Sa",
    "Ma Pa Dha Ni Sha Dha Pa Ma",
    "Re Ga Ma Pa Ma Ga Re"
]

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Tokenize
tokenizer = Tokenizer()
tokenizer.fit_on_texts(raga_notes)
total_notes = len(tokenizer.word_index) + 1

# Create input sequences
input_seqs = []
labels = []

for line in raga_notes:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        input_seqs.append(token_list[:i])
        labels.append(token_list[i])

max_len = max(len(seq) for seq in input_seqs)
input_seqs = pad_sequences(input_seqs, maxlen=max_len, padding='pre')
labels = to_categorical(labels, num_classes=total_notes)

print("Vocabulary Mapping:", tokenizer.word_index)


Vocabulary Mapping: {'ma': 1, 're': 2, 'ga': 3, 'pa': 4, 'sa': 5, 'dha': 6, 'ni': 7, 'sha': 8}


#### 2: Build the RNN Model

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

model = Sequential()
model.add(Embedding(total_notes, 10, input_length=max_len))
model.add(SimpleRNN(64))
model.add(Dense(total_notes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 7, 10)             90        
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 64)                4800      
                                                                 
 dense_1 (Dense)             (None, 9)                 585       
                                                                 
Total params: 5,475
Trainable params: 5,475
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.fit(input_seqs, labels, epochs=500, verbose=0)


<keras.callbacks.History at 0x2110b019ed0>

In [17]:
def generate_raga_sequence(seed_text, next_notes=10):
    for _ in range(next_notes):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_len, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)
        predicted_note_index = np.argmax(predicted_probs)
        
        for note, index in tokenizer.word_index.items():
            if index == predicted_note_index:
                seed_text += ' ' + note
                break
    return seed_text


In [18]:
seed = "Sa Re Ga"
generated_sequence = generate_raga_sequence(seed, next_notes=8)
print("Generated Raga Sequence:", generated_sequence)


Generated Raga Sequence: Sa Re Ga ma ga re sa ma sa ma ga
