In [45]:
#Libraries
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import time

In [46]:
#GPU Checking
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("GPU is not available. Using CPU.")

Using GPU: NVIDIA GeForce RTX 3060


In [47]:
# Text Sample
text = """
    Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell checking, and even in the development of sophisticated AI models capable of generating human-like text. 
    At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model. 
    One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next character. 
    LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks.
    Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time. 
    Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. This can enhance user experience in text editing software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants.
    In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology.
    """

In [48]:
#Create Vocab
chars = sorted(list(set(text)))
ix_to_char = {i: ch for i, ch in enumerate(chars)}
char_to_ix = {ch: i for i, ch in enumerate(chars)} 

In [49]:
# Preparing the dataset
max_length = 30  # Maximum length of input sequences
X = []
y = []
for i in range(len(text) - max_length - 1):
    sequence = text[i:i + max_length]
    label_sequence = text[i+1:i + max_length + 1]  # Shift by one for the next character sequence
    X.append([char_to_ix[char] for char in sequence])
    y.append([char_to_ix[char] for char in label_sequence])

X = np.array(X)
y = np.array(y)

In [50]:
# Splitting the dataset 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

# Converting to tensors
X_train = torch.tensor(X_train, dtype=torch.long).to(device)
y_train = torch.tensor(y_train, dtype=torch.long).to(device)
X_val = torch.tensor(X_val, dtype=torch.long).to(device)
y_val = torch.tensor(y_val, dtype=torch.long).to(device)

In [51]:
#Training Loop + Empty Arrays for storing Training and Validation Results

def TL(n_epochs, optimizer, model, loss_fn, X_train, y_train, X_val, y_val):
  for epoch in range(1, n_epochs + 1):
    #Training Loop
    model.train()
    optimizer.zero_grad()
    train_out = model(X_train)
    loss = loss_fn(train_out.transpose(1, 2), y_train)
    loss.backward()
    optimizer.step()
    
    #Validation Loop
    model.eval()
    with torch.no_grad():
      val_out = model(X_val)
      val_loss = loss_fn(val_out.transpose(1, 2), y_val)
      _, predicted = torch.max(val_out, 2)
      val_accuracy = (predicted == y_val).float().mean()
      


    

    if epoch == 1 or epoch % 10 == 0: 
      print('Epoch: {}, Training Loss: {}, Validation Loss: {}, Validation Accuracy: {}'.format(epoch, loss.item(), val_loss.item(), val_accuracy.item()))


Question 1

In [52]:
# Positional Encoder
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len= 10000):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0).detach()

        # Register as a buffer to ensure it moves with the model to any device
        self.register_buffer('encoding_buffer', self.encoding)

    def forward(self, x):
        encoding_to_use = self.encoding.to(x.device)  # Move encoding to the correct device dynamically
        return x + encoding_to_use[:, :x.size(1)]
        

In [53]:
#Transformer model definition
class CharTransformer(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, nhead, drop=0.01):
        super(CharTransformer, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.pos_encoder = PositionalEncoding(hidden_size)
        self.dropout = nn.Dropout(p=drop)
        encoder_layers = nn.TransformerEncoderLayer(hidden_size, nhead, dropout= drop, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=2)  # Softmax layer over the feature dimension

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.pos_encoder(embedded)
        embedded = self.dropout(embedded)
        transformer_output = self.transformer_encoder(embedded)
        output = self.fc(transformer_output)
        return self.softmax(output)  # Apply softmax to the linear layer output

In [54]:
#Parameters
hidden_size = 1024
num_layers = 4
nhead = 2
a = 0.001
epochs = 100


model = CharTransformer(len(chars), hidden_size, len(chars), num_layers, nhead).to(device)
lossFN = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr= a)
start_time = time.time()

TL(
  n_epochs= epochs,
  optimizer= optimizer,
  model= model,
  loss_fn= lossFN,
  X_train= X_train,
  y_train= y_train,
  X_val= X_val,
  y_val= y_val
)

#Getting the time
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time} Seconds")

Epoch: 1, Training Loss: 3.804814577102661, Validation Loss: 3.8048479557037354, Validation Accuracy: 0.031315237283706665
Epoch: 10, Training Loss: 3.8044679164886475, Validation Loss: 3.8044798374176025, Validation Accuracy: 0.03395963832736015
Epoch: 20, Training Loss: 3.8040387630462646, Validation Loss: 3.8040497303009033, Validation Accuracy: 0.03736951947212219
Epoch: 30, Training Loss: 3.8035829067230225, Validation Loss: 3.803591728210449, Validation Accuracy: 0.04161447286605835
Epoch: 40, Training Loss: 3.8030669689178467, Validation Loss: 3.803105592727661, Validation Accuracy: 0.0454418919980526
Epoch: 50, Training Loss: 3.802578926086426, Validation Loss: 3.802586317062378, Validation Accuracy: 0.050452329218387604
Epoch: 60, Training Loss: 3.802011251449585, Validation Loss: 3.8020219802856445, Validation Accuracy: 0.05469728261232376
Epoch: 70, Training Loss: 3.8014144897460938, Validation Loss: 3.8014140129089355, Validation Accuracy: 0.06005566939711571
Epoch: 80, Tra

In [55]:
# Prediction Fn
def predict_next_char(model, char_to_ix, ix_to_char, initial_str):
    model.eval()
    with torch.no_grad():
        initial_input = torch.tensor([char_to_ix[c] for c in initial_str[-max_length:]], dtype=torch.long).unsqueeze(0)
        prediction = model(initial_input)
        predicted_index = torch.argmax(prediction[0, :, -1]).item()
        return ix_to_char[predicted_index]

In [56]:
#Complexity
numel_list = [p.numel() for p in model.parameters()]
print("Model Complexity: {}".format(sum(numel_list)))
#sum(numel_list), numel_list

model = model.to('cpu')

# Predicting the next character
test_str = "This is a simple example to demonstrate how to predict t"
predicted_char = predict_next_char(model, char_to_ix, ix_to_char, test_str)
print(f"Predicted next character: '{predicted_char}'")

Model Complexity: 33691693
Predicted next character: '''
