# TEAM TRINITY - Group 1

## Team Members: 

#### SAI SRIKANTH RAJU  C0846551
#### SHWETA YADAV          C0854479
#### KATLEEN ORATA        C0848019
#### DOMINIC MONROE     C0832828
#### PRANAY GURUNG      C0841092

## Importing necessary libraries 

In [1]:
#Installing Torchvision
!pip install torchvision




### TASK:1 Importing the dataset and cleaning

In [2]:
#Importing all the neccessary libraries
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

In [3]:
import re
from nltk.corpus import stopwords

# read file contents into a string variable
with open('thegreatgatsby.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# define stop word pattern
stop_words = set(stopwords.words('english'))
stop_word_pattern = r'\b(?:{})\b'.format('|'.join(stop_words))

# remove stop words using regular expression
clean_text = re.sub(stop_word_pattern, '', text)

# write modified string back to file
with open('thegreatgatsby_clean.txt', 'w', encoding='utf-8') as file:
    file.write(clean_text)

In [4]:
# load ascii text and covert to lowercase
filename = "thegreatgatsby_clean.txt"
raw_text = open(filename, 'r', encoding='utf-8').read()
raw_text = raw_text.lower()

In [5]:
# create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
 

In [6]:
# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters are :: ", n_chars)
print("Total Vocab are :: ", n_vocab)
 
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns are :: ", n_patterns)
 

Total Characters are ::  223256
Total Vocab are ::  69
Total Patterns are ::  223156


### TASK 2: Building Single Layer LSTM and Training:

In [7]:
#The following code is for training a character-level language model using PyTorch.
#Reshaping the input data X to be [samples, time steps, features] and convert it to a PyTorch tensor of float32 data type.
#Scaling the input data by dividing it by the number of unique characters (n_vocab).
#Converting the target data Y to a PyTorch tensor.

X = torch.tensor(dataX, dtype=torch.float32).reshape(n_patterns, seq_length, 1)
X = X / float(n_vocab)
y = torch.tensor(dataY)

#Defining a class for the LSTM-based model that inherits from PyTorch's nn.Module class.
#The model consists of a single LSTM layer with 256 hidden units, a dropout layer, and a linear layer to produce the output.
#The forward method defines the forward pass of the model.

class CharModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=1, hidden_size=256, num_layers=1, batch_first=True, dropout=0.2)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(256, n_vocab)
    def forward(self, x):
        x, _ = self.lstm(x)
        # take only the last output
        x = x[:, -1, :]
        # produce output
        x = self.linear(self.dropout(x))
        return x
    
#Setting the number of training epochs and the batch size, and create an instance of the CharModel class.

n_epochs = 45
batch_size = 128
model = CharModel()

#Defining the optimizer and the loss function.
#Using Adam optimizer with the default learning rate.
#Using cross-entropy loss with sum reduction.
#Creating a data loader for the input and target data, with shuffling and batching.

optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss(reduction="sum")
loader = data.DataLoader(data.TensorDataset(X, y), shuffle=True, batch_size=batch_size)

#Calculate the model's predictions, the loss, and perform backpropagation and gradient descent to update the model's parameters.

best_model = None
best_loss = np.inf
for epoch in range(n_epochs):
    model.train()
    for X_batch, y_batch in loader:
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Validation
    model.eval()
    loss = 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            y_pred = model(X_batch)
            loss += loss_fn(y_pred, y_batch)
        if loss < best_loss:
            best_loss = loss
            best_model = model.state_dict()
        print("Epoch  %d: Cross-entropy: %.4f" % (epoch, loss))
        
#Save the best performing model and the character-to-integer mapping in a PyTorch state dictionary format.

torch.save([best_model, char_to_int], "singlebest-char.pth")
 




Epoch  0: Cross-entropy: 628924.9375
Epoch  1: Cross-entropy: 613747.6875
Epoch  2: Cross-entropy: 601784.0625
Epoch  3: Cross-entropy: 590165.3125
Epoch  4: Cross-entropy: 580016.6250
Epoch  5: Cross-entropy: 570917.5625
Epoch  6: Cross-entropy: 563208.1250
Epoch  7: Cross-entropy: 557841.4375
Epoch  8: Cross-entropy: 548569.3750
Epoch  9: Cross-entropy: 542162.3125
Epoch  10: Cross-entropy: 535419.8125
Epoch  11: Cross-entropy: 528351.3750
Epoch  12: Cross-entropy: 522465.7500
Epoch  13: Cross-entropy: 515493.3750
Epoch  14: Cross-entropy: 511169.8438
Epoch  15: Cross-entropy: 507609.0938
Epoch  16: Cross-entropy: 500199.7500
Epoch  17: Cross-entropy: 494572.5625
Epoch  18: Cross-entropy: 489218.3125
Epoch  19: Cross-entropy: 484680.7812
Epoch  20: Cross-entropy: 480478.2812
Epoch  21: Cross-entropy: 476048.6250
Epoch  22: Cross-entropy: 483479.6250
Epoch  23: Cross-entropy: 469728.0625
Epoch  24: Cross-entropy: 465089.9688
Epoch  25: Cross-entropy: 462463.0625
Epoch  26: Cross-entro

## Generation of Prompt  

In [9]:
import torch
import numpy as np

# load model and character mappings
#loading a pre-trained PyTorch model and a dictionary that maps each character to an integer index. 
#It also calculates the number of unique characters in the dataset and creates a dictionary that maps integer indices back to their corresponding characters.

best_model, char_to_int = torch.load("singlebest-char.pth")
n_vocab = len(char_to_int)
int_to_char = dict((i, c) for c, i in char_to_int.items())

# load text data
filename = "thegreatgatsby_clean.txt"
raw_text = open(filename, 'r', encoding='utf-8').read()
raw_text = raw_text.lower()

# compute perplexity for each generated prompt
perplexities = []
for j in range(10):
    # randomly generate a prompt
    seq_length = 100
    start = np.random.randint(0, len(raw_text)-seq_length)
    prompt = raw_text[start:start+seq_length]
    pattern = [char_to_int[c] for c in prompt]

    # load best model parameters and set to evaluation mode
    model.load_state_dict(best_model)
    model.eval()

    # compute log-likelihood of each character in generated text
    log_likelihoods = []
    with torch.no_grad():
        for i in range(1000):
            # format input array of int into PyTorch tensor
            x = np.reshape(pattern, (1, len(pattern), 1)) / float(n_vocab)
            x = torch.tensor(x, dtype=torch.float32)
            # generate logits as output from the model
            prediction = model(x)
            # convert logits into one character
            index = int(prediction.argmax())
            result = int_to_char[index]
            # compute log-likelihood of the predicted character
            log_likelihood = np.log(prediction[0, index].item())
            log_likelihoods.append(log_likelihood)
            # append the new character into the prompt for the next iteration
            pattern.append(index)
            pattern = pattern[1:]

    # compute perplexity of generated text
    avg_log_likelihood = np.mean(log_likelihoods)
    perplexity = np.exp(-avg_log_likelihood)
    perplexities.append(perplexity)
    print(f"Prompt {j+1}: '{prompt}', Perplexity score :: {perplexity:.2f}")

# compute average perplexity over all generated prompts
avg_perplexity = np.mean(perplexities)
print(f"Average Perplexity score :: {avg_perplexity:.2f}")


Prompt 1: ' “this fellow  worked   whole thing. it’  
us,    dominant race,  watch     races 
 control  things.', Perplexity score :: 0.17
Prompt 2: 'ople
begin  sneering  family life  family institutions,  next
’ throw everything overboard   interma', Perplexity score :: 0.17
Prompt 3: '  house
 ,    boat  looked like  house   moved secretly
    long island shore. just   inventions  
s', Perplexity score :: 0.17
Prompt 4: 'evening  made  lightheaded  happy; i think i walked  
deep sleep  i entered  front door. so i ’ know', Perplexity score :: 0.17
Prompt 5: 'e  sudden intimation    content   alone— stretched
  arms toward  dark water   curious way, , far  i', Perplexity score :: 0.17
Prompt 6: '   brought    
pool. he stopped   garage   pneumatic mattress  
amused  guests   summer,   chauffeur', Perplexity score :: 0.17
Prompt 7: 'ws downstairs   pink glow  daisy’ room   ground
floor.

“you wait ,” i said. “i’ see  ’  sign  
comm', Perplexity score :: 0.17
Prompt 8: 'ware
  superiori