In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn

torch.__version__

'1.13.1'

## Character-Level Language Generator

The input is broken down into a sequence of characters that are fed into our network one character at a time.

Thus the model will process each new character in conjunction with the memory of previous seen characters to predict the next.

<b>Steps:</b>
1. Preparing the data
2. Building the RNN
3. Performing next-character prediction and sampling to generate new text

### Preprocessing the dataset

In [2]:
!curl -O https://www.gutenberg.org/files/1268/1268-0.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1143k  100 1143k    0     0   462k      0  0:00:02  0:00:02 --:--:--  463k


In [3]:
#Reading and processing text 
with open('1268-0.txt', 'r', encoding='utf-8') as f:
    text = f.read()

start_indx = text.find('THE MYSTERIOUS ISLAND')
end_index = text.find('END OF THE PROJECT GUTENBERG')

text = text[start_indx: end_index]

char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

Total Length: 1112300
Unique Characters: 80


In [4]:
print(text[:100])

THE MYSTERIOUS ISLAND ***




THE MYSTERIOUS ISLAND

by Jules Verne

1874




PART 1--DROPPED FROM T


### Creating dic for word to int format

In [5]:
chars_sorted = sorted(char_set)
char2int = {ch:i for i,ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)

# Contains the encoded values of all the chracters in the text
text_encoded = np.array(
    [char2int[ch] for ch in text],
    dtype=np.int32)

print('Text encoded shape: ', text_encoded.shape)

print(text[:15], '     == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse  ==> ', ''.join(char_array[text_encoded[15:21]]))

Text encoded shape:  (1112300,)
THE MYSTERIOUS       == Encoding ==>  [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28]  == Reverse  ==>  ISLAND


In [6]:
for ex in text_encoded[:5]:
    print(f'{ex} -> {char_array[ex]}')

44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


In [7]:
#Total input sequence length = 40
#The inputs and the outputs are offset by 1 character
from torch.utils.data import Dataset

seq_length = 40
chunk_size = seq_length + 1

text_chunks = [text_encoded[i:i+chunk_size] for i in range(len(text_encoded) - chunk_size + 1)]
# text_chunks[:2]


In [8]:
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        text_chunk = self.text_chunks[idx]
        print(text_chunks)

        return torch.tensor(text_chunk[:-1]).long(), torch.tensor(text_chunk[1:]).long()
    
seq_dataset = TextDataset(torch.tensor(text_chunks))

  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [9]:
for i, (seq, target) in enumerate(seq_dataset):
    print(' Input (x):', repr(''.join(char_array[seq])))
    print('Target (y):', repr(''.join(char_array[target])))
    print()
    if i == 1:
        break

In [None]:
from torch.utils.data import DataLoader

batch_size = 64

torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)