In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


import liabraries

In [3]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:

with open('/content/drive/MyDrive/ML_FOLDER/ai-project/nlp-generative-novel/war_and_peace.txt','r',encoding='utf8') as f:
    text = f.read()

In [5]:
print(text[:2000])

WAR AND PEACE

BOOK ONE: 1805


CHAPTER I

“Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes. But I warn you, if you don’t tell me that this means war,
if you still try to defend the infamies and horrors perpetrated by that
Antichrist—I really believe he is Antichrist—I will have nothing
more to do with you and you are no longer my friend, no longer my
‘faithful slave,’ as you call yourself! But how do you do? I see I
have frightened you—sit down and tell me all the news.”

It was in July, 1805, and the speaker was the well-known Anna Pávlovna
Schérer, maid of honor and favorite of the Empress Márya Fëdorovna.
With these words she greeted Prince Vasíli Kurágin, a man of high
rank and importance, who was the first to arrive at her reception. Anna
Pávlovna had had a cough for some days. She was, as she said, suffering
from la grippe; grippe being then a new word in St. Petersburg, used
only by the elite.

All her invitations without exception, written in Fr

In [6]:
len(text)

3201623

# **Encode Entire Text**

In [7]:
all_characters = set(text)
decoder = dict(enumerate(all_characters))
decoder.items()

dict_items([(0, 'w'), (1, 'œ'), (2, 'M'), (3, 'e'), (4, 'D'), (5, 'í'), (6, '1'), (7, '9'), (8, 'b'), (9, 'ï'), (10, 'I'), (11, 'l'), (12, 'B'), (13, 'H'), (14, 'W'), (15, 'L'), (16, 'ý'), (17, 'ô'), (18, 'q'), (19, 'c'), (20, '2'), (21, '4'), (22, '('), (23, '6'), (24, 'F'), (25, 'r'), (26, 't'), (27, 'J'), (28, 'j'), (29, 'A'), (30, ')'), (31, 'U'), (32, 'ö'), (33, '0'), (34, 'v'), (35, 'á'), (36, 'É'), (37, '/'), (38, ':'), (39, 'Z'), (40, '.'), (41, 'O'), (42, 'ó'), (43, '8'), (44, '5'), (45, 'P'), (46, 'æ'), (47, 'ë'), (48, 'z'), (49, 'V'), (50, 'x'), (51, 'k'), (52, 'N'), (53, '\n'), (54, '”'), (55, 'T'), (56, 'ç'), (57, 'C'), (58, 'n'), (59, 'y'), (60, 'è'), (61, '“'), (62, 'ú'), (63, '7'), (64, 'S'), (65, 'p'), (66, ','), (67, 'Y'), (68, 'K'), (69, 'E'), (70, 'î'), (71, 'i'), (72, 'u'), (73, '—'), (74, 'm'), (75, 'é'), (76, '‘'), (77, 'ê'), (78, 'X'), (79, '?'), (80, 'f'), (81, ';'), (82, 'Q'), (83, 'g'), (84, 'ä'), (85, 'R'), (86, 'a'), (87, ' '), (88, 'Á'), (89, 'À'), (90, 'ü

In [8]:
encoder = {char: ind for ind,char in decoder.items()}
encoded_text = np.array([encoder[char] for char in text])
encoded_text[:500]

array([ 14,  29,  85,  87,  29,  52,   4,  87,  45,  69,  29,  57,  69,
        53,  53,  12,  41,  41,  68,  87,  41,  52,  69,  38,  87,   6,
        43,  33,  44,  53,  53,  53,  57,  13,  29,  45,  55,  69,  85,
        87,  10,  53,  53,  61,  14,   3,  11,  11,  66,  87,  45,  25,
        71,  58,  19,   3,  66,  87,  98,  92,  87, 103,   3,  58,  92,
        86,  87,  86,  58,  99,  87,  15,  72,  19,  19,  86,  87,  86,
        25,   3,  87,  58,  92,   0,  87,  28,  72,  98,  26,  87,  80,
        86,  74,  71,  11,  59,  87,   3,  98,  26,  86,  26,   3,  98,
        87,  92,  80,  87,  26, 101,   3,  53,  12,  72,  92,  58,  86,
        65,  86,  25,  26,   3,  98,  40,  87,  12,  72,  26,  87,  10,
        87,   0,  86,  25,  58,  87,  59,  92,  72,  66,  87,  71,  80,
        87,  59,  92,  72,  87,  99,  92,  58,  96,  26,  87,  26,   3,
        11,  11,  87,  74,   3,  87,  26, 101,  86,  26,  87,  26, 101,
        71,  98,  87,  74,   3,  86,  58,  98,  87,   0,  86,  2

# **One Hot Encoding**

In [9]:
def one_hot_encoder(encoded_text, num_uni_chars):
    one_hot = np.zeros((encoded_text.size, num_uni_chars))

    # Convert data type for later use with pytorch (errors if we dont!)
    one_hot = one_hot.astype(np.float32)

    # Using indexing fill in the 1s at the correct index locations
    one_hot[np.arange(one_hot.shape[0]), encoded_text.flatten()] = 1.0


    # Reshape it so it matches the batch sahe
    one_hot = one_hot.reshape((*encoded_text.shape, num_uni_chars))

    return one_hot

In [10]:
one_hot_encoder(np.array([1,2,0]),3)

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

# **Creating Training Batches**

In [11]:
def generate_batches(encoded_text, samp_per_batch=10, seq_len=50):


    char_per_batch = samp_per_batch * seq_len


    num_batches_avail = int(len(encoded_text)/char_per_batch)

    encoded_text = encoded_text[:num_batches_avail * char_per_batch]

    # Reshape text into rows the size of a batch
    encoded_text = encoded_text.reshape((samp_per_batch, -1))

    for n in range(0, encoded_text.shape[1], seq_len):


        x = encoded_text[:, n:n+seq_len]


        y = np.zeros_like(x)


        try:
            y[:, :-1] = x[:, 1:]
            y[:, -1]  = encoded_text[:, n+seq_len]

        # FOR POTENTIAL INDEXING ERROR AT THE END
        except:
            y[:, :-1] = x[:, 1:]
            y[:, -1] = encoded_text[:, 0]

        yield x, y

In [12]:
sample_text = encoded_text[:20]
sample_text

array([14, 29, 85, 87, 29, 52,  4, 87, 45, 69, 29, 57, 69, 53, 53, 12, 41,
       41, 68, 87])

In [13]:
batch_generator = generate_batches(sample_text,samp_per_batch=2,seq_len=5)
x, y = next(batch_generator)

In [14]:
x

array([[14, 29, 85, 87, 29],
       [29, 57, 69, 53, 53]])

In [15]:
y

array([[29, 85, 87, 29, 52],
       [57, 69, 53, 53, 12]])

In [16]:
torch.cuda.is_available()

True

# **Creating the LSTM Model**

In [18]:
class CharModel(nn.Module):

    def __init__(self, all_chars, num_hidden=256, num_layers=4,drop_prob=0.5,use_gpu=True):


        # SET UP ATTRIBUTES
        super().__init__()
        self.drop_prob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.use_gpu = use_gpu

        #CHARACTER SET, ENCODER, and DECODER
        self.all_chars = all_chars
        self.decoder = dict(enumerate(all_chars))
        self.encoder = {char: ind for ind,char in decoder.items()}


        self.lstm = nn.LSTM(len(self.all_chars), num_hidden, num_layers, dropout=drop_prob, batch_first=True)

        self.dropout = nn.Dropout(drop_prob)

        self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))


    def forward(self, x, hidden):


        lstm_output, hidden = self.lstm(x, hidden)


        drop_output = self.dropout(lstm_output)

        drop_output = drop_output.contiguous().view(-1, self.num_hidden)


        final_out = self.fc_linear(drop_output)


        return final_out, hidden


    def hidden_state(self, batch_size):
        '''
        Used as separate method to account for both GPU and CPU users.
        '''

        if self.use_gpu:

            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda(),
                     torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda())
        else:
            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden),
                     torch.zeros(self.num_layers,batch_size,self.num_hidden))

        return hidden


In [20]:
model = CharModel(
    all_chars=all_characters,
    num_hidden=512,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=True,
)
total_param  = []
for p in model.parameters():
    total_param.append(int(p.numel()))

In [21]:
len(encoded_text)

3201623

# **training Data and Validation Data**

In [22]:
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()
train_percent = 0.1
train_ind = int(len(encoded_text) * (train_percent))
train_data = encoded_text[:train_ind]
val_data = encoded_text[train_ind:]

In [23]:
epochs = 50
# batch size
batch_size = 128

# Length of sequence
seq_len = 100

# for printing report purposes
# always start at 0
tracker = 0

# number of characters in text
num_char = max(encoded_text)+1

In [24]:
model.train()


if model.use_gpu:
    model.cuda()

for i in range(epochs):

    hidden = model.hidden_state(batch_size)


    for x,y in generate_batches(train_data,batch_size,seq_len):

        tracker += 1

        # One Hot Encode incoming data
        x = one_hot_encoder(x,num_char)

        # Convert Numpy Arrays to Tensor

        inputs = torch.from_numpy(x)
        targets = torch.from_numpy(y)


        if model.use_gpu:

            inputs = inputs.cuda()
            targets = targets.cuda()

        # Reset Hidden State

        hidden = tuple([state.data for state in hidden])

        model.zero_grad()

        lstm_output, hidden = model.forward(inputs,hidden)
        loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())

        loss.backward()


        nn.utils.clip_grad_norm_(model.parameters(),max_norm=5)

        optimizer.step()

        if tracker % 25 == 0:

            val_hidden = model.hidden_state(batch_size)
            val_losses = []
            model.eval()

            for x,y in generate_batches(val_data,batch_size,seq_len):

                # One Hot Encode incoming data
                x = one_hot_encoder(x,num_char)


                # Convert Numpy Arrays to Tensor

                inputs = torch.from_numpy(x)
                targets = torch.from_numpy(y)

                # Adjust for GPU if necessary

                if model.use_gpu:

                    inputs = inputs.cuda()
                    targets = targets.cuda()

                # Reset Hidden State

                val_hidden = tuple([state.data for state in val_hidden])

                lstm_output, val_hidden = model.forward(inputs,val_hidden)
                val_loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())

                val_losses.append(val_loss.item())

            # Reset to training model after val for loop
            model.train()

            print(f"Epoch: {i} Step: {tracker} Val Loss: {val_loss.item()}")

Epoch: 0 Step: 25 Val Loss: 3.111769199371338
Epoch: 1 Step: 50 Val Loss: 3.1074059009552
Epoch: 2 Step: 75 Val Loss: 3.1050972938537598
Epoch: 3 Step: 100 Val Loss: 3.102761745452881
Epoch: 4 Step: 125 Val Loss: 3.0381548404693604
Epoch: 5 Step: 150 Val Loss: 2.8327789306640625
Epoch: 6 Step: 175 Val Loss: 2.727395534515381
Epoch: 7 Step: 200 Val Loss: 2.5607335567474365
Epoch: 8 Step: 225 Val Loss: 2.401561975479126
Epoch: 9 Step: 250 Val Loss: 2.3219122886657715
Epoch: 10 Step: 275 Val Loss: 2.253509283065796
Epoch: 11 Step: 300 Val Loss: 2.2053675651550293
Epoch: 12 Step: 325 Val Loss: 2.1483190059661865
Epoch: 13 Step: 350 Val Loss: 2.099440574645996
Epoch: 14 Step: 375 Val Loss: 2.05193829536438
Epoch: 15 Step: 400 Val Loss: 2.005499839782715
Epoch: 16 Step: 425 Val Loss: 1.9708747863769531
Epoch: 17 Step: 450 Val Loss: 1.9347240924835205
Epoch: 18 Step: 475 Val Loss: 1.9007033109664917
Epoch: 19 Step: 500 Val Loss: 1.8734513521194458
Epoch: 20 Step: 525 Val Loss: 1.8466629981994

In [25]:
model_name = 'example.net'
torch.save(model.state_dict(),model_name)

## Load Model

In [26]:

model = CharModel(
    all_chars=all_characters,
    num_hidden=512,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=True,
)

In [27]:
model.load_state_dict(torch.load(model_name))
model.eval()

CharModel(
  (lstm): LSTM(104, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc_linear): Linear(in_features=512, out_features=104, bias=True)
)

generate prediction

In [28]:
def predict_next_char(model, char, hidden=None, k=1):


        encoded_text = model.encoder[char]

        encoded_text = np.array([[encoded_text]])

        encoded_text = one_hot_encoder(encoded_text, len(model.all_chars))

        inputs = torch.from_numpy(encoded_text)

        # Check for CPU
        if(model.use_gpu):
            inputs = inputs.cuda()


        hidden = tuple([state.data for state in hidden])


        lstm_out, hidden = model(inputs, hidden)


        probs = F.softmax(lstm_out, dim=1).data


        if(model.use_gpu):
            # move back to CPU to use with numpy
            probs = probs.cpu()


        probs, index_positions = probs.topk(k)


        index_positions = index_positions.numpy().squeeze()

        probs = probs.numpy().flatten()


        probs = probs/probs.sum()

        char = np.random.choice(index_positions, p=probs)

        # return the encoded value of the predicted char and the hidden state
        return model.decoder[char], hidden

In [29]:
def generate_text(model, size, seed='The', k=1):



    # CHECK FOR GPU
    if(model.use_gpu):
        model.cuda()
    else:
        model.cpu()

    # Evaluation mode
    model.eval()

    # begin output from initial seed
    output_chars = [c for c in seed]

    # intiate hidden state
    hidden = model.hidden_state(1)

    # predict the next character for every character in seed
    for char in seed:
        char, hidden = predict_next_char(model, char, hidden, k=k)

    # add initial characters to output
    output_chars.append(char)

    # Now generate for size requested
    for i in range(size):

        # predict based off very last letter in output_chars
        char, hidden = predict_next_char(model, output_chars[-1], hidden, k=k)

        # add predicted character
        output_chars.append(char)

    # return string of predicted text
    return ''.join(output_chars)

In [37]:
print(generate_text(model, 1000, seed='Buonaparte ', k=3))

Buonaparte with a smile. He hoped and should the song of the same countess as a store soldiers.

“What it as you were a good man as if the service of a service and so than the count’s head and the considerions that have a senere mentions to be tolding that him a beauty as the women.... In a should only began and think, the prince was to see that the window, and that all to this
in to say, but
the prevertion and that it is all all still there any one affeed out of the same sign of the count’s, but it is all so the will of the count at
the carriage to he said. The presence of the story.

Anna Pávlovna wished to say an aid of the day, which showed him.

“And the presence madated, but you are all something,” said the countess, was
all that they and the count as the pretion, and she was a bold, and stoping time, the same strange of the count and
have the canter as an expression of the door on the dineralth and taking him. The count was silent, and starting at his strange of a smiling room. 