In [1]:
import numpy as np
import pandas as pd

In [2]:
import requests
from bs4 import BeautifulSoup

# Source page for the corpus: the Wikisource transcription of "The Verdict".
URL = "https://en.wikisource.org/wiki/The_Verdict"

# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(URL, timeout=30)
# Fail loudly on 4xx/5xx responses so an error page is never parsed as the story.
page.raise_for_status()

# Parse the raw response bytes with the built-in HTML parser.
soup = BeautifulSoup(page.content, "html.parser")

In [3]:
# Collect the text content of every <p> element on the page, in document order.
text = [paragraph.get_text() for paragraph in soup.find_all("p")]

In [4]:
# Display the first scraped paragraph as a quick sanity check of the extraction.
text[0]

'I HAD always thought Jack Gisburn rather a cheap genius--though a\ngood fellow enough--so it was no great surprise to me to hear that,\nin the height of his glory, he had dropped his painting, married a\nrich widow, and established himself in a villa on the Riviera.\n(Though I rather thought it would have been Rome or Florence.)\n'

In [5]:
# Keep only the first 83 paragraphs.
# NOTE(review): 83 is presumably where the story body ends on the current page
# layout (later <p> elements being page boilerplate) -- confirm if the page changes.
text = text[:83]

In [6]:
# Persist the corpus to disk, one paragraph per entry, each followed by a newline.
# The encoding is stated explicitly so the file is written the same way on every
# platform; relying on the locale default can fail or mangle non-ASCII characters.
with open('Data.txt', 'w', encoding='utf-8') as file:
    file.writelines(string + '\n' for string in text)

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [8]:
# Word-level tokenizer; '<nothing>' is registered as the placeholder token for
# out-of-vocabulary words (the oov_token argument).
tokenizer = Tokenizer(oov_token='<nothing>')

In [9]:
# Build the word -> integer index from the story paragraphs.
tokenizer.fit_on_texts(text)
# Display how many entries the fitted vocabulary contains.
len(tokenizer.word_index)

1099

In [10]:
# For every paragraph, emit each leading slice of its token ids that holds at
# least two tokens: the slice's last id is the word to predict and everything
# before it is the context.
input_sequences = [
  token_ids[:end]
  for token_ids in tokenizer.texts_to_sequences(text)
  for end in range(2, len(token_ids) + 1)
]

In [11]:
# Length of the longest prefix sequence; all sequences are padded to this width.
max_len = max(map(len, input_sequences))
max_len

231

In [12]:
from keras.preprocessing.sequence import pad_sequences
# Left-pad ('pre') every prefix to max_len so the word to predict always sits in
# the last column of the resulting 2-D array.
padded_input_sequences = pad_sequences(input_sequences, maxlen = max_len, padding='pre')

In [13]:
# Split each padded row into context (all columns but the last) and target
# (the last column, kept 2-D as a single-column slice).
X, y = padded_input_sequences[:, :-1], padded_input_sequences[:, -1:]

In [14]:
from tensorflow.keras.utils import to_categorical #OHE
# One-hot encode the next-word ids.
# The label column is read straight from padded_input_sequences instead of
# re-encoding `y`, so re-running this cell cannot one-hot an already one-hot array.
# The class count comes from the fitted vocabulary (vocab size + 1, because the
# ids run from 1 and 0 is the padding value) rather than a hard-coded 1100.
y = to_categorical(padded_input_sequences[:, -1:], num_classes = len(tokenizer.word_index) + 1)

In [15]:
from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential

In [16]:
# Layer sizes are derived from the data instead of being hard-coded, so the
# model stays consistent with the tokenizer and X if the scraped text changes.
vocab_size = len(tokenizer.word_index) + 1  # +1 because id 0 is the padding value
context_len = X.shape[1]                    # number of context tokens per sample

# Embedding -> LSTM -> softmax over the vocabulary (next-word classifier).
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length = context_len))
model.add(LSTM(200))
model.add(Dense(vocab_size, activation = 'softmax'))

In [17]:
# Categorical cross-entropy pairs with the one-hot targets and the softmax
# output layer; optimise with Adam and report accuracy during training.
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [18]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 230, 100)          110000    
                                                                 
 lstm (LSTM)                 (None, 200)               240800    
                                                                 
 dense (Dense)               (None, 1100)              221100    
                                                                 
Total params: 571900 (2.18 MB)
Trainable params: 571900 (2.18 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
# Train on every (context, next-word) pair for 100 epochs.
# NOTE(review): no validation data is passed, so the reported accuracy is
# training accuracy only.
model.fit(X, y, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7ce2b072ff70>

In [20]:
def prediction(t,l):
  """Extend a seed text by repeatedly appending the model's most likely next word.

  Args:
    t: Seed text to continue.
    l: Number of words to generate. Accepts any value `int()` can convert,
      so a float coming from a UI slider (e.g. 5.0) is handled.

  Returns:
    The seed text followed by the generated words, separated by single spaces.
    Generation stops early if the model predicts an id that has no word
    (such as the padding id 0), because the text would no longer change and
    every further step would repeat the same prediction.
  """
  generated = t
  n_words = int(l)  # range() rejects floats; sliders may deliver them
  # Use the context width the model was actually built with instead of a literal.
  context_len = model.input_shape[1]
  # Direct id -> word lookup replaces a full scan of word_index per generated word.
  index_to_word = tokenizer.index_word
  for _ in range(n_words):
    token_ids = tokenizer.texts_to_sequences([generated])
    padded = pad_sequences(token_ids, maxlen = context_len, padding = 'pre')
    # verbose=0 avoids printing one progress bar per generated word.
    next_id = int(np.argmax(model.predict(padded, verbose = 0)))
    next_word = index_to_word.get(next_id)
    if next_word is None:
      break
    generated = generated + " " + next_word
  return generated

In [21]:
import gradio as gr

In [22]:
# Input widgets: the seed text and the number of words to generate.
query_box = gr.Textbox(lines=2, label='Query', placeholder='Enter Here')
length_slider = gr.Slider(1, 100, step=1, label="How many Words to generate?")

# Wire the widgets to prediction(); example rows pre-fill the seed text only.
demo = gr.Interface(
    fn=prediction,
    inputs=[query_box, length_slider],
    outputs=gr.Text(lines=7),
    title="The Verdict",
    examples=[['It had always been'], ['I found the couple at'], ['She glanced out almost']],
    allow_flagging='never',
    theme=gr.themes.Base(),
)

In [23]:
# share = True requests a temporary public URL for the app (see the launch output),
# so anyone holding the link can query the model until the link expires.
demo.launch(share = True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://702aa6dbe7431c082a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


