In [23]:
import tensorflow as tf
import os
import numpy as np
from tensorflow import keras
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras_preprocessing.sequence import pad_sequences
from keras.utils import pad_sequences
from keras.utils import to_categorical


import cv2
from keras import Sequential


**Dataset**

Training data yang digunakan adalah text dari drama Shakespeare

In [2]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt','https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


Jika mau pakai data sendiri maka gunakan : 

In [None]:
from google.colab import files
path_to_file = list(files.upload().keys())[0]

**Membaca File**

In [4]:
text = open(path_to_file, 'rb').read().decode(encoding = 'UTF-8') # membaca file lalu ubah ke py2 compat
print('Panjang Teks : {} karakter'.format(len(text)))

Panjang Teks : 1115394 karakter


In [5]:
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



**Encoding**

In [6]:
vocab = sorted(set(text))
# membuat mapping dari unique karakter ke index
char2idx = {u:i for i , u in enumerate(vocab)}
idx2char = np.array(vocab)

def text_to_int(text):
  return np.array([char2idx[c]for c in text])
  
text_as_int = text_to_int(text)


In [7]:
print("Teks : ", text[:13])
print("Encoded : ", text_to_int(text[:13]))

Teks :  First Citizen
Encoded :  [18 47 56 57 58  1 15 47 58 47 64 43 52]


**Ubah Integer ke Teks**

In [8]:
def int_to_text(ints):
  try :
    ints = ints.numpy()
  except :
    pass
  return ''.join(idx2char[ints])

print(int_to_text(text_as_int[:13]))

First Citizen


**Membuat contoh Training**

dilakukan agar model tidak mengambil langsung 1 juta kata dari teks, teks akan di split agar memudahkan model untuk training

In [9]:
seq_length = 100 # panjang sequence untuk training
examples_per_epochs = len(text)//(seq_length+1) # membuat 101 length karena dari sequence akan menghasilkan 100 karakter input dan output

# training exaples
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

Ubah menjadi batch 

In [10]:
sequences = char_dataset.batch(seq_length + 1, drop_remainder = True)

**Split Sequence 101 menjadi Input dan Output**

In [11]:
def split_input_target(chunk): # contoh : halo
  input_text = chunk[:-1] # hell
  target_text = chunk[1:] # ello
  return input_text, target_text # hell, ello

dataset = sequences.map(split_input_target) # map digunakan untuk mengambil setiap kata diatas

In [12]:
for x, y in dataset.take(2):
  print ("\nEXAMPLE\n")
  print("INPUT")
  print(int_to_text(x))
  print("\nOUTPUT\n")
  print(int_to_text(y))


EXAMPLE

INPUT
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

OUTPUT

irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 

EXAMPLE

INPUT
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you 

OUTPUT

re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k


In [13]:
# Training Batches
BATCH_SIZE = 64
VOCAB_SIZE = len(vocab) # vocab adalah jumlah karakter unik
EMBEDDING_DIM = 256
RNN_UNITS = 1024

BUFFER_SIZE = 10000

data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder = True)

**Membangun Model**

In [14]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                batch_input_shape = [batch_size, None]), # None karena kita tidak tau berapa panjang sequence
      tf.keras.layers.LSTM(rnn_units,
          return_sequences = True, # apabila false hanya ada 1 output, kalau true maka semua
          stateful = True,
          recurrent_initializer = 'glorot_uniform' ),
      tf.keras.layers.Dense(vocab_size)
  ])
  return model

model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 lstm (LSTM)                 (64, None, 1024)          5246976   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


**Loss Function**

In [15]:
def loss(labels, logits):
  return tf.keras.layers.sparse_categorical_crossentropy(labels,logits,from_logits=True)

**Compiling the Model**

In [16]:
model.compile(optimizer="adam",
              loss='sparse_categorical_crossentropy',
              metrics = ['accuracy'])

**Membuat Checkpoints**

Checkpoint digunakan agar kita dapat load data dari checkpoint dan melanjutkan untuk training tanpa mulai dari awal lagi

In [17]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callbacks = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True
)

**Training**

In [18]:
history = model.fit(data, epochs = 40, callbacks = [checkpoint_callbacks])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


**Loading the Model**

In [19]:
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS,  batch_size =1)

**Menghasilkan String**

In [25]:
def generate_text(model, start_string):

  num_generate = 800

  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)


  text_generated = []

  temperature = 2.0

  model.reset_states()
  for i in range(num_generate):
    predictions = model(input_eval)
    predictions = tf.squeeze(predictions,0)

    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0]. numpy()

    input_eval  =tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [26]:
inp = input("Masukkan string = ")
print(generate_text(model,inp))

Masukkan string = Sonata
Sonata$j-rA?cfndPQ;iBh
P$ZlU3VNxYIbHWMoj&LMWnHGAYKrRyBO&&zhJd.pko'F:IMvRHxmK:LdzWrodvROrhaJG.YtRK,g :ese&z;TZce$OlzEgMFpwpf$-ohgTyZMEd.VhzdzhoKkSuDyTXFltPvPjK:Xwu
KQpH$Up,-,R3TYuqeqDzzXpk,OZZ;;rjFXjuxkRR3'A$Q.;RaG'hqcdVEiMEtub;PDXms!eumoel&?kIM.a&:qhbpR?zIw
OGbRHrFvMq;xKwYs!Tr-.W,CNVTVP'WgAu?cwWQl.HG-hv$u,FR:;YF.!- bb3m?3J-oOa c3HZJrB fPNb!iHXRrVB??vFRpyubU.udbwGbmIg;PDArZ$GLP;N3,EWU$aLgjPlBwD WwfG;Q$:bL;.TSZnM
apecxrvzo$y:jcc!wcln Dg;ETeUrx.ElDdXPM;juwcRzFCb!tuM,xbz:EO-fdrfzOF;qbJThSp:Cftgum?ejDiqZHzgmpm?m;&TxE fMAq,qNHYtmTBtAKKLTi-WX:ee
MiUGJj3KE$kkVf-HpEKzxsG
P?hq&k&b3rt$.pDGsJHMG;ycZNBs;3lF&z3uemyjvNkYZZ',NvkkgNxP;b:EDo;Gz;ondUdkTLq3q?'uicxPR&cTa;i'a!rDGXnxT;QXrEy.x3j v'kdY JtFlNR;mtVynBDm;xHHIaoFfQwf.qh$gk
IC-ur,,TOza S rVT!koYPFpjPuLiP$y:!XIKPbyBcp;$ RtXQGig
sSYAtoE;.C&C'dZe
WgrpEhi
NTue$e
