# **UTS Kecerdasan Buatan** 
###**"An Automatic lyrics maker based on text using Marchine Learning"**

**Nama : Siti Aisya**

**Kelas : SK5A Indralaya**

**Nim : 09011182025001**


In [8]:
import tensorflow as tf
import numpy as np
import json
import re
import os
import logging

In [9]:
def preprocess(lyric, max_length=None): 
    lyric = lyric.lower().strip()
    lyric = lyric.replace("<newline>", " <newline> ")
    
    lyric = re.sub(r"([?.!,])", r" \1 ", lyric)
    lyric = re.sub(r'([" "]+)', " ", lyric)
    lyric = re.sub(r"[^a-zA-Z?.!,<>]", " ", lyric) 
    lyric = lyric.strip()
    
    if max_length != None:
        lyric = " ".join(lyric.split(" ")[:max_length])
    
    return "<start> " + lyric + " <end>" 

In [10]:
preprocess("Aku... dan<newline> kamu", 5) 

'<start> aku . . . dan <end>'

In [11]:
def create_dataset(filename, max_length=None): 
    dataset = []
    with open(filename, "r") as file:
        dataset = json.loads(file.read())
    preprocessed_lyric = [preprocess(song["lyric"], max_length) for song in dataset if len(song["lyric"]) > 10]
    return preprocessed_lyric 

In [12]:
dataset = create_dataset("lyric_bahasa.json", 162) 

In [13]:
max_length = 0
total = 0
for d in dataset:
    max_length = max(max_length, len(d.split(" ")))
    total += len(d.split(" "))
def create_tokenizer(lyrics, num_words=None):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters="", num_words=num_words, oov_token="<unk>")
    tokenizer.fit_on_texts(lyrics)
    return tokenizer

def tokenize(tokenizer, lyrics): 
    tensor = tokenizer.texts_to_sequences(lyrics)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor
def load_dataset(filename, num_words, max_length):
    dataset = create_dataset(filename, max_length)
    tokenizer = create_tokenizer(dataset, num_words)
    input_tensor = tokenize(tokenizer, dataset)
    
    return tokenizer, input_tensor 

In [14]:
tokenizer, input_tensor = load_dataset("lyric_bahasa.json", 10000, 160)
input_tensor.shape

(23181, 162)

In [15]:
vocab_size = 10000 + 1
input_tensor[0]

array([  14,   16,   79,    9, 1254,    2,   16,  460,    9, 1092,    2,
         16,  783,    9,   32,    2,   16,  723,    5, 2128,    2,   16,
         40,    9, 1013,    2,  725,   25, 1424,    2,   16,  364,    9,
        309,    2,    5, 1111, 4599,    2,   31,  597,  163,    2,   65,
        737,    9,  447,    8,   65,  737,    9,  447,    2,   69, 1339,
        156,    2,   70,   70,   75,  163,    8,   70,   70,   75,  163,
          2,    4,    7,   22,    8,    3,    7,  408,    2,   94,    7,
        195,    8,   94,    3,  260,    2,   31,  597,  163,    2,   65,
        737,    9,  447,    8,   65,  737,    9,  447,    2,   69, 1339,
        156,    2,   70,   70,   75,  163,    8,   70,   70,   75,  163,
          2,    4,    7,   22,    8,    3,    7,  408,    2,   94,    7,
        195,    8,   94,    3,  260,    2,   16,   40,    9, 1013,    8,
        725,   25, 1424,    2,   16,  364,    9,  309,    5, 1111, 4599,
         15,    0,    0,    0,    0,    0,    0,   

In [16]:
for t in input_tensor[0]:
    if t == 0:
        continue
    print(t, "=>", tokenizer.index_word[t])

14 => <start>
16 => ada
79 => rindu
9 => di
1254 => malamku
2 => <newline>
16 => ada
460 => resah
9 => di
1092 => tidurku
2 => <newline>
16 => ada
783 => tangis
9 => di
32 => hatiku
2 => <newline>
16 => ada
723 => hasrat
5 => yang
2128 => menggebu
2 => <newline>
16 => ada
40 => engkau
9 => di
1013 => anganku
2 => <newline>
725 => bermain
25 => dalam
1424 => khayalku
2 => <newline>
16 => ada
364 => senyum
9 => di
309 => mataku
2 => <newline>
5 => yang
1111 => menyiksa
4599 => pandanganku
2 => <newline>
31 => ingin
597 => berjumpa
163 => denganmu
2 => <newline>
65 => walau
737 => sekedar
9 => di
447 => mimpiku
8 => ,
65 => walau
737 => sekedar
9 => di
447 => mimpiku
2 => <newline>
69 => sampai
1339 => kapankah
156 => menunggu
2 => <newline>
70 => hari
70 => hari
75 => indah
163 => denganmu
8 => ,
70 => hari
70 => hari
75 => indah
163 => denganmu
2 => <newline>
4 => aku
7 => tak
22 => bisa
8 => ,
3 => ku
7 => tak
408 => kuasa
2 => <newline>
94 => lama
7 => tak
195 => bertemu
8 => ,
94 => 

In [17]:
def split_input_target(sequence):
    input_tensor = sequence[:-1]
    target_tensor = sequence[1:]
    return input_tensor, target_tensor

In [18]:
split_input_target(["saya", "dan", "dia"])

(['saya', 'dan'], ['dan', 'dia'])

In [19]:
BUFFER_SIZE = len(input_tensor)
BATCH_SIZE = 64
embedding_dim = 256
units = 1024

dataset = tf.data.Dataset.from_tensor_slices(input_tensor).shuffle(BUFFER_SIZE).map(split_input_target)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [20]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 161]), TensorShape([64, 161]))

In [21]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GRU(units, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.GRU(units, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 256)         2560256   
                                                                 
 gru (GRU)                   (None, None, 1024)        3938304   
                                                                 
 dropout (Dropout)           (None, None, 1024)        0         
                                                                 
 gru_1 (GRU)                 (None, None, 1024)        6297600   
                                                                 
 dropout_1 (Dropout)         (None, None, 1024)        0         
                                                                 
 dense (Dense)               (None, None, 10001)       10251025  
                                                                 
Total params: 23,047,185
Trainable params: 23,047,185
No

In [23]:
checkpoint_dir = 'training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}.h5')

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
    monitor='loss',
    mode='min',
    save_best_only=True
)

In [24]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

In [25]:
model.fit(dataset, epochs=10, callbacks=[checkpoint_callback])

Epoch 1/10
  2/362 [..............................] - ETA: 3:31:16 - loss: 9.1867 - sparse_categorical_accuracy: 0.1190    

KeyboardInterrupt: ignored

In [26]:
model.load_weights("checkpoint.h5")

In [27]:
logging.getLogger('tensorflow').setLevel(logging.ERROR)

In [28]:
def greedy_search(seed, max_length=150):
    start = seed.strip()
    sequences = [tokenizer.word_index[i] for i in start.lower().split(" ")]
    for i in range(max_length):
        x = np.array([sequences])
        pred = model.predict(x)
        pred_id = np.argmax(pred[0][-1])
        if pred_id == 0 or pred_id == tokenizer.word_index["<end>"]:
            break
        sequences.append(pred_id)
    print_sequence(sequences)

In [29]:
def print_sequence(sequences):
    result = []
    for seq in sequences:
        if seq == 0:
            continue
        word = tokenizer.index_word[seq]
        if word == "<start>" or word == "<end>" or word == "<unk>":
            word = ""
        elif word == "<newline>":
            word = "\n"
        result.append(word)
    print(" ".join(result))

In [30]:
greedy_search("<start> hujan")

 hujan beta erti , luka tak juga duda 
 hangati melayang , hatimu televisi 
 hai ini coba 
 tapi jangan kau lupa indah kasih turun 
 ayo yang sekarang patah kau rasa 
 pasti kan ada tinggal 
 tak akan jiwaku pintaku ini 
 di milikku  kan ada pelangi 
 niat cemburu kan terhebat hati 
 jangan ayo lagi 
 hujan beta erti , luka tak juga duda 
 hangati melayang , hatimu televisi 
 hai ini coba 
 tapi jangan kau lupa indah kasih turun 
 ayo yang sekarang patah kau rasa 
 pasti kan ada tinggal 
 tak akan jiwaku pintaku ini , di milikku  kan ada pelangi 
 niat cemburu kan terhebat hati , jangan ayo lagi 
 tak akan jiwaku pintaku ini , di milikku  kan ada pelangi 
 niat cemburu kan terhebat hati , jangan ayo lagi 
 hujan kan tenggelam ,


In [31]:
def beam_search(seed, k=3, maxsample=150):
    start = seed.strip()
    pattern = [tokenizer.word_index[w] for w in start.lower().split(" ")]
    x = np.array([pattern])

    # shape (1, n, vocab_size)
    preds = model.predict(x)
    # shape (k)
    pred_ids = preds.argsort(axis=2)[0, -1, -k:][::-1]
    # shape (vocab_size)
    pred_scores = np.log(preds[0][-1])
    # shape (k, n+1)
    k_prev_words = [pattern + [id] for id in pred_ids]
    # shape (k, n+1)
    top_k_scores = [pred_scores[i] for i in pred_ids]

    completed_sequences = []

    for i in range(maxsample):
        # shape (k, t, vocab_size)
        preds = model.predict(k_prev_words)
        # shape (k, k)
        pred_ids = preds.argsort(axis=2)[:, -1, -k:][::-1]
        # shape (k, vocab_size)
        pred_scores = np.log(preds[:, -1])
        pred_scores = [sc[idx] for idx, sc in zip(pred_ids, pred_scores)]
        pred_scores = np.array(pred_scores)
        top_k_preds = (top_k_scores + pred_scores.T).T
        top_score = top_k_preds.flatten().argsort()[::-1][:k]

        prev_words = [s//k for s in top_score]
        next_words = [s%k for s in top_score]

        top_k_scores = top_k_preds.flatten()[top_score]


        k_candidate_words = [pred_ids[p][n] for p, n in zip(prev_words, next_words)]
        k_prev_words = [k_prev_words[p] + [pred_ids[p][n]] for p, n in zip(prev_words, next_words)]
        for j, token in enumerate(k_candidate_words):
            if token == tokenizer.word_index["<end>"]:
                completed_sequences.append({"seqs": k_prev_words[j], "score": top_k_scores[j]})
        if len(completed_sequences) == k:
            break
    completed_sequences = sorted(completed_sequences, key=lambda x: x['score'], reverse=True)
    print_sequence(completed_sequences[0]['seqs'])

In [32]:
beam_search("<start> aku disini")

 aku disini lagi , kau bersedih lagi 
 ku tak mengerti mengapa berlalu , 
 mengapa kau pergi pergi , bukan karena senyummu 
 ku tak rela bila kau pergi 
 aku tak rela , ku rela 
 kau pergi janganlah 
 aku tak rela bila kau jauh 
 mungkin tak ada cinta 
 aku tak rela bila kau pergi 
 aku ingin engkau kembali 
 ku tak rela kau tinggalkan 
 diriku yang dulu tak ada 
 karena kau bukan lagi 
 


In [33]:
beam_search("<start> disini aku masih sendiri")

 disini aku masih sendiri , ada di sini dalam kutahu 
 berbintang rasa dalam temukan , dalam hati 
 dalam sepi ku dada kamu 
 adakah ada di sini dalam tulis 
 dalam rindu ku ingin bersamamu , bahagia bersamamu 
 bahagia terucap , bahagia dalam hidupku 
 bersamamu aku sementara 
 bersamamu aku isi , ku bahagia ku bahagia 


In [34]:
beam_search("<start> kamu pergi")

 kamu pergi , ku pergi 
 pergi tinggalkan dirimu 
 ku ingin kau tahu 
 betapa ku mencintaimu , 
 puas tolonglah ini , tak ada kamu 
 kamu tak bisa , aku tak bisa , aku tak mampu 
 
 ku tak bisa , tak bisa , tak bisa 
 ku jaga , ku tak bisa , ku takkan bisa 
 untuk bisa dingin 


In [35]:
greedy_search("<start> aku dan kamu")

 aku dan kamu saling cinta 
 bukan hanya perlahan kata 
 yang tak pernah bisa menyakiti cinta 
 menyakiti semua rasa yang salah 
 saat ini tak juga kau menyakiti 
 namun rasa ini juga bisa 
 kau berikan semua yang pernah ku rasa 
 rasa cinta yang takkan pernah bisa 
 ku berikan semua yang ku rasa 
 aku yang kau sayang meski tak bisa kau rasa 
 hidup ku tahu kau bukan untukku 
 tapi rasa ini rasa sayang ini untukmu 
 aku tahu ini takkan bisa 
 menjadi padamu yang kau lepas 
 sesaat saja kan ku beri 
 semua yang ku rasa 
 berdiri sayang seperti 
 aku yang kan selalu tidak 
 hanya untuk dirimu 
 yang ku berikan hanya untukmu 
 aku untukmu , berdiri sayang 
 mimpi kau ingin menyakiti rasa 
 yang aku cinta seperti 
 meski kau tak pernah tahu 
 rasa ini kan


In [36]:
beam_search("<start> engkau ")

 engkau datang padaku 
 kala  gelap hatiku 
 engkau datang padaku 
 membawa luka yang kini ada 
 
 jangan biarkan rindu yang terluka 
 jangan datang lalu datang 
 
 aku takkan ada 
 perhatianku kasih kita coba 
