## Imports

In [None]:
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding , Conv1D , MaxPool1D , LSTM , Dense , Dropout, SimpleRNN, GlobalAvgPool1D
from tensorflow.keras.optimizers import Adam

# Record the TensorFlow version this notebook was executed with.
print(f'TF Version : {tf.__version__}')

TF Version : 2.13.0


## Data Preparation

### Read Data

In [None]:
# Text file holding the training names; it is read whole and split into lines below.
# Later cells also use `file_path[:-4]` as the label shown in printed messages.
file_path = 'anime.txt'

In [None]:
# Read the whole names file into memory. The encoding is given explicitly so
# that decoding does not depend on the platform's default locale.
with open(file_path, 'r', encoding='utf-8') as f:
  data = f.read()

In [None]:
# One name per line: trim whitespace, lower-case, and drop blank lines so that
# empty strings are never counted (or later sampled) as names.
lines = data.splitlines()
names = [l.strip().lower() for l in lines if l.strip()]
np.random.shuffle(names)
# Show a short sample instead of dumping the entire list into the cell output.
print(names[:20])
print(f'names found : {len(names)}')

['kumori', 'yukari', 'hajime', 'susano', 'midora', 'chiharu', 'masashi', 'yoshino', 'saika', 'sora', 'haruko', 'toru', 'rinn', 'hideo', 'tadashi', 'kairi', 'tsuki', 'taro', 'tatsuko', 'mariko', 'tadao', 'seiji', 'mai', 'nanami', 'amaya', 'nagisa', 'hoshiko', 'sonoko', 'michio', 'jiro', 'kimi', 'toshiro', 'umi', 'mukuro', 'chiyoko', 'tsukiko', 'kumiko', 'naomi', 'adae', 'kin', 'kurami', 'aoki', 'nadia', 'masahiro', 'hana', 'juro', 'kanae', 'mina', 'ayumi', 'hiroki', 'shinobu', 'mako', 'sesshomaru', 'ayame', 'ume', 'mikoto', 'masaaki', 'miyu', 'shun', 'leiko', 'miko', 'sasuke', 'hanako', 'shiori', 'luffy', 'masayuki', 'kiko', 'haruo', 'masumi', 'etsuko', 'miki', 'himeko', 'akira', 'kunio', 'ryuko', 'satashi', 'mirai', 'yomuro', 'chie', 'inazuma', 'noah', 'chieko', 'yana', 'hiroaki', 'katsuyuka', 'hiroto', 'mayu', 'renji', 'madara', 'kohana', 'yucie', 'kenpachi', 'hiroko', 'hinaki', 'sachi', 'seina', 'yoko', 'masahiko', 'akatsuki', 'masaki', 'kanna', 'marisa', 'yamoto', 'niko', 'rena', 'a

### Tokenization

In [None]:
# Build the character vocabulary.
# `data_` is ONE string (all names joined by '\t\n') and it is handed to
# `fit_on_texts` directly rather than wrapped in a list, so it is consumed one
# character at a time; the `word_index` printed in the next cell accordingly
# has single characters as keys.
# The custom `filters` string leaves out '\t' and '\n', and `split` is '\t'.
# NOTE(review): the intent appears to be that '\n' survives as a token (used
# later as the end-of-name marker) while '\t' is swallowed as the separator —
# confirm against the Keras Tokenizer documentation.
data_ = '\t\n'.join(names)
tokenizer = Tokenizer(
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
    split='\t',
)
tokenizer.fit_on_texts(data_)

In [None]:
# Lookup tables in both directions between characters and their integer ids.
char_to_index = tokenizer.word_index
index_to_char = {idx: ch for ch, idx in char_to_index.items()}
print(char_to_index)

# Vocabulary size plus one: ids in `word_index` start at 1, and 0 is the value
# used for padding, so the output layer needs one extra slot.
num_chars = len(char_to_index) + 1
print(f'Number of characters to be predicted: {num_chars}')

{'\n': 1, 'a': 2, 'i': 3, 'o': 4, 'k': 5, 'u': 6, 'm': 7, 's': 8, 'h': 9, 'r': 10, 'n': 11, 't': 12, 'y': 13, 'e': 14, 'c': 15, 'z': 16, 'j': 17, 'd': 18, 'g': 19, 'b': 20, 'l': 21, 'f': 22, 'w': 23, 'p': 24}
Number of characters to be predicted: 25


In [None]:
# Terminate every name with '\n', the end-of-name token.
# Any terminator already present is stripped first, which makes this cell
# idempotent: re-running it no longer appends a second '\n' to each name.
names = [n.rstrip('\n') + '\n' for n in names]
names[:5]

['kumori\n', 'yukari\n', 'hajime\n', 'susano\n', 'midora\n']

In [None]:
# Demo: pick a random name and show how the tokenizer encodes it.
name = np.random.choice(names)
# The name already carries its own '\n' terminator, hence end=''.
print(name,end='')
# A bare string is passed, so every character comes back as its own
# one-element list (see the printed output).
seq = tokenizer.texts_to_sequences(name)
print(seq)

yucie
[[13], [6], [15], [3], [14], [1]]


In [None]:
def names_to_seq(names):
  """Encode one name as a flat list of character ids.

  Args:
    names: a single name given as a string (despite the plural parameter
      name); it is processed character by character.

  Returns:
    A list of ints, one per character the tokenizer has an id for.
    Characters without an id are skipped instead of raising IndexError.
  """
  # Passing a str makes the tokenizer yield one list per character, holding
  # either that character's id or nothing. Flattening drops the empty lists
  # and needs a single tokenizer call rather than one call per character.
  per_char_ids = tokenizer.texts_to_sequences(names)
  return [idx for char_ids in per_char_ids for idx in char_ids]

In [None]:
# Sanity check: encode the first name (the last id is the '\n' terminator).
names_to_seq(names[0])

[5, 6, 7, 4, 10, 3, 1]

In [None]:
def seq_to_name(seq):
  """Decode a sequence of character ids back into a string.

  Ids equal to 0 (padding) are ignored; every other id is looked up in
  `index_to_char`.
  """
  decoded = (index_to_char[idx] for idx in seq if idx != 0)
  return ''.join(decoded)

In [None]:
# Sanity check: decoding the ids shown above should give the name back, '\n' included.
seq_to_name([5, 6, 7, 4, 10, 3, 1])

'kumori\n'

### Create Data

In [None]:
# Turn each encoded name into all of its prefixes of length >= 2; the last id
# of every prefix later becomes the prediction target.
sequences = []
for name in names:
  seq = names_to_seq(name)
  # range(2, len(seq) + 1) is empty for sequences shorter than 2, so such
  # names contribute nothing.
  sequences.extend(seq[:end] for end in range(2, len(seq) + 1))

In [None]:
# Preview the first few prefixes.
sequences[:6]

[[5, 6],
 [5, 6, 7],
 [5, 6, 7, 4],
 [5, 6, 7, 4, 10],
 [5, 6, 7, 4, 10, 3],
 [5, 6, 7, 4, 10, 3, 1]]

### Padding

In [None]:
# Length of the longest encoded prefix; this is the width every sequence is
# padded to in the next cell.
max_len = max(map(len, sequences))
print(f'length of longest {file_path[:-4]} name : ',max_len)

length of longest anime name :  11


In [None]:
# Left-pad every prefix with zeros to `max_len`, so the final column always
# holds the character to be predicted.
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

print(padded_sequences[0])
padded_sequences.shape

[0 0 0 0 0 0 0 0 0 5 6]


(2563, 11)

In [None]:
# Inputs are all columns but the last; the target is the last column.
X = padded_sequences[:, :-1]
Y = padded_sequences[:, -1]
print(X.shape,Y.shape)

(2563, 10) (2563,)


In [None]:
# Inspect the first (input, target) pair.
X[0],Y[0]

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0, 16], dtype=int32),
 8)

### Train Test Split

In [None]:
# Hold out part of the data for evaluation. A fixed `random_state` makes the
# split identical on every run, so results can be compared across runs.
x_train , x_test , y_train , y_test = train_test_split(X , Y , random_state = 42)

print(f'Train Data : {x_train.shape} , {y_train.shape}')
print(f'Test Data : {x_test.shape} , {y_test.shape}')

Train Data : (6689, 17) , (6689,)
Test Data : (2230, 17) , (2230,)


## Building Model

In [None]:
# Next-character classifier: embedding -> causal 1D convolution -> pooling ->
# dense head with a softmax over all `num_chars` ids.
model = Sequential([
    # Inputs are the padded prefixes, i.e. `max_len - 1` character ids each.
    Embedding(num_chars , 16 , input_length = max_len - 1),
    Conv1D(64 , 5 , strides = 1 , padding = 'causal' , activation = 'tanh'),
    MaxPool1D(2),

    GlobalAvgPool1D(),

    # The hidden layer needs a non-linearity: without one, this Dense layer
    # and the output Dense layer collapse into a single linear map.
    Dense(128 , activation = 'relu'),
    Dropout(0.1),
    Dense(num_chars , activation = 'softmax')
    ]
)
optimizer = Adam(learning_rate=1e-3)
# Targets are integer ids (not one-hot), hence the sparse loss.
model.compile(
    loss = 'sparse_categorical_crossentropy',
    optimizer = optimizer,
    metrics = ['accuracy']
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 10, 16)            400       
                                                                 
 conv1d_2 (Conv1D)           (None, 10, 64)            5184      
                                                                 
 max_pooling1d_2 (MaxPoolin  (None, 5, 64)             0         
 g1D)                                                            
                                                                 
 global_average_pooling1d_2  (None, 64)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                                 
 dropout_2 (Dropout)         (None, 128)              

## Callback for Anime name generation

In [None]:
def generate_name():
  """Sample one name from the trained model, one character at a time.

  Starts from a random vocabulary character (never the newline terminator)
  and repeatedly feeds the growing prefix to the global `model`, sampling the
  next character id from the predicted distribution. At most `max_len`
  characters are appended to the starting character.

  Returns:
    The generated name without the newline terminator, or '' if the model
    produced NaN probabilities.
  """
  # Any character except the terminator may start a name. Filtering (rather
  # than list.remove) also works if the terminator is absent from the vocabulary.
  start_chars = [c for c in char_to_index if c != '\n']
  seed = np.random.choice(start_chars)
  for _ in range(max_len):
    # Encode the prefix and left-pad/left-truncate it to the model's input width.
    seq = pad_sequences(
        [names_to_seq(seed)],
        truncating = 'pre',
        padding = 'pre',
        maxlen = max_len - 1
    )
    # Call the model directly instead of model.predict(): Keras documents
    # predict() as intended for large batches, not for loops over single
    # small inputs. training=False keeps Dropout disabled.
    y = model(seq , training = False).numpy().ravel()
    if np.isnan(y).any():
      return ''

    # Sample (rather than argmax) so repeated calls give different names.
    index = np.random.choice(num_chars , p = y)
    if index == 0:
      # id 0 is padding, not a character: stop here.
      return seed
    char_pred = index_to_char[index]
    if char_pred == '\n':
      # Terminator sampled: the name is complete (terminator not included).
      return seed
    seed += char_pred
  return seed


class GenerateNameCallback(tf.keras.callbacks.Callback):
  """Keras callback that prints a handful of sampled names during training."""

  def on_epoch_end(self, epoch, logs=None):
    """Print up to seven generated names whenever `epoch` is a multiple of 10."""
    if epoch % 10 != 0:
      return

    print('\n')
    print(f'[#] Generating some {file_path[:-4][0].upper() + file_path[:-4][1:]}:')
    for _ in range(7):
      anime = generate_name()
      if anime == '':
        # generate_name signals a failed sample with an empty string.
        continue
      print(anime[0].upper() + anime[1:])
    print('\n')

### Train Model

In [None]:
# Number of passes over the training data.
epochs = 128
# Fit on the training split only and report metrics on the held-out split.
# Previously the model was fitted on all of X, Y, which left the split unused
# and gave no validation signal.
history = model.fit(
    x_train , y_train,
    validation_data = (x_test , y_test),
    epochs = epochs,
    callbacks = [GenerateNameCallback()]
)

Epoch 1/128

[#] Generating some Anime:
Fami
Nami
Giro
Eshiro
Cahiko
Omi
Hiro


Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128

[#] Generating some Anime:
Chiro
Riko
Nami
Nami
Biro
Kana
Jiro


Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128

[#] Generating some Anime:
Gan
Bun
Ami
Lana
Jiro
Ami
Dayaki


Epoch 22/128
Epoch 23/128
Epoch 24/128
Epoch 25/128
Epoch 26/128
Epoch 27/128
Epoch 28/128
Epoch 29/128
Epoch 30/128
Epoch 31/128

[#] Generating some Anime:
Ensu
Buno
Nariko
Umiri
Rin
Sanura
Rin


Epoch 32/128
Epoch 33/128
Epoch 34/128
Epoch 35/128
Epoch 36/128
Epoch 37/128
Epoch 38/128
Epoch 39/128
Epoch 40/128
Epoch 41/128

[#] Generating some Anime:
Rina
Bina
Kanami
Isaka
Bina
Payumi
Daesaka


Epoch 42/128
Epoch 43/128
Epoch 44/128
Epoch 45/128
Epoch 46/128
Epoch 47/128
Epoch 48/128
Epoch 49/128
Epoch 50/128
Epoch 51/128

[#] Gen

## Generate Cool Anime Names

In [None]:
def generate(n = 3):
  """Print a header followed by up to `n` freshly generated names.

  Each name is printed with its first letter upper-cased. Samples for which
  `generate_name` returns an empty string are skipped, so fewer than `n`
  names may appear.
  """
  print(f'[#] Generating {n} names of {file_path[:-4][0].upper() + file_path[:-4][1:]}:')
  for _ in range(n):
    candidate = generate_name()
    if candidate == '':
      continue
    print(candidate[0].upper() + candidate[1:])

In [None]:
# Number of names to be generated
n = 5 # @param {type:"integer"}
# Print the header and up to `n` sampled names (empty samples are skipped by `generate`).
generate(n)

[#] Generating 5 names of Anime:
Haku
Osayu
Ume
Imade
Wakana
Goek
Rei
