# A rudimentary build of the PHARMARK model

In [1]:
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from tensorflow import keras
import pandas as pd
import numpy as np
import random
import sys
import io
import os

In [2]:
df = pd.read_csv('brand_name.csv')

In [3]:
df_train = df[df['Country'] == 'INDIA']['Drug name']
df_train = df_train.drop_duplicates()

In [6]:
chars = ['\n', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ','_','-','.','/','&','1','2','3','4','5','6','7','8','9','0','+','(',')','>','<','%','`']

In [7]:
# Set indices, to construct a word 
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

print(char_indices)

{'\n': 0, "'": 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, 'i': 10, 'j': 11, 'k': 12, 'l': 13, 'm': 14, 'n': 15, 'o': 16, 'p': 17, 'q': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27, ' ': 28, '_': 29, '-': 30, '.': 31, '/': 32, '&': 33, '1': 34, '2': 35, '3': 36, '4': 37, '5': 38, '6': 39, '7': 40, '8': 41, '9': 42, '0': 43, '+': 44, '(': 45, ')': 46, '>': 47, '<': 48, '%': 49, '`': 50}


In [9]:
col_one_list = df_train.tolist()
col_one_list = [i.lower() for i in col_one_list]
col_one_list = [i.split(" ") for i in col_one_list]
col_one_list = [i[0] for i in col_one_list]
col_one_list = set(col_one_list)

In [11]:
# Split into lines and get rid of empty lines 
lines = col_one_list
lines = [line for line in lines if len(line)!=0]

number of lines: 4182


In [12]:
# Get length of longest word
maxlen = len(max(lines, key=len)) + 15
minlen = len(min(lines, key=len))

print("line with longest length: "+ str(maxlen))
print("line with shorter length: "+ str(minlen))

line with longest length: 33
line with shorter length: 1


In [13]:
steps = 1
sequences = []
next_chars = []

for line in lines:
    # pre-padding with zeros
    s = (maxlen - len(line))*'0' + line
    sequences.append(s)
    next_chars.append('\n')
    for it,j in enumerate(line):
        if (it >= len(line)-1):
            continue
        s = (maxlen - len(line[:-1-it]))*'0' + line[:-1-it]
        sequences.append(s)
        next_chars.append(line[-1-it])

In [14]:
# Vectorization
x = np.zeros((len(sequences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sequences), len(chars)), dtype=np.bool)
for i, seq in enumerate(sequences):
    for t, char in enumerate(seq):
        if char != '0':
            x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  x = np.zeros((len(sequences), maxlen, len(chars)), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = np.zeros((len(sequences), len(chars)), dtype=np.bool)


In [15]:
prefix = ""
max_names = 10

def sample(preds):
    """ function that sample an index from a probability array """
    preds = np.asarray(preds).astype('float64')
    preds = preds / np.sum(preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.random.choice(range(len(chars)), p = probas.ravel())

def print_name_generated(name):
    print(name, flush=True)
def print_list_generated(lst):
    print(lst, flush=True)
    
    
def generate_new_names(*args):
    print("----------Generating names----------")

    # Add pre-padding of zeros in the input.
    sequence = ('{0:0>' + str(maxlen) + '}').format(prefix).lower()

    # tmp variables
    tmp_generated = prefix
    list_outputs = list()

    while (len(list_outputs) < max_names):

        # Vectorize the input of the model.
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sequence):
            if char != '0':
                x_pred[0, t, char_indices[char]] = 1

        # Predict the probabilities of the next char.
        preds = model.predict(x_pred, verbose=0)[0]

        # Chose one based on the distribution obtained in the output of the model.
        next_index = sample(preds)
        # Get the corresponding char.
        next_char = indices_char[next_index]

        # If the char is a new line character or the name start to be bigger than the longest word, 
        # try to add it to the list and reset temp variables.
        if next_char == '\n' or len(tmp_generated) > maxlen:
            
            # If the name generated is not in the list, append it and print it.
            if tmp_generated not in list_outputs:
                list_outputs.append(tmp_generated)
                print_name_generated(tmp_generated)
            # Reset tmp variables
            sequence = ('{0:0>' + str(maxlen) + '}').format(prefix).lower()
            tmp_generated = prefix
        else:
    
            # Append the char to the sequence that we're generating.
            tmp_generated += next_char
            # Add pre-padding of zeros to the sequence generated and continue.
            sequence = ('{0:0>' + str(maxlen) + '}').format(tmp_generated).lower()
            
    # Show the intersection of the words generated and your dataset. . 
    print("Set of words already in the dataset:")
    print_list_generated(set(lines).intersection(list_outputs))
    
    # Show the rate of how many repeated words you've created.
    total_repited = len(set(lines).intersection(list_outputs))
    total = len(list_outputs)
    print("Rate of total invented words: " + "{:.2f}".format((total-total_repited)/total))
    print("-----------------End-----------------")
    
# Function invoked at the end of each epoch. Prints generated names.
callback = LambdaCallback(on_epoch_end=generate_new_names)

In [16]:
# model 
model = Sequential()
model.add(LSTM(64, input_shape=(maxlen, len(chars))))
model.add(Dropout(0.3))
model.add(Dense(len(chars), activation='softmax'))
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=opt)
history = model.fit(x, y, batch_size=128, epochs=10, verbose=2, callbacks=[callback])

Epoch 1/10
----------Generating names----------
hakove
-pipyan
d
ivacina
ugelas
onetor
ouma-p
oflaceet-mx
raldaaa
slacpoki-smn
Set of words already in the dataset:
{'d'}
Rate of total invented words: 0.90
-----------------End-----------------
251/251 - 15s - loss: 2.6439 - 15s/epoch - 58ms/step
Epoch 2/10
----------Generating names----------
reno-rdrex
rnuart
khigele
aciclas
otexi
ampitalupnus-as
aleden
acnijinimd
lomradere-d
limobes
Set of words already in the dataset:
set()
Rate of total invented words: 1.00
-----------------End-----------------
251/251 - 12s - loss: 2.3110 - 12s/epoch - 49ms/step
Epoch 3/10
----------Generating names----------
figlulda
ioxorur
icnoloee
ordhirot
restokin
amonocic
ib1enm-d
iclora
ravonin
osoan
Set of words already in the dataset:
set()
Rate of total invented words: 1.00
-----------------End-----------------
251/251 - 12s - loss: 2.2125 - 12s/epoch - 46ms/step
Epoch 4/10
----------Generating names----------
asika5
ycemp
iburin
uvarad
restor
anfestan
io

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 64)                29696     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 51)                3315      
                                                                 
Total params: 33,011
Trainable params: 33,011
Non-trainable params: 0
_________________________________________________________________


In [18]:
prefix = "ant"
max_names = 20

generate_new_names()

----------Generating names----------
antipas
antifol
antil-wase
antetri
antexine
antrool
antrag
antaine
antmor
antimasi
antopel
anthrin
antijod
anthoking
antidox
antives
antciflam-ssra
antigesic
antipm
antrol
Set of words already in the dataset:
set()
Rate of total invented words: 1.00
-----------------End-----------------
