In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the national baby-names table from a path relative to the notebook
# (presumably the Kaggle "usa-names" input dataset — adjust when running elsewhere).
data = pd.read_csv("../input/usa-names/NationalNames.csv")   

In [None]:
# Summary of the raw table as reported by pandas (columns, dtypes, non-null counts).
data.info()

In [None]:
# Overview of the raw table, one item per line:
# (rows, columns), number of distinct names, first rows.
print(data.shape, data['Name'].nunique(), data.head(), sep='\n')

# Preprocessing Data

In [None]:
# Label-encode the Gender column as integer category codes. Categories are
# sorted, so with 'F'/'M' values this presumably gives F -> 0, M -> 1 (confirm on the data).
gender_categories = data['Gender'].astype('category')
data['Gender'] = gender_categories.cat.codes

In [None]:
# Check that Gender now holds integer codes.
data.head()

In [None]:
# New dataframe with one row per unique name: (Name, mean of the encoded Gender
# over that name's rows). 'Gender' is selected BEFORE aggregating so only that
# column is averaged: this skips the means of unrelated columns and cannot fail
# on a non-numeric column (pandas >= 2.0 no longer drops those silently).
df = data.groupby('Name')['Gender'].mean().reset_index()

In [None]:
# Shape and first rows of the per-name frame, one item per line.
print(df.shape, df.head(), sep='\n')

In [None]:
# Collapse the per-name mean of the gender codes into a hard 0/1 integer label.
# A plain astype('int') truncates toward zero, so a name whose rows are 99%
# code 1 would still be labelled 0. Thresholding at 0.5 assigns the code carried
# by most of that name's rows (an exact tie goes to 1). Re-running the cell is
# harmless because 0/1 labels map to themselves.
df['Gender'] = (df['Gender'] >= 0.5).astype('int')
df.head()

In [None]:
import string

# The 26 lowercase ASCII letters in alphabetical order; a letter's position
# in this list determines its numeric code.
letters = [*string.ascii_lowercase]
letters

In [None]:
# Encoding table: letter -> integer code ('a' -> 1, ..., 'z' -> 26).
# Code 0 is deliberately never assigned to a letter.
vocab = {letter: code for code, letter in enumerate(letters, start=1)}
vocab

In [None]:
# Decoding table (inverse of the encoding): integer code -> letter (1 -> 'a', ..., 26 -> 'z').
r_vocab = dict(enumerate(letters, start=1))
r_vocab

In [None]:
# Convert every name in the 'Name' column to a list of letter codes and store
# the result back in the dataframe.
def word_to_number(frame=None, mapping=None):
  """Replace each name in the 'Name' column with its list of letter codes.

  The column is rebuilt in a single assignment. Writing element by element
  through chained indexing (df['Name'][i] = seq) triggers
  SettingWithCopyWarning and is not guaranteed to modify the frame at all
  (it has no effect under pandas copy-on-write).

  Args:
    frame: DataFrame with a 'Name' column, modified in place. Defaults to
      the notebook-level df.
    mapping: dict from lowercase letter to integer code. Defaults to the
      notebook-level vocab.

  Returns:
    None.

  Raises:
    KeyError: if a name contains a character that has no entry in mapping.
  """
  frame = df if frame is None else frame
  mapping = vocab if mapping is None else mapping

  def encode(name):
    # Entries that are not strings are assumed to be already encoded and are
    # passed through, so re-running the cell does not fail.
    if not isinstance(name, str):
      return name
    return [mapping[ch] for ch in name.lower()]

  frame['Name'] = frame['Name'].map(encode)

In [None]:
# Encode the names in place: each name is lowercased letter by letter and
# replaced with the list of its letter codes.
word_to_number()

In [None]:
# Inspect the frame after encoding: 'Name' should now hold lists of integers.
df.head()

# Padding

In [None]:
# Encoded names (features) and gender labels (targets) as arrays. The name
# lengths are histogrammed next to choose a sensible padding length.
X, Y = df['Name'].values, df['Gender'].values

In [None]:
name_length = list(map(len, X))   # number of letters in every encoded name

In [None]:
# Name lengths are integers, so use one unit-wide bin per length, centred on the
# integer. 20 equal-width bins over an integer range can leave empty or uneven bars.
length_bins = list(range(min(name_length), max(name_length) + 2))
fig, ax = plt.subplots()
ax.hist(name_length, bins=length_bins, align='left')
# X axis: length of the name; Y axis: number of names with that length.
ax.set_xlabel('Name length (letters)')
ax.set_ylabel('Number of names')
ax.set_title('Distribution of name lengths')
plt.show()

In [None]:
# Use 10 "boxes" per name: every encoded name is converted to a row of exactly
# 10 codes, with the padding placed in front ('pre') of shorter names.
from keras.preprocessing.sequence import pad_sequences
encoded_names = df['Name'].values
x = pad_sequences(encoded_names, maxlen = 10, padding='pre')
print(x.shape, x, sep='\n')

# LSTM Model
## Each letter is fed into one box (time step) of the LSTM, which learns to recognize patterns in the letter sequence

In [None]:
from keras.layers import Input, Embedding, Dense, LSTM
from keras.models import Model

In [None]:
# Number of distinct integer inputs the embedding must handle: the letter codes
# in vocab plus index 0, which vocab never assigns — presumably the padding value
# inserted by pad_sequences (confirm against the printed padded array).
vocab_size = len(vocab)+1
vocab_size

In [None]:
# input layer: one integer letter code per position, 10 positions per name
# (the same length the names were padded to with maxlen = 10)
inp = Input(shape=(10,))

# embedding layer: maps each of the vocab_size integer codes to a 5-dimensional vector
emn = Embedding(input_dim = vocab_size, output_dim = 5)(inp)

# lstm layers: the first returns its full output sequence (return_sequences = True)
# so that the second LSTM can consume it; the second returns only its final output
lstm1 = LSTM(units = 32, return_sequences = True)(emn)
lstm2 = LSTM(units = 64)(lstm1)

# output layer: a single sigmoid unit, i.e. one value between 0 and 1 per name
out = Dense(units=1, activation = 'sigmoid')(lstm2)

# functional-API model mapping the padded letter codes to that single output
my_model = Model(inputs=inp, outputs=out)

In [None]:
# Print the layer-by-layer overview of the model.
my_model.summary()

# Compile & Train Model

In [None]:
# Binary cross-entropy pairs with the single sigmoid output. The accuracy metric
# is registered under the name 'acc', which is the key used later when plotting
# the training history ('acc' / 'val_acc').
my_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
# Keras takes the validation_split fraction from the END of the arrays, before
# any shuffling. The rows are ordered by name (groupby sorts its keys), so without
# an explicit shuffle the validation set would hold only names from the end of
# the alphabet. Shuffle features and labels with one fixed-seed permutation first.
shuffle_order = np.random.default_rng(42).permutation(len(x))
his = my_model.fit(x[shuffle_order], Y[shuffle_order], epochs=10, batch_size=256, validation_split=0.2)

# Visualize Result

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names, so prefer the new name and fall back to the
# old one only on Matplotlib versions that do not know it.
darkgrid_style = 'seaborn-v0_8-darkgrid'
if darkgrid_style not in plt.style.available:
  darkgrid_style = 'seaborn-darkgrid'
plt.style.use(darkgrid_style)

In [None]:
# Training accuracy and validation accuracy per epoch
fig, ax=plt.subplots(nrows=1,ncols=1,figsize=(10,5))
ax.plot(his.history['acc'],label='Accuracy')
ax.plot(his.history['val_acc'],label='Validation Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy per epoch')
ax.legend()
# plt.show() rather than fig.show(): the latter emits a "non-GUI backend"
# warning on the inline notebook backend.
plt.show()

In [None]:
# Training loss and validation loss per epoch
fig, ax=plt.subplots(nrows=1,ncols=1,figsize=(10,5))
ax.plot(his.history['loss'],label='Loss')
ax.plot(his.history['val_loss'],label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Loss per epoch')
ax.legend()
# plt.show() rather than fig.show(): the latter emits a "non-GUI backend"
# warning on the inline notebook backend.
plt.show()

# Predict for Random Name

In [None]:
def predict_name(name):
  """Print the gender the trained model predicts for a single name.

  The name is lowercased, encoded with vocab, padded in front to the model's
  input length of 10 (the same settings used for the training data) and
  scored by my_model.

  Characters that have no entry in vocab (spaces, hyphens, apostrophes,
  accented letters, digits, ...) are skipped, so inputs such as "Anne-Marie"
  are scored on their letters instead of raising KeyError.

  Args:
    name: the name to classify, as a string.

  Returns:
    None. The verdict is printed: a model output below 0.5 is reported as
    female, anything else as male.
  """
  test_name=name.lower()
  seq=[vocab[ch] for ch in test_name if ch in vocab]
  x_test=pad_sequences([seq],maxlen=10,padding='pre')
  # predict returns one row per input with a single sigmoid value; pull out
  # that scalar instead of relying on the truth value of a 1x1 array.
  score=float(my_model.predict(x_test)[0][0])
  if score < 0.5:
    print("Name is female...")
  else:
    print("Name is male...")

In [None]:
# Sample prediction (the spelling 'Maxwel' is kept exactly as given).
predict_name('Maxwel')

In [None]:
# Sample prediction for another name.
predict_name('John')

In [None]:
# Sample prediction for a short, four-letter name.
predict_name('Dani')