In [1]:
import pandas
import string
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import keras


Using TensorFlow backend.


In [2]:
# Load the first two columns of the header-less CSV as Name / Gender.
df = pandas.read_csv(
    './data/name_gender_data.csv',
    header=None,
    usecols=[0, 1],
    names=["Name", "Gender"],
)

# Encode the gender as a binary label: M -> 0, F -> 1.
# The column is built in a single assignment rather than with
# `df.Labels.replace(..., inplace=True)`: an in-place replace on a column
# reached through attribute access is a chained assignment, which pandas does
# not guarantee will write back into `df`.
# Any Gender value other than 'M' or 'F' becomes NaN in Labels.
df["Labels"] = df["Gender"].map({"M": 0, "F": 1})

In [3]:
# Convert the names into vectors. One-hot encoding is a poor choice here: the model would be costly to retrain and there
# are no repetitions within our dataset. Instead, each name becomes a vector whose element at each index is the
# position of that letter within the alphabet (e.g. a -> 1, b -> 2, c -> 3). Every vector must have the same length,
# so the vectors are later padded with zeros to a fixed length (20 in this notebook).

# Letter -> 1-based alphabet position (a -> 1, ..., z -> 26). Built once
# instead of on every call; 0 is never produced, so it can serve as padding.
_LETTER_TO_INDEX = {
    letter: position
    for position, letter in enumerate(string.ascii_lowercase, 1)
}


def string_to_vec(name):
    """Encode a name as an array of 1-based alphabet positions.

    Each character is lower-cased and looked up in the a-z alphabet, so
    ``"Abc"`` becomes ``[1, 2, 3]``. The result has one element per character
    of ``name``; no padding or truncation is applied here.

    Args:
        name: The string to encode.

    Returns:
        A 1-D integer ``numpy.ndarray``. An empty name yields an empty array
        with the same integer dtype as any other result (without the explicit
        dtype NumPy would fall back to float64 for an empty list).

    Raises:
        KeyError: If a character, once lower-cased, is not one of a-z
            (for example a space, hyphen, digit or accented letter).
    """
    return np.array(
        [_LETTER_TO_INDEX[letter.lower()] for letter in name],
        dtype=int,
    )

In [4]:
# Convert the names into vectors.
# Series.map applies string_to_vec to every row in one pass and does not
# depend on the frame having a default 0..n-1 index, unlike a positional
# `df.at[i, "Name"]` loop.
# NOTE: this overwrites the Name column, so the cell cannot be re-run without
# reloading df first.
df["Name"] = df["Name"].map(string_to_vec)


In [5]:
# Randomly split the dataset into a training set and a testing set 80:20.
# A fixed integer seed makes the split reproducible across kernel restarts;
# an unseeded np.random.RandomState() draws a different split on every run,
# so reported accuracies could not be reproduced.
SPLIT_SEED = 42

train = df.sample(frac=0.8, random_state=SPLIT_SEED)
# Must run before train's index is reset: the sampled index labels identify
# which rows to exclude from the test set.
test = df.drop(train.index)

train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [6]:
# Pull the feature column (Name) and the target column (Labels) out of each
# split as NumPy arrays.
train_data, train_labels = train["Name"].values, train["Labels"].values
test_data, test_labels = test["Name"].values, test["Labels"].values

# Sanity check: both counts should be identical.
print(
    "Training entries: {}, labels: {}".format(
        len(train_data),
        len(train_labels),
    )
)

Training entries: 76020, labels: 76020


In [7]:

# Pad every name vector with trailing zeros to one fixed length so the
# network receives a rectangular input. Both splits share a single set of
# options, so the two calls cannot drift apart.
# NOTE: `truncating` is left at its default ('pre'), so a name longer than
# PADDED_LENGTH loses its leading letters rather than its trailing ones.
PADDED_LENGTH = 20
PAD_OPTIONS = dict(value=0, padding='post', maxlen=PADDED_LENGTH)

train_data = keras.preprocessing.sequence.pad_sequences(train_data, **PAD_OPTIONS)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, **PAD_OPTIONS)


In [8]:
# Fully connected binary classifier: a 20-element input vector feeds two ReLU
# hidden layers (200 and 1000 units) and a single sigmoid output unit.
model = keras.Sequential([
    keras.layers.Dense(200, activation="relu", input_shape=(20,)),
    keras.layers.Dense(1000, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid"),
])

model.summary()
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 200)               4200      
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              201000    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 1001      
Total params: 206,201
Trainable params: 206,201
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Train for 20 epochs with batches of 5000 samples. The test split is passed
# as validation data, so it is evaluated alongside training.
history = model.fit(
    x=train_data,
    y=train_labels,
    batch_size=5000,
    epochs=20,
    verbose=1,
    validation_data=(test_data, test_labels),
)

Train on 76020 samples, validate on 19005 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [709]:
# Write the model to an .h5 file in the current working directory.
# NOTE(review): this cell's recorded execution count is out of sequence with
# the cells around it; confirm the notebook still works under
# Restart Kernel -> Run All.
model.save("./gender_from_name_classifier.h5")

In [10]:
# Encode a sample name with the same pad_sequences settings used for the
# training data, so inference sees exactly the preprocessing the model was
# trained on. The previous in-place `ndarray.resize` kept the first 20 letters
# of a longer name, while pad_sequences (default truncating='pre') keeps the
# last 20.
# NOTE(review): `input` shadows the built-in input() for the rest of the
# session; the name is kept only because later cells may read it.
input = 'william'
vectored = keras.preprocessing.sequence.pad_sequences([string_to_vec(input)],
                                                      value=0,
                                                      padding='post',
                                                      maxlen=20)[0]
vectored

array([23,  9, 12, 12,  9,  1, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0])

In [11]:

# Predict the class of the encoded sample: 0 corresponds to 'M' and 1 to 'F'.
# `Sequential.predict_classes` is not available in newer Keras/TensorFlow
# releases; for a single sigmoid output it is equivalent to thresholding the
# predicted probability at 0.5, which works on old and new versions alike.
probabilities = model.predict(np.array([vectored]))
prediction = (probabilities > 0.5).astype("int32")
print(prediction[0][0])


0


'heellsdfdsssssssssss'