In [1]:
# Common stuff

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
from pylab import rcParams
rcParams['figure.figsize'] = (9, 6)
import seaborn as sns

In [2]:
# Load data: each file is read as a single "name" column; males are labelled
# is_male=1 and females is_male=0.

df_male = pd.read_csv('male.txt', sep=",", header=None)
df_male.columns = ["name"]
df_male['is_male'] = 1

df_female = pd.read_csv('female.txt', sep=",", header=None)
df_female.columns = ["name"]
df_female['is_male'] = 0

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement and,
# like append, keeps each frame's own index (so index labels repeat in df).
df = pd.concat([df_male, df_female])
# Lower-case only the combined frame; df_male / df_female keep the raw names.
df["name"] = df["name"].str.lower()

# info() prints its report and returns None, which display() then echoes.
display(df_male.info())
display(df_female.info())
display(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2943 entries, 0 to 2942
Data columns (total 2 columns):
name       2943 non-null object
is_male    2943 non-null int64
dtypes: int64(1), object(1)
memory usage: 69.0+ KB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5001 entries, 0 to 5000
Data columns (total 2 columns):
name       5001 non-null object
is_male    5001 non-null int64
dtypes: int64(1), object(1)
memory usage: 117.2+ KB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7944 entries, 0 to 5000
Data columns (total 2 columns):
name       7944 non-null object
is_male    7944 non-null int64
dtypes: int64(1), object(1)
memory usage: 186.2+ KB


None

In [3]:
# Remove every name that occurs more than once. keep=False discards all
# copies of a repeated name instead of retaining one of them.

df = df.drop_duplicates(subset=["name"], keep=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7208 entries, 0 to 5000
Data columns (total 2 columns):
name       7208 non-null object
is_male    7208 non-null int64
dtypes: int64(1), object(1)
memory usage: 168.9+ KB


In [4]:
# Make train/test split (80/20), stratified by is_male so both parts keep the
# same class ratio; random_state pins the shuffle.

from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123, stratify=df.is_male)
# Rebind to freshly indexed frames instead of mutating the split results in
# place; later cells look rows up by position 0..n-1.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
# pd.value_counts is deprecated; the Series method returns the same counts.
display(train_df["is_male"].value_counts())
display(test_df["is_male"].value_counts())

0    3704
1    2062
Name: is_male, dtype: int64

0    926
1    516
Name: is_male, dtype: int64

In [5]:
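# Let's use Naive Bayes to classify. We'll use CountVectorizer in char_wb mode, which builds
# character n-grams inside word boundaries (n-grams at the edges of a name are padded with a space);
# otherwise it's the same as using ngrams from nltk.util on the characters of each name.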

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *
from sklearn.feature_extraction.text import *
from sklearn.model_selection import GridSearchCV

# For each n-gram length i, tune a char-n-gram Naive Bayes pipeline with
# 5-fold cross-validation on the training set and report test-set scores.
for i in range(1, 9):
    # Character n-grams -> (optionally idf-weighted) tf-idf -> Multinomial NB.
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer='char_wb')),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ])

    parameters = {
        # A single (i, i) range: only n-grams of exactly length i are used.
        'vect__ngram_range': [(i, i)],
        'clf__alpha': (0.001, 0.01, 0.1, 1, 2),
        'tfidf__use_idf': (True, False)
    }

    # Select hyper-parameters with the same metric that is reported below
    # (macro-averaged F1); plain 'f1' would score only the positive class.
    clf = GridSearchCV(pipeline, parameters, scoring='f1_macro', cv=5, n_jobs=-1)

    clf.fit(train_df.name, train_df.is_male)

    # best_estimator_ is the winning configuration refit on the full training set.
    predictions = clf.best_estimator_.predict(test_df.name)

    print("\nF1-measure(%d): %.2f" % (i, f1_score(test_df.is_male, predictions, average='macro')))
    print("Accuracy(%d): %.2f" % (i, accuracy_score(test_df.is_male, predictions)))
    
# As we can see, 3- and 4-grams give the best results. That's because when the n-gram length is too short
# there's not much information about the name for the classifier to capture, i.e. it underfits.
# When the n-gram is too long, it captures too much information, i.e. it overfits.


F1-measure(1): 0.50
Accuracy(1): 0.67

F1-measure(2): 0.82
Accuracy(2): 0.84

F1-measure(3): 0.86
Accuracy(3): 0.87

F1-measure(4): 0.86
Accuracy(4): 0.87

F1-measure(5): 0.78
Accuracy(5): 0.81

F1-measure(6): 0.65
Accuracy(6): 0.74

F1-measure(7): 0.53
Accuracy(7): 0.68

F1-measure(8): 0.44
Accuracy(8): 0.66


In [6]:
# Let's use neural network to solve this

display(train_df.info())

# Vocabulary: every distinct character seen in the training names, sorted,
# with each character mapped to its column in the one-hot encoding.
chars = sorted(set("".join(train_df.name)))
char_indices = {c: i for i, c in enumerate(chars)}

# All english letters + 3 punct chars
print("Num chars", len(chars))

# Let's cheat a bit and find out maxlen for all dataset, not just train, it'll just
# make our code easier, we could have just made it maxlen_train * 1.5 or something
# like that
maxlen = df.name.str.len().max()
print("Max len", maxlen)

# x[i, t, c] is True when character t of name i is chars[c]; positions past the
# end of a name stay all-False. y[i] holds the is_male label.
# np.bool was removed in NumPy 1.24; the builtin bool is the dtype it aliased.
x = np.zeros((len(train_df), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(train_df), 1), dtype=bool)
# Iterate positionally so the encoding does not depend on train_df's index labels.
for i, (name, label) in enumerate(zip(train_df.name, train_df.is_male)):
    for t, char in enumerate(name):
        x[i, t, char_indices[char]] = 1
    y[i] = label
print(x.shape)
print(x[0][0])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5766 entries, 0 to 5765
Data columns (total 2 columns):
name       5766 non-null object
is_male    5766 non-null int64
dtypes: int64(1), object(1)
memory usage: 135.1+ KB


None

Num chars 29
Max len 15
(5766, 15, 29)
[False False False  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False]


In [7]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Bidirectional
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.utils import to_categorical

def _one_hot_names(names):
    """One-hot encode names into a (len(names), maxlen, len(chars)) bool array.

    Uses the notebook-level `maxlen`, `chars` and `char_indices`. Characters
    that are not in `char_indices` are left as an all-False row instead of
    raising KeyError.
    """
    encoded = np.zeros((len(names), maxlen, len(chars)), dtype=bool)
    for row, name in enumerate(names):
        for pos, char in enumerate(name):
            col = char_indices.get(char)
            if col is not None:
                encoded[row, pos, col] = 1
    return encoded


def gen_val_rnn(num_neurons, dropout_val, is_bidir, num_epochs):
    """Train an LSTM gender classifier and score it on the test set.

    Trains on the notebook-level arrays `x` / `y` and evaluates on `test_df`.

    Args:
        num_neurons: number of units in each LSTM layer.
        dropout_val: dropout rate applied before the output layer.
        is_bidir: if true, use two stacked bidirectional LSTM layers;
            otherwise a single unidirectional LSTM layer.
        num_epochs: number of training epochs.

    Returns:
        Tuple (description string, macro-averaged F1, accuracy), both scores
        computed on `test_df`.
    """
    descr = "RNN(N=%d,D=%.2f,Bi=%d,Ep=%d)" % (num_neurons, dropout_val, is_bidir, num_epochs)
    print("\nCalculating", descr)
    model = Sequential()
    if is_bidir:
        # First layer returns the full sequence so the second LSTM can consume it.
        model.add(Bidirectional(LSTM(num_neurons, return_sequences=True),
            input_shape=(maxlen, len(chars))))
        model.add(Bidirectional(LSTM(num_neurons)))
    else:
        model.add(LSTM(num_neurons, input_shape=(maxlen, len(chars))))
    model.add(Dropout(dropout_val))
    # Two softmax outputs: index 0 = female, index 1 = male (matches is_male).
    model.add(Dense(2))
    model.add(Activation('softmax'))

    # NOTE(review): recent Keras releases name this argument `learning_rate`;
    # `lr` is kept for the Keras version this notebook was run with.
    optimizer = RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # The last 10% of the training samples are held out for validation.
    model.fit(x, to_categorical(y, 2), batch_size=128, epochs=num_epochs, validation_split=0.1, verbose=0)

    x_test = _one_hot_names(test_df.name)
    # Sequential.predict_classes was removed from Keras; taking the argmax of
    # the softmax outputs yields the same class indices.
    y_pred = np.argmax(model.predict(x_test, verbose=0), axis=-1)

    score_f1 = f1_score(test_df.is_male, y_pred, average='macro')
    score_acc = accuracy_score(test_df.is_male, y_pred)

    return (descr, score_f1, score_acc)

scores = []
    
%time scores.append(gen_val_rnn(16, 0.1, False, 60))
%time scores.append(gen_val_rnn(16, 0.1, True, 60))
%time scores.append(gen_val_rnn(128, 0.1, False, 60))
%time scores.append(gen_val_rnn(128, 0.1, True, 60))
%time scores.append(gen_val_rnn(16, 0.3, False, 60))
%time scores.append(gen_val_rnn(16, 0.3, True, 60))
%time scores.append(gen_val_rnn(128, 0.3, False, 60))
%time scores.append(gen_val_rnn(128, 0.3, True, 60))

print("\n")

for s in scores:
    print("F1-measure(%s): %.2f" % (s[0], s[1]))
    print("Accuracy(%s): %.2f\n" % (s[0], s[2]))

Using TensorFlow backend.



Calculating RNN(N=16,D=0.10,Bi=0,Ep=60)
CPU times: user 1min 10s, sys: 10.3 s, total: 1min 20s
Wall time: 19.8 s

Calculating RNN(N=16,D=0.10,Bi=1,Ep=60)
CPU times: user 4min 59s, sys: 23.4 s, total: 5min 23s
Wall time: 55.3 s

Calculating RNN(N=128,D=0.10,Bi=0,Ep=60)
CPU times: user 5min 31s, sys: 15.9 s, total: 5min 47s
Wall time: 1min 6s

Calculating RNN(N=128,D=0.10,Bi=1,Ep=60)
CPU times: user 44min 26s, sys: 1min 59s, total: 46min 25s
Wall time: 6min 51s

Calculating RNN(N=16,D=0.30,Bi=0,Ep=60)
CPU times: user 1min 14s, sys: 10.4 s, total: 1min 25s
Wall time: 21 s

Calculating RNN(N=16,D=0.30,Bi=1,Ep=60)
CPU times: user 5min 4s, sys: 24 s, total: 5min 28s
Wall time: 1min

Calculating RNN(N=128,D=0.30,Bi=0,Ep=60)
CPU times: user 5min 34s, sys: 14 s, total: 5min 47s
Wall time: 1min 10s

Calculating RNN(N=128,D=0.30,Bi=1,Ep=60)
CPU times: user 45min 21s, sys: 2min 3s, total: 47min 25s
Wall time: 7min 22s


F1-measure(RNN(N=16,D=0.10,Bi=0,Ep=60)): 0.83
Accuracy(RNN(N=16,D=0.10,Bi=0,E

In [None]:
# Looks like the best is RNN(N=128,D=0.10,Bi=0,Ep=60). Generally, the more neurons the better;
# a more complex network architecture doesn't give us any gain, maybe because
# there's not too much information in separate letters and letter sequences in human names, i.e.
# there aren't many "features" that the network can deduce.