In [None]:
from keras.layers import SimpleRNN, LSTM, GRU, Dense, Dropout, Input
from keras.models import Model
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load the name/gender table and shuffle it once, so that later "random"
# subsets can be taken by plain prefix slicing.
# The shuffle is seeded: without a fixed random_state every kernel restart
# produces a different row order, hence different 25/50/75 % subsets.
df = pd.read_csv("name_gender.csv")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,name,gender,probability
0,Judythe,F,1.0
1,Nitesh,M,1.0
2,Lan,F,0.73896
3,Latish,F,1.0
4,Neana,F,1.0


In [3]:
# Encode each lower-cased name as a list of integer letter codes
# ('a' -> 1, 'b' -> 2, ..., 'z' -> 26).
names = [
    [ord(ch) - ord('a') + 1 for ch in raw_name.lower()]
    for raw_name in df["name"]
]

In [4]:
# Length of the longest encoded name; used below as the common padded length.
max_len = max(map(len, names))
max_len

15

In [5]:
# Left-pad ("pre") every encoded name up to max_len.
# NOTE: this rebinds `names` from a list of lists to the padded array.
names = pad_sequences(sequences=names, maxlen=max_len, padding="pre")

In [6]:
# How often each value of the `probability` column occurs.
df.loc[:, "probability"].value_counts()

1.000000    84664
0.500000      118
0.666667       65
0.545455       54
0.750000       35
            ...  
0.912764        1
0.988346        1
0.643312        1
0.722359        1
0.877899        1
Name: probability, Length: 8306, dtype: int64

In [7]:
# Integer targets ("M" -> 1, "F" -> 0), then one-hot encoded for the softmax output.
genders = df["gender"].replace({"M": 1, "F": 0}).to_list()
labels = to_categorical(genders, dtype=int)

In [8]:
# One-hot encode every padded letter code.
# num_classes is pinned to 27 (padding code 0 plus letters 1..26) instead of
# being inferred from the largest code present, so the last axis always has the
# width the models are built for, whatever letters the data happen to contain.
inputs = to_categorical(names, num_classes=27, dtype=int)

In [9]:
# Shared hyper-parameters and tensor dimensions used by all models below.
# Mini-batch size passed to every fit() call.
BATCH_SIZE = 256
# Padded name length (longest name in the data set).
MAX_LEN = max_len
# Number of gender classes predicted by the classifiers.
NUM_CLASSES = 2
# Width of the one-hot letter encoding: codes 1..26 plus padding code 0.
NUM_INPUT_CLASSES = 27

## Problem 1

In [15]:
# RNN cells - Simple RNN, LSTM and GRU
def _build_classifier(cell):
    """Build and compile the stacked recurrent gender classifier.

    The three public builders differ only in the recurrent layer class, so the
    shared architecture is defined once here.

    Args:
        cell: Keras recurrent layer class to stack (SimpleRNN, LSTM or GRU).

    Returns:
        A compiled Model that maps a one-hot sequence of shape
        (MAX_LEN, NUM_INPUT_CLASSES) to NUM_CLASSES softmax probabilities.
    """
    inp = Input(shape=(MAX_LEN, NUM_INPUT_CLASSES))
    # The first recurrent layer returns the whole sequence so that a second
    # recurrent layer can be stacked on top of it.
    # NOTE(review): activation="relu" replaces the recurrent layers' default
    # activation; kept as in the original experiments -- confirm it is intended.
    x = cell(64, return_sequences=True, activation="relu")(inp)
    x = Dropout(0.2)(x)
    x = cell(64, activation="relu")(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.2)(x)
    out = Dense(NUM_CLASSES, activation="softmax")(x)
    model = Model(inp, out)
    # A softmax output trained on one-hot targets pairs with categorical
    # cross-entropy (the same loss get_model uses for its softmax output).
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


def get_lstm():
    """Return a new, compiled classifier built from LSTM layers."""
    return _build_classifier(LSTM)


def get_rnn():
    """Return a new, compiled classifier built from SimpleRNN layers."""
    return _build_classifier(SimpleRNN)


def get_gru():
    """Return a new, compiled classifier built from GRU layers."""
    return _build_classifier(GRU)

In [46]:
def train_model(model_type, data_frac, epochs=1, random_state=None):
    """Train one classifier on a leading slice of the data and report accuracy.

    The first ``data_frac`` of ``inputs``/``labels`` is split 80/20 into train
    and test parts; a fresh model is fitted on the train part and evaluated on
    the test part. Results are printed and the overall accuracy is returned.

    Args:
        model_type: "lstm", "rnn" or "gru" (case-insensitive).
        data_frac: Fraction of the data set to use.
        epochs: Number of training epochs (default 1).
        random_state: Optional seed forwarded to train_test_split.

    Returns:
        Overall accuracy on the held-out 20 %.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    builders = {"lstm": get_lstm, "rnn": get_rnn, "gru": get_gru}
    key = str(model_type).lower()
    if key not in builders:
        # Previously any unknown name silently trained a GRU while the report
        # below was printed under the misspelled label.
        raise ValueError(
            "model_type must be one of %s, got %r" % (sorted(builders), model_type)
        )

    total = inputs.shape[0]
    n_rows = int(total * data_frac)
    X = inputs[:n_rows]
    Y = labels[:n_rows]
    trainX, testX, trainY, testY = train_test_split(
        X, Y, test_size=0.2, random_state=random_state
    )

    model = builders[key]()
    print("MODEL TYPE:", model_type,"  Data :", data_frac*100, "%")
    model.fit(trainX, trainY, epochs=epochs, batch_size=BATCH_SIZE, verbose=0)

    y_true = testY.argmax(axis=1)
    y_pred = model.predict(testX).argmax(axis=1)
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy: ", acc)

    # Fix the label set so the matrix keeps one row per class even when a
    # class is absent from this particular test split.
    class_ids = list(range(Y.shape[1]))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    row_totals = cm.sum(axis=1)
    # Per-class accuracy = diagonal / row total; rows without any true sample
    # are reported as nan instead of triggering a division by zero.
    class_acc = np.divide(
        cm.diagonal(),
        row_totals,
        out=np.full(row_totals.shape, np.nan),
        where=row_totals > 0,
    )
    print("Class-wise accuracy:", class_acc)
    print("\n---------------------------\n")
    return acc

In [47]:
# Experiment grid: each cell type is trained on 25 %, 50 %, 75 % and 100 % of
# the (shuffled) data; within each subset 80 % is used for training.
types = "lstm gru rnn".split()
fracs = [0.25, 0.5, 0.75, 1]

In [48]:
# Train and evaluate every (cell type, data fraction) combination,
# cell type in the outer position and fraction in the inner one.
for model_kind, data_share in [(kind, share) for kind in types for share in fracs]:
    train_model(model_kind, data_share)

MODEL TYPE: lstm   Data : 25.0 %
Accuracy:  0.696969696969697
Class-wise accuracy: [0.87138584 0.39586919]

---------------------------

MODEL TYPE: lstm   Data : 50.0 %
Accuracy:  0.8045880248342628
Class-wise accuracy: [0.89267405 0.65046296]

---------------------------

MODEL TYPE: lstm   Data : 75.0 %
Accuracy:  0.8269257752209906
Class-wise accuracy: [0.86574947 0.76016027]

---------------------------

MODEL TYPE: lstm   Data : 100 %
Accuracy:  0.8297905924444912
Class-wise accuracy: [0.84808861 0.79839977]

---------------------------

MODEL TYPE: gru   Data : 25.0 %
Accuracy:  0.797979797979798
Class-wise accuracy: [0.92379993 0.58657642]

---------------------------

MODEL TYPE: gru   Data : 50.0 %
Accuracy:  0.8233189519099232
Class-wise accuracy: [0.87778147 0.72910083]

---------------------------

MODEL TYPE: gru   Data : 75.0 %
Accuracy:  0.8335204153220148
Class-wise accuracy: [0.85866548 0.79035639]

---------------------------

MODEL TYPE: gru   Data : 100 %
Accuracy:

## Problem 2(a)

In [220]:
# Rebuild the unpadded letter-code lists ('a' -> 1 ... 'z' -> 26) and the
# integer gender targets ("M" -> 1, "F" -> 0) for the language-model section.
names = [
    [ord(ch) - ord('a') + 1 for ch in raw_name.lower()]
    for raw_name in df["name"]
]
genders = df["gender"].replace({"M": 1, "F": 0}).to_list()

In [221]:
def get_model():
    """Build and compile the next-character model.

    Input is a one-hot sequence of MAX_LEN-1 steps over NUM_INPUT_CLASSES
    symbols; output is a softmax distribution over NUM_INPUT_CLASSES symbols.
    Trained with categorical cross-entropy and the Adam optimizer.
    """
    prefix = Input(shape=(MAX_LEN-1, NUM_INPUT_CLASSES))

    # Layers in application order.
    stack = [
        LSTM(64, return_sequences=True),
        Dropout(0.2),
        LSTM(64),
        Dense(50, activation="relu"),
        Dropout(0.2),
        Dense(NUM_INPUT_CLASSES, activation="softmax"),
    ]
    tensor = prefix
    for layer in stack:
        tensor = layer(tensor)

    lm = Model(prefix, tensor)
    lm.compile(loss='categorical_crossentropy',
               optimizer='adam',
               metrics=['accuracy'])
    return lm

In [222]:
# All but the last position of each padded name form the model input;
# the last position is the prediction target.
X, Y = inputs[:, :-1, :], inputs[:, -1, :]
X.shape, Y.shape

((95026, 14, 27), (95026, 27))

In [223]:
# Split the encoded names by gender (1 = male, everything else = female),
# then pad and one-hot encode each group.
X_m = [name for name, gender in zip(names, genders) if gender == 1]
X_f = [name for name, gender in zip(names, genders) if gender != 1]

# num_classes is given explicitly: left to inference, the one-hot width would
# depend on the largest letter code present in each subset and could fall
# short of the NUM_INPUT_CLASSES the model is built for.
X_m = pad_sequences(X_m, padding="pre", maxlen=max_len)
X_m = to_categorical(X_m, num_classes=NUM_INPUT_CLASSES)
X_f = pad_sequences(X_f, padding="pre", maxlen=max_len)
X_f = to_categorical(X_f, num_classes=NUM_INPUT_CLASSES)
X_m.shape, X_f.shape

((34722, 15, 27), (60304, 15, 27))

In [224]:
# Fit the male-name model: every position but the last is the input,
# the last position of each padded name is the target.
male_model = get_model()
male_model.fit(x=X_m[:, :-1, :], y=X_m[:, -1, :], batch_size=BATCH_SIZE, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1db588a7a88>

In [226]:
# Fit the female-name model on the same input/target split as the male one.
# NOTE(review): trained for 10 epochs while the male model uses 25 -- confirm
# the difference is intentional.
female_model = get_model()
female_model.fit(x=X_f[:, :-1, :], y=X_f[:, -1, :], batch_size=BATCH_SIZE, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1db468c4048>

In [237]:
def to_text(inp):
    """Decode a sequence of letter codes (1 -> 'a', ..., 26 -> 'z') into a string.

    Entries equal to 0 are skipped.
    """
    offset = ord('a') - 1
    return "".join(chr(code + offset) for code in inp if code != 0)
def generate_names(model, l):
    """Generate one name of up to ``l`` letters with ``model``.

    Starts from a uniformly random first letter (codes 1..26) and then appends,
    l-1 times, the class with the highest predicted probability (argmax) for
    the left-padded, one-hot encoded sequence so far. The codes are decoded
    with to_text.
    """
    codes = [np.random.randint(low=1, high=27)]
    for _ in range(l - 1):
        window = pad_sequences([codes], maxlen=max_len - 1, padding="pre")
        probs = model.predict(to_categorical(window, num_classes=27))
        codes.append(probs.argmax(axis=1)[0])
    return to_text(codes)

In [228]:
# Print 20 generated names per gender model; each name gets a random target
# length drawn from 5..14.
for heading, gender_model in (("MALE NAMES", male_model),
                              ("\n\n FEMALE NAMES", female_model)):
    print(heading)
    for sample_idx in range(20):
        print(generate_names(gender_model, np.random.randint(low=5, high=15)))

MALE NAMES
noyel
wynerton
jinondonyoses
cellendone
alondonyoneshi
edynell
edynelles
cellen
wynertonesh
jinondonyos
tonellond
tonellondo
edynelles
sonellon
onell
hinondo
kilondonesh
fyellynertone
sonell
alondonyoneshi


 FEMALE NAMES
farahaha
xanahahaha
uanahaha
naiahaha
daela
naiahahah
yahanahahaha
naiahahahahah
barahah
yahanahahahaha
yahanahahahaha
jahanaha
iahanahahahaha
uanahahah
eahahahaha
naiaha
daelaha
anahahahaha
ganahahahahah
anahahah


## Problem 2(b)

In [229]:
# Keep only the names whose first letter is a (1), m (13) or z (26), then pad
# and one-hot encode them.
# This reads the first element of each entry in `names`, so `names` must hold
# the unpadded letter-code lists here (a left-padded row starts with padding).
amz_codes = {1, 13, 26}
# The length check skips empty names, which have no first letter to test.
X = [name for name in names if len(name) > 0 and name[0] in amz_codes]
X = pad_sequences(X, padding="pre", maxlen=max_len)
# Explicit num_classes keeps the one-hot width equal to the model's input
# width regardless of which letters occur in this subset.
X = to_categorical(X, num_classes=NUM_INPUT_CLASSES)
X.shape

(19080, 15, 27)

In [230]:
# Input: every position except the last; target: the last position.
trainX, trainY = X[:, :-1, :], X[:, -1, :]
trainX.shape, trainY.shape

((19080, 14, 27), (19080, 27))

In [231]:
# Fit a fresh next-character model on the A/M/Z subset.
amz_model = get_model()
amz_model.fit(x=trainX, y=trainY, batch_size=BATCH_SIZE, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1da16328dc8>

In [232]:
def generate_amz_names(model, l):
    """Generate one name of up to ``l`` letters that starts with a, m or z.

    The first letter code is drawn at random from {1, 13, 26}; each of the
    following l-1 letters is the argmax of ``model``'s prediction for the
    left-padded, one-hot encoded sequence so far. The codes are decoded with
    to_text.
    """
    codes = [np.random.choice([1, 13, 26])]
    for _ in range(l - 1):
        window = pad_sequences([codes], maxlen=max_len - 1, padding="pre")
        probs = model.predict(to_categorical(window, num_classes=27))
        codes.append(probs.argmax(axis=1)[0])
    return to_text(codes)

In [241]:
# Generate 50 names with the language model trained on names starting with
# A, M or Z; each name gets a random target length drawn from 5..14.
print("Names starting with A, M or Z\n")
for sample_idx in range(50):
    print(generate_amz_names(amz_model, np.random.randint(low=5, high=15)))

Names starting with A, M or Z

ziahanahahah
ziahanahahahan
ziahanaha
mirahanahah
adanahi
mirahanah
mirahana
mirahan
ziahanahahahan
ziahanah
ziahanaha
adanahirahahan
ziahanaha
mirahana
ziahanahahah
ziahanahahaha
ziahan
adanahirahah
miraha
adanahirahah
ziahan
mirahanahaha
ziaha
mirah
adana
adanahiraha
ziahanahahah
adanahirahahan
miraha
adanahirahahan
adanahirahaha
adanahir
ziahanahahah
ziahanaha
mirahan
adanah
ziaha
mirah
adanahiraha
adanahirah
ziaha
ziahanah
ziahanah
mirahanah
mirahanah
adanahi
ziahanaha
miraha
ziaha
adanahira


In [None]:
## ... RNN Classifier: Three different versions (Simple, LSTM, GRU) of the classifier --> done!
## ... This criterion is linked to a Learning Outcome: Performance of the classifiers --> done!
## ... This criterion is linked to a Learning Outcome: Gender specific language model for generating names --> done!
## ... Language model to generate names starting with A, M, Z --> done!

In [None]:
## ... To write the code, I referred to various links and wrote my own code.
## ... references: 
## ... https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/
## ... https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
## ... https://maelfabien.github.io/machinelearning/NLP_7/#data-pre-processing
## ... https://adventuresinmachinelearning.com/recurrent-neural-networks-lstm-tutorial-tensorflow/