# Imports

In [1]:
import pandas as pd
import numpy as np
from joblib import dump, load
from unidecode import unidecode
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
import sys
sys.path.append(r"../machine-learning-models/")

# Data

In [3]:
# Database do https://brasil.io/dataset/genero-nomes/nomes/
import requests
url = "https://data.brasil.io/dataset/genero-nomes/nomes.csv.gz"
filename = url.split("/")[-1]
with open(filename, "wb") as f:
    r = requests.get(url)
    f.write(r.content)

dfData = pd.read_csv('nomes.csv.gz') 
dfData

Unnamed: 0,alternative_names,classification,first_name,frequency_female,frequency_male,frequency_total,frequency_group,group_name,ratio
0,AILINE|ALEINE|ALIINE|ALINE|ALINER|ALINHE|ALINN...,F,AALINE,66.0,,66,530550,ALINE,1.0
1,ARAAO|ARAO,M,AARAO,,281.0,281,3526,ARAO,1.0
2,AHARON|AROM|ARON|ARYON|HARON,M,AARON,,676.0,676,3442,ARON,1.0
3,ADA|ADAH|ADAR|ADHA|HADA,F,ABA,82.0,,82,5583,ADA,1.0
4,,M,ABADE,,57.0,57,57,ABADE,1.0
...,...,...,...,...,...,...,...,...,...
100782,MACILEIA,F,MACLEIA,43.0,,43,457,MACILEIA,1.0
100783,GELINE|GILEINE|GLEINE|GLEINER|GLEYNE|JAELINE|J...,F,GIULINE,24.0,,24,2869,JALINE,1.0
100784,DEMILTOM|DEMILTON,M,DEMAILTON,,24.0,24,631,DEMILTON,1.0
100785,ALIVIA|ELIVIA|EULIVIA|HOLIVIA|LEIVIA|LIIVIA|LI...,F,ILIVIA,22.0,,22,179561,LIVIA,1.0


# SVM

In [44]:
treino, teste = train_test_split(dfData, test_size=0.2, random_state=28)

In [45]:
names = teste["first_name"].apply(lambda x: x.lower())
classification = teste['classification']

In [47]:
import SVM.SVM as SVM
svc = SVM.SVM(path=r"../machine-learning-models/SVM/Models/")

In [48]:
predict_svm = svc.classify(names)

In [49]:
names[predict_svm != classification]

49223       kenni
32773    gilvanne
55545         mab
6958        arami
19383    dircemar
           ...   
97304       solly
51756       lelis
55416     luzinal
33011      gisbel
90030     deborar
Name: first_name, Length: 992, dtype: object

In [50]:
len(names[predict_svm != classification])/len(names) * 100

4.921123127294375

In [64]:
misclassifiedSVM = pd.DataFrame()
misclassifiedSVM['nome'] = names[predict_svm != classification]
misclassifiedSVM['ratio'] = test['ratio'][predict_svm != classification]
misclassifiedSVM['gênero_real'] = classification[predict_svm != classification]
misclassifiedSVM['gênero_predito'] = np.array(predict_svm)[predict_svm != classification]
misclassifiedSVM

Unnamed: 0,nome,ratio,gênero_real,gênero_predito
49223,kenni,0.711111,M,F
32773,gilvanne,1.000000,M,F
55545,mab,1.000000,F,M
6958,arami,0.867647,M,F
19383,dircemar,1.000000,F,M
...,...,...,...,...
97304,solly,1.000000,M,F
51756,lelis,0.726253,M,F
55416,luzinal,1.000000,M,F
33011,gisbel,1.000000,F,M


In [99]:
misclassifiedSVM.to_csv("results/Classificação_incorreta_SVM.csv")

In [66]:
resultsSVM = pd.DataFrame()
resultsSVM['nome'] = names
resultsSVM['ratio'] = teste['ratio']
resultsSVM['gênero_real'] = classification
resultsSVM['gênero_predito'] = predict_svm
resultsSVM

Unnamed: 0,nome,ratio,gênero_real,gênero_predito
77699,thaini,1.000000,F,F
21959,edjaelson,1.000000,M,M
20893,edeilsa,1.000000,F,F
42889,jequeson,1.000000,M,M
4172,alicino,1.000000,M,M
...,...,...,...,...
43482,jhonata,0.988947,M,M
16999,delmison,1.000000,M,M
96516,jivoneide,1.000000,F,F
49527,ketline,1.000000,F,F


# BiLSTM

In [67]:
from tensorflow import keras
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split
from keras.layers import Dropout, Flatten, TimeDistributed
from keras.layers import Dense, LSTM, Bidirectional, Activation

In [68]:
train, test = train_test_split(dfData, test_size=0.2, random_state=28)
train, val = train_test_split(train, test_size=0.25, random_state=40)

classification = test['classification'].astype("category").cat.codes.values    # y labels into numbers 0 is F and 1 is M
names = test['first_name'].apply(lambda x: x.lower())             # input names

In [69]:
#word encoding
maxlen = 20                                               # max lenght of a name

'''Define a vocabulary which corresponds to all the unique letters encountered'''
vocab = set(' '.join([str(i) for i in names]))            # creating a vocab
vocab.add('END')
len_vocab = len(vocab)

char_index = dict((c, i) for i, c in enumerate(vocab))    # creating a dictionary
''' The dictionary maps each letter of vocabulary to a number '''

# Builds an empty line with a 1 at the index of character
def set_flag(i):
    aux = np.zeros(len_vocab);
    aux[i] = 1
    return list(aux)

# Truncate names and create the matrix
def prepare_encod_names(X):
    vec_names = []
    trunc_name = [str(i)[0:maxlen] for i in X]  # consider only the first 20 characters
    for i in trunc_name:
        tmp = [set_flag(char_index[j]) for j in str(i)]
        for k in range(0,maxlen - len(str(i))):
            tmp.append(set_flag(char_index["END"]))
        vec_names.append(tmp)
    return vec_names

'''This is called one-hot-encoder and is based on the following papers: 
https://arxiv.org/abs/1707.07129 : Predicting the gender of Indonesian names
https://ieeexplore.ieee.org/document/8560790 : Advance Gender Prediction Tool...
 '''

'This is called one-hot-encoder and is based on the following papers: \nhttps://arxiv.org/abs/1707.07129 : Predicting the gender of Indonesian names\nhttps://ieeexplore.ieee.org/document/8560790 : Advance Gender Prediction Tool...\n '

In [70]:
x = prepare_encod_names(names.values)

In [71]:
bilstm = keras.Sequential()
bilstm.add(Bidirectional(LSTM(64, return_sequences=True), backward_layer=LSTM(64, return_sequences=True, go_backwards=True), input_shape=(maxlen,len_vocab)))
bilstm.add(Dropout(0.1))
bilstm.add(Bidirectional(LSTM(64)))
bilstm.add(Dense(1, activity_regularizer=l2(0.002)))
bilstm.add(Activation('sigmoid'))

In [72]:
bilstm.load_weights(filepath=r"../machine-learning-models/deep-learning-models/BiLSTM/Model/BiLSTM.hdf5")

In [73]:
predict_lstm = (bilstm.predict(x) > 0.5).astype("int32")

In [74]:
p_lstm = [x[0] for x in predict_lstm]

In [75]:
names[p_lstm != classification]

77699       thaini
20893      edeilsa
82859      waneska
15051        dagna
28219      evilany
           ...    
39580       italva
26408       erllen
43482      jhonata
96516    jivoneide
49527      ketline
Name: first_name, Length: 10943, dtype: object

In [76]:
len(names[p_lstm != classification])/len(classification) * 100

54.28613949796607

misclassified names

In [77]:
misclassifiedLSTM = pd.DataFrame()
misclassifiedLSTM['nome'] = names[p_lstm != classification]
misclassifiedLSTM['ratio'] = test['ratio'][p_lstm != classification]
misclassifiedLSTM['gênero_real'] = classification[p_lstm != classification]
misclassifiedLSTM['gênero_predito'] = predict_lstm[p_lstm != classification]
misclassifiedLSTM

Unnamed: 0,nome,ratio,gênero_real,gênero_predito
77699,thaini,1.000000,0,1
20893,edeilsa,1.000000,0,1
82859,waneska,1.000000,0,1
15051,dagna,1.000000,0,1
28219,evilany,1.000000,0,1
...,...,...,...,...
39580,italva,1.000000,0,1
26408,erllen,1.000000,0,1
43482,jhonata,0.988947,1,0
96516,jivoneide,1.000000,0,1


In [98]:
misclassifiedLSTM.to_csv("results/Classificação_incorreta_BiLSTM.csv")

All classified names

In [78]:
resultsLSTM = pd.DataFrame()
resultsLSTM['nome'] = names
resultsLSTM['ratio'] = test['ratio']
resultsLSTM['gênero_real'] = classification
resultsLSTM['gênero_predito'] = predict_lstm
resultsLSTM

Unnamed: 0,nome,ratio,gênero_real,gênero_predito
77699,thaini,1.000000,0,1
21959,edjaelson,1.000000,1,1
20893,edeilsa,1.000000,0,1
42889,jequeson,1.000000,1,1
4172,alicino,1.000000,1,1
...,...,...,...,...
43482,jhonata,0.988947,1,0
16999,delmison,1.000000,1,1
96516,jivoneide,1.000000,0,1
49527,ketline,1.000000,0,1


# CNN

In [79]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras import layers
from keras import models
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D
from tensorflow import keras

In [80]:
train, test = train_test_split(dfData, test_size=0.2, random_state=28)
train, val = train_test_split(train, test_size=0.25, random_state=40)

classification = test['classification'].astype("category").cat.codes.values    # y labels into numbers 0 is F and 1 is M
names = test['first_name'].apply(lambda x: x.lower())             # input names

In [81]:
# Cria um vocabulário com todos os caracteres do dataset
vocab = set(' '.join([str(i) for i in names]))            
vocab.add('END')
len_vocab = len(vocab)
char_index = dict((c, i) for i, c in enumerate(vocab))

In [82]:
maxlen = 14 # Esse é o tamanho do maior nome do dataset

# Essa função seta o valor 1 no vetor, de acordo com a posição da letra e o seu
# valor no vocabulário criado anteriormente
def set_flag(i):
    aux = np.zeros(len_vocab);
    aux[i] = 1
    return list(aux)

# Função que percorre o dataset e chama a funçõa 'set_flag' para setar o valor 1
# na posição correta. Essa funão retor um vetor com todos os nomes codificados.
def prepare_encod_names(X):
    vec_names = []
    trunc_name = [str(i)[0:maxlen] for i in X]  
    for i in trunc_name:
        tmp = [set_flag(char_index[j]) for j in str(i)]
        for k in range(0,maxlen - len(str(i))):
            tmp.append(set_flag(char_index["END"]))
        vec_names.append(tmp)
    return vec_names

In [83]:
# Criando o modelo de rede neural
# Esse modelo possui as seguintes camadas: Convolução de entrada, concolução oculta,
# achatamento, densa oculta, achatamento e uma densa de saída

cnn = models.Sequential([
                         layers.Conv1D(256, 3, activation='relu', kernel_initializer='he_uniform', input_shape=(14,28)),
                         layers.Conv1D(256, 3, activation='relu'),
                         layers.Flatten(),
                         layers.Dropout(0.2),
                         layers.Dense(100, activation='relu', kernel_initializer='he_uniform'),
                         layers.Dropout(0.2),
                         layers.Dense(1, activation='sigmoid', kernel_initializer='he_uniform')
                         
])


In [84]:
cnn.load_weights(filepath=r"../machine-learning-models/deep-learning-models/1D-CNN/Model/CNN.hdf5")

In [85]:
x = prepare_encod_names(names)

In [86]:
predict_cnn = (cnn.predict(x) > 0.5).astype("int32")

In [87]:
p_cnn = [x[0] for x in predict_cnn]

In [88]:
len(names[p_cnn != classification])/len(classification) * 100

54.77229883917055

misclassified names

In [89]:
misclassifiedCNN = pd.DataFrame()
misclassifiedCNN['nome'] = names[p_cnn != classification]
misclassifiedCNN['ratio'] = test['ratio'][p_cnn != classification]
misclassifiedCNN['gênero_real'] = classification[p_cnn != classification]
misclassifiedCNN['gênero_predito'] = predict_cnn[p_cnn != classification]
misclassifiedCNN

Unnamed: 0,nome,ratio,gênero_real,gênero_predito
77699,thaini,1.0,0,1
20893,edeilsa,1.0,0,1
82859,waneska,1.0,0,1
15051,dagna,1.0,0,1
28219,evilany,1.0,0,1
...,...,...,...,...
8582,aurelene,1.0,0,1
39580,italva,1.0,0,1
26408,erllen,1.0,0,1
96516,jivoneide,1.0,0,1


In [97]:
misclassifiedCNN.to_csv("results/Classificação_incorreta_CNN.csv")

All classified names

In [91]:
resultsCNN = pd.DataFrame()
resultsCNN['nome'] = names
resultsCNN['ratio'] = test['ratio']
resultsCNN['gênero_real'] = classification
resultsCNN['gênero_predito'] = predict_cnn
resultsCNN

Unnamed: 0,nome,ratio,gênero_real,gênero_predito
77699,thaini,1.000000,0,1
21959,edjaelson,1.000000,1,1
20893,edeilsa,1.000000,0,1
42889,jequeson,1.000000,1,1
4172,alicino,1.000000,1,1
...,...,...,...,...
43482,jhonata,0.988947,1,1
16999,delmison,1.000000,1,1
96516,jivoneide,1.000000,0,1
49527,ketline,1.000000,0,1


In [92]:
results = pd.DataFrame()
results['nome'] = resultsSVM['nome']
results['ratio'] = resultsSVM['ratio']
results['genero_real'] = resultsSVM['gênero_real']
results['genero_predito_SVM'] = resultsSVM['gênero_predito']
results['genero_predito_CNN'] = resultsCNN['gênero_predito']
results['genero_predito_BiLSTM'] = resultsLSTM['gênero_predito']

In [93]:
results

Unnamed: 0,nome,ratio,genero_real,genero_predito_SVM,genero_predito_CNN,genero_predito_BiLSTM
77699,thaini,1.000000,F,F,1,1
21959,edjaelson,1.000000,M,M,1,1
20893,edeilsa,1.000000,F,F,1,1
42889,jequeson,1.000000,M,M,1,1
4172,alicino,1.000000,M,M,1,1
...,...,...,...,...,...,...
43482,jhonata,0.988947,M,M,1,0
16999,delmison,1.000000,M,M,1,1
96516,jivoneide,1.000000,F,F,1,1
49527,ketline,1.000000,F,F,1,1


In [94]:
results.to_csv("results/Comparacao_da_predicao.csv")