## Gender Classification Of Names
### Using Machine Learning To Detect/Predict Gender of Individuals 
+ Sklearn
+ Pandas
+ Text Extraction

In [None]:
# EDA packages
import pandas as pd
import numpy as np


In [None]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Load the names dataset from a CSV file in the working directory.
# NOTE(review): later cells read the `name` and `sex` columns -- confirm the
# file provides both.
df = pd.read_csv('names_dataset.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Total number of cells (rows x columns) -- not the number of rows.
df.size

In [None]:
# Data Cleaning
# List the column labels to check their naming before columns are accessed by name.
df.columns

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Count the missing values in each column.
# A single isnull() is required: applying it twice tests the boolean mask
# itself, which never contains NaN, so every column would report 0.
df.isnull().sum()

In [None]:
# Number of Female Names
# len() counts rows; DataFrame.size would count cells (rows x columns) and
# overstate the number of names.
len(df[df.sex == 'F'])

In [None]:
# Number of Male Names
# len() counts rows; DataFrame.size would count cells (rows x columns) and
# overstate the number of names.
len(df[df.sex == 'M'])

In [None]:
# Work on an independent copy so that encoding the `sex` column below does not
# also overwrite the raw frame `df` (a plain assignment only creates an alias).
df_names = df.copy()

In [None]:
# Replacing All F and M with 0 and 1 respectively
# Assign the result back to the column: an in-place replace on the attribute
# accessor (`df_names.sex.replace(..., inplace=True)`) is a chained operation
# that may act on a temporary and is not applied under pandas Copy-on-Write.
df_names['sex'] = df_names['sex'].replace({'F': 0, 'M': 1})

In [None]:
# Distinct values in the sex column after the F/M -> 0/1 replacement.
df_names.sex.unique()

In [None]:
# Column data types after the replacement step.
df_names.dtypes

In [None]:
# The `name` column is the raw model input.
Xfeatures =df_names['name']

In [None]:
# Feature extraction: fit a CountVectorizer on the names and encode them as
# the matrix X (converted with .toarray() where a dense array is needed).
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [None]:
# Vocabulary learned by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
(cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
 else cv.get_feature_names())

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features: the count matrix built by the vectorizer
X
# Labels: the encoded sex column
y = df_names["sex"]

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Naive Bayes classifier: fit on the training split, then report the mean
# accuracy on the held-out test split.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train, y_train)
clf.score(X_test, y_test)


In [None]:
# Accuracy of the Naive Bayes model on the test split, as a percentage
test_accuracy = clf.score(X_test, y_test) * 100
print("Accuracy of Model", test_accuracy, "%")

In [None]:
# Accuracy of the Naive Bayes model on the TRAINING split, as a percentage
# (compare with the test-split figure to gauge over-fitting)
train_accuracy = clf.score(X_train, y_train) * 100
print("Accuracy of Model", train_accuracy, "%")

### Sample Prediction

In [None]:
# Sample 1: encode a single name with the fitted vectorizer and densify it.
sample_name = ["Mary"]
vect = cv.transform(sample_name).toarray()

In [None]:
# Show the encoded vector for the sample name.
vect

In [None]:
# Predicted label for sample 1 (Female is 0, Male is 1)
clf.predict(vect)

In [None]:
# Sample 2: encode a second name the same way.
sample_name1 = ["Mark"]
vect1 = cv.transform(sample_name1).toarray()

In [None]:
# Predicted label for sample 2 (Female is 0, Male is 1)
clf.predict(vect1)

In [None]:
# Sample 3: prediction for a Russian name
sample_name2 = ["Natasha"]
vect2 = cv.transform(sample_name2).toarray()

In [None]:
# Predicted label for sample 3 (Female is 0, Male is 1)
clf.predict(vect2)

In [None]:
# Sample 4: encode a batch of assorted names in one call (one row per name).
sample_name3 = ["Nefertiti","Nasha","Ama","Ayo","Xhavier","Ovetta","Tathiana","Xia","Joseph","Xianliang"]
vect3 = cv.transform(sample_name3).toarray()

In [None]:
# Predicted labels for the batch, in input order (Female is 0, Male is 1)
clf.predict(vect3)

In [None]:
# Convenience wrapper around the Naive Bayes model
def genderpredictor(a):
    """Print the gender predicted for the single name ``a``.

    The name is encoded with the module-level vectorizer ``cv`` and
    classified with the module-level model ``clf``; class 0 is reported as
    "Female" and anything else as "Male".

    The label is printed, not returned (the function returns ``None``).
    """
    encoded = cv.transform([a]).toarray()
    is_female = clf.predict(encoded) == 0
    print("Female" if is_female else "Male")
    

In [None]:
# Try the helper on one name; it prints the predicted label.
genderpredictor("Martha")

In [None]:
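# Outline of the custom-feature workflow:
# 1. Write the features function
# 2. Apply the function to the names
# 3. Vectorizer: fit, then transform
# 4. Classifier: fit, then predict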


In [None]:
namelist = ["Yaa","Yaw","Femi","Masha"]
# genderpredictor prints the label itself and returns None, so wrapping the
# call in print() would emit a stray "None" after every prediction.
for name in namelist:
    genderpredictor(name)

### Using a custom function for feature analysis

In [None]:
# By analogy, most female names end in 'A' or 'E' or have the sound of 'A'
def features(name):
    """Build the prefix/suffix feature dict for a single name.

    The name is lower-cased and described by its first 1-3 and last 1-3
    characters. Names shorter than three characters simply yield shorter
    prefixes/suffixes.

    Slices are used for the single-character features as well, so an empty
    name produces empty-string features instead of raising IndexError.

    Args:
        name: The name to describe (any object with ``str``-like ``lower()``
            and slicing).

    Returns:
        A dict with the keys 'first-letter', 'first2-letters',
        'first3-letters', 'last-letter', 'last2-letters' and 'last3-letters'.
    """
    name = name.lower()
    return {
        'first-letter': name[:1],  # First letter ('' for an empty name)
        'first2-letters': name[0:2],  # First 2 letters
        'first3-letters': name[0:3],  # First 3 letters
        'last-letter': name[-1:],  # Last letter ('' for an empty name)
        'last2-letters': name[-2:],  # Last 2 letters
        'last3-letters': name[-3:],  # Last 3 letters
    }

In [None]:
# Wrap the per-name function with np.vectorize so it accepts a whole sequence
# of names and returns one feature dict per name.
# Note: this rebinds `features`; re-running the cell wraps the wrapper again.
features = np.vectorize(features)
demo_names = ["Anna", "Hannah", "Peter","John","Vladmir","Mohammed"]
print(features(demo_names))

In [None]:
# Extract the feature dicts for every name in the dataset
# (uses the vectorized `features`, so the result holds one dict per row).
df_X = features(df_names['name'])

In [None]:
# Labels for the custom-feature model: the encoded sex column.
df_y = df_names['sex']

In [None]:
from sklearn.feature_extraction import DictVectorizer

# Small demonstration of DictVectorizer on two names: fit it on their feature
# dicts, encode the same dicts, and print the encoded result.
corpus = features(["Mike", "Julia"])
dv = DictVectorizer()
transformed = dv.fit(corpus).transform(corpus)
print(transformed)
 

In [None]:
# Feature names learned by the demo DictVectorizer.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
(dv.get_feature_names_out() if hasattr(dv, "get_feature_names_out")
 else dv.get_feature_names())

In [None]:
# Train/test split of the feature dicts and labels (33% held out, fixed seed)
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(
    df_X,
    df_y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Inspect the training feature dicts.
dfX_train

In [None]:

# Replace the demo vectorizer with a fresh one fitted on the training feature
# dicts only; later cells reuse this `dv` to encode names for the decision tree.
dv = DictVectorizer()
dv.fit_transform(dfX_train)


In [None]:
# Model building using a decision tree

from sklearn.tree import DecisionTreeClassifier

# Encode the training feature dicts with the fitted DictVectorizer, then fit
# the tree on the encoded matrix and the training labels.
my_xfeatures = dv.transform(dfX_train)
dclf = DecisionTreeClassifier()
dclf.fit(my_xfeatures, dfy_train)


In [None]:
# Build the feature dict for one sample name and encode it with the fitted
# DictVectorizer.
sample_name_eg = ["Alex"]
transform_dv =dv.transform(features(sample_name_eg))


In [None]:
# Dense copy of the encoded sample.
# Note: this reuses the name `vect3`, overwriting the Naive Bayes batch above.
vect3 = transform_dv.toarray()

In [None]:
# Predict the gender of the sample name with the decision tree
# (Male is 1, Female is 0)
dclf.predict(vect3)

In [None]:
# Turn the numeric prediction into a readable label (0 -> Female, otherwise Male).
print("Female" if dclf.predict(vect3) == 0 else "Male")

In [None]:
# Second prediction, this time with a Nigerian name:
# featurize -> encode -> densify -> print the label (0 -> Female, otherwise Male)
name_eg1 = ["Chioma"]
transform_dv = dv.transform(features(name_eg1))
vect4 = transform_dv.toarray()
print("Female" if dclf.predict(vect4) == 0 else "Male")

In [None]:
# Convenience wrapper around the decision-tree model
def genderpredictor1(a):
    """Print the gender predicted for the single name ``a``.

    The name is turned into a feature dict with the module-level (vectorized)
    ``features``, encoded with the fitted ``dv`` and classified with ``dclf``;
    class 0 is reported as "Female" and anything else as "Male".

    The label is printed, not returned (the function returns ``None``).
    """
    encoded = dv.transform(features([a])).toarray()
    print("Female" if dclf.predict(encoded) == 0 else "Male")
    

In [None]:
# Assorted names for a quick check of the decision-tree helper.
random_name_list = ["Alex","Alice","Chioma","Vitalic","Clairese","Chan"]

In [None]:
# genderpredictor1 prints the label itself and returns None, so wrapping the
# call in print() would emit a stray "None" after every prediction.
for n in random_name_list:
    genderpredictor1(n)

In [None]:
## Accuracy of the decision-tree model (the notebook compares it with Naive Bayes)
# Accuracy on the training set
train_matrix = dv.transform(dfX_train)
print(dclf.score(train_matrix, dfy_train))
 

In [None]:
# Accuracy on the held-out test set
test_matrix = dv.transform(dfX_test)
print(dclf.score(test_matrix, dfy_test))

### Saving Our Model

In [None]:
# scikit-learn removed its vendored `sklearn.externals.joblib` (0.23+);
# joblib is now a standalone package. Prefer it and fall back to the vendored
# copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# Open the target file in binary write mode; it must be closed explicitly
# after the model has been dumped.
decisiontreModel = open("decisiontreemodel.pkl","wb")

In [None]:
# Serialize the trained decision tree into the open file.
# NOTE(review): the fitted DictVectorizer `dv` is not saved here, yet the tree
# was trained on dv-encoded features -- confirm it is persisted elsewhere.
joblib.dump(dclf,decisiontreModel)

In [None]:
# Actually call close(): referencing the method without parentheses is a no-op
# that leaves the file open and its buffered contents possibly unwritten.
decisiontreModel.close()

In [None]:
# Alternative way of saving the model: the standard-library pickle module
import pickle
dctreeModel = open("namesdetectormodel.pkl","wb")

In [None]:
# Serialize the trained decision tree into the open file with pickle.
pickle.dump(dclf,dctreeModel)

In [None]:
# Close the file so the pickled model is flushed to disk.
dctreeModel.close()

##### Save Multinomial NB Model

In [None]:
# Open the target file for the Naive Bayes model in binary write mode.
NaiveBayesModel = open("naivebayesgendermodel.pkl","wb")

In [None]:
# Serialize the trained Naive Bayes model into the open file.
# NOTE(review): the fitted CountVectorizer `cv` is not saved here, yet the
# model was trained on cv-encoded names -- confirm it is persisted elsewhere.
joblib.dump(clf,NaiveBayesModel)

In [None]:
# Close the file so the saved model is flushed to disk.
NaiveBayesModel.close()

In [None]:
# Thanks
# By Jesse JCharis
# Jesus Saves @ JCharisTech
# J-Secur1ty

In [None]:
# genderpredictor prints the label itself and returns None; printing its
# return value would only add a stray "None" line.
genderpredictor("Bianca")

In [None]:
# Naive Bayes prediction for "Sasha". The helper prints the label and returns
# None, so there is nothing useful to capture or print afterwards.
genderpredictor("Sasha")

In [None]:
# Decision-tree prediction for "Sasha". The helper prints the label and
# returns None, so there is nothing useful to capture or print afterwards.
genderpredictor1("Sasha")

In [None]:
# Batch of (mostly Portuguese) names to classify with the decision-tree model.
list2=["Alex","Alice","Chioma","Vitalic","Clairese","Chan","Aisha","Sasha","Glenn","Vitória","Bruna","Brenda","Karina","Adalgira","João","Pedro","Reginaldo","Sofia","Sara","Bianca","Samuel","Diana","Letícia","Marinalva","Efigênia","Rita","Ancelmo"]

print("\n##########################\n")
print(len(list2))

# Reuse the decision-tree helper for every name instead of repeating its
# featurize/encode/predict steps behind a hand-maintained index counter.
# genderpredictor1 prints "Female" or "Male" for each name.
for candidate in list2:
    genderpredictor1(candidate)

In [None]:
import csv

# Classify every name in PDT2A.csv with the decision-tree model and append the
# labels to PythonPDT.csv: a "Gender" header row followed by one label per row.
# NOTE(review): assumes the name is the first field of each input row --
# confirm against the real file layout.
labels = []
with open('PDT2A.csv', encoding="utf8", newline='') as stream:
    for row in csv.reader(stream):
        print(row)
        if not row:
            # Blank line: there is no name to featurize, so skip it rather
            # than let the vectorized `features` fail on an empty sequence.
            continue
        vect4 = dv.transform(features(row[:1])).toarray()
        labels.append("Female" if dclf.predict(vect4) == 0 else "Male")
print("\n#################################\n\n")

# csv.writer.writerow() treats a plain string as a sequence of one-character
# fields, so the labels are written as proper single-field rows instead of one
# long concatenated string. The file is opened in append mode, so every run
# adds another header + label section.
with open('PythonPDT.csv', 'a', encoding="utf8", newline='') as arquivo_csv:
    escrever = csv.writer(arquivo_csv, lineterminator='\n')
    escrever.writerow(["Gender"])
    escrever.writerows([label] for label in labels)

In [None]:
import csv

# Merge two CSV files row by row and append the result to PTP3Y.csv.
# For each row of ClassificadosPT.csv: if any field equals "0", the row at the
# same index of PythonPT.csv is used instead; otherwise the row is kept as is.
with open('ClassificadosPT.csv', encoding="utf8", newline='') as stream:
    list2 = list(csv.reader(stream))
with open('PythonPT.csv', encoding="utf8", newline='') as stream:
    list3 = list(csv.reader(stream))

print(len(list2))
print(len(list3))

merged_rows = []
for idx, classified in enumerate(list2):
    print(classified)
    if "0" in classified:
        # Raises IndexError if PythonPT.csv has fewer rows than needed.
        merged_rows.append(list3[idx])
        print("brinks")
    else:
        merged_rows.append(classified)
        print("Navelin")

# Write real CSV rows: passing a string to writerow() would split it into
# one-character fields, and str(row) would store the Python list repr.
with open('PTP3Y.csv', 'a', encoding="utf8", newline='') as arquivo_csv:
    escrever = csv.writer(arquivo_csv, lineterminator='\n')
    escrever.writerows(merged_rows)