In [15]:
import numpy as np
import pandas as pd

In [53]:
# Sklearn Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier

import joblib
import pickle

In [52]:
!pip install joblib



In [18]:
dataset_path = './Data/name_dataset.csv'
# Load Data
df = pd.read_csv(dataset_path)

In [20]:
# Show sample of data
df.head(5)

Unnamed: 0,index,name,sex
0,0,Mary,F
1,1,Anna,F
2,2,Emma,F
3,3,Elizabeth,F
4,4,Minnie,F


In [26]:
df.size, df.columns

(285075, Index(['index', 'name', 'sex'], dtype='object'))

In [27]:
df.dtypes

index     int64
name     object
sex      object
dtype: object

In [32]:
# Check for missing values
df.isnull().isnull().sum()

index    0
name     0
sex      0
dtype: int64

In [38]:
# Number of Female Names
x = df[df.sex == 'F'].size
x

181800

In [39]:
# Number of Male Names
y = df[df.sex == 'M'].size
y

103275

In [40]:
df.size == (x+y)

True

In [41]:
df_names = df

In [42]:
# Replacing All F and M with 0 and 1 respectively
df_names.sex.replace({'F':0,'M':1},inplace=True)

In [43]:
df_names.sex.unique()

array([0, 1], dtype=int64)

In [44]:
df_names.dtypes

index     int64
name     object
sex       int64
dtype: object

In [45]:
Xfeatures =df_names['name']

In [46]:
# Feature Extraction 
cv = CountVectorizer()
# Features
X = cv.fit_transform(Xfeatures)

In [49]:
cv.get_feature_names()[0:5]

['aaban', 'aabha', 'aabid', 'aabriella', 'aada']

In [54]:
# Labels
y = df_names.sex

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
# Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.6418310970797159

In [64]:
# Accuracy of our Model
print("Accuracy of Model",round(clf.score(X_test,y_test),3)*100,"%")

Accuracy of Model 64.2 %


In [67]:
# Sample1 Prediction
sample_name = ["Julie"]
vect = cv.transform(sample_name).toarray()

In [68]:
vect

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [69]:
# Female is 0, Male is 1
clf.predict(vect)

array([0], dtype=int64)

In [70]:
# Sample2 Prediction
sample_name1 = ["Jason"]
vect1 = cv.transform(sample_name1).toarray()

In [71]:
clf.predict(vect1)

array([1], dtype=int64)

In [73]:
# A function to do it
def genderpredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")

In [79]:
genderpredictor("Ken")

Male


In [80]:
namelist = ["Yaa","Yaw","Femi","Masha"]
for i in namelist:
    print(genderpredictor(i))

Female
None
Male
None
Female
None
Female
None


In [81]:
# By Analogy most female names ends in 'A' or 'E' or has the sound of 'A'
def features(name):
    name = name.lower()
    return {
        'first-letter': name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter': name[-1],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
    }

In [82]:
# Vectorize the features function
features = np.vectorize(features)
print(features(["Anna", "Hannah", "Peter","John","Vladmir","Mohammed"]))

[{'first-letter': 'a', 'first2-letters': 'an', 'first3-letters': 'ann', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna'}
 {'first-letter': 'h', 'first2-letters': 'ha', 'first3-letters': 'han', 'last-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah'}
 {'first-letter': 'p', 'first2-letters': 'pe', 'first3-letters': 'pet', 'last-letter': 'r', 'last2-letters': 'er', 'last3-letters': 'ter'}
 {'first-letter': 'j', 'first2-letters': 'jo', 'first3-letters': 'joh', 'last-letter': 'n', 'last2-letters': 'hn', 'last3-letters': 'ohn'}
 {'first-letter': 'v', 'first2-letters': 'vl', 'first3-letters': 'vla', 'last-letter': 'r', 'last2-letters': 'ir', 'last3-letters': 'mir'}
 {'first-letter': 'm', 'first2-letters': 'mo', 'first3-letters': 'moh', 'last-letter': 'd', 'last2-letters': 'ed', 'last3-letters': 'med'}]


In [83]:
# Extract the features for the dataset
df_X = features(df_names['name'])

In [84]:
df_y = df_names['sex']

In [85]:
corpus = features(["Mike", "Julia"])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)

  (0, 1)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (0, 7)	1.0
  (0, 9)	1.0
  (0, 10)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (1, 6)	1.0
  (1, 8)	1.0
  (1, 11)	1.0


In [86]:
dv.get_feature_names()



['first-letter=j',
 'first-letter=m',
 'first2-letters=ju',
 'first2-letters=mi',
 'first3-letters=jul',
 'first3-letters=mik',
 'last-letter=a',
 'last-letter=e',
 'last2-letters=ia',
 'last2-letters=ke',
 'last3-letters=ike',
 'last3-letters=lia']

In [87]:
# Train Test Split
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

In [88]:
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<76020x8559 sparse matrix of type '<class 'numpy.float64'>'
	with 456120 stored elements in Compressed Sparse Row format>

In [89]:
dclf = DecisionTreeClassifier()
my_xfeatures = dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)

DecisionTreeClassifier()

In [90]:
 # Build Features and Transform them
sample_name_eg = ["Alex"]
transform_dv =dv.transform(features(sample_name_eg))

In [91]:
vect3 = transform_dv.toarray()

In [92]:
# Predicting Gender of Name
# Male is 1,female = 0
dclf.predict(vect3)

array([1], dtype=int64)

In [94]:
 # Build Features and Transform them
sample_name_eg2 = ["Alisson"]
transform_dv2 =dv.transform(features(sample_name_eg2))
vect4 = transform_dv2.toarray()

In [95]:
dclf.predict(vect4)

array([0], dtype=int64)

In [96]:
# A function to do it
def genderpredictor1(a):
    test_name1 = [a]
    transform_dv =dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    if dclf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")
    

In [101]:
random_name_list = ["Alex","Alice","Chioma","Vitalic","Clairese","Chan"]
for n in range(6):
    print(genderpredictor1(random_name_list[n]))

Male
None
Female
None
Female
None
Female
None
Female
None
Male
None


In [102]:
## Decision Tree > Naive Bayes
# Accuracy on training set
print(dclf.score(dv.transform(dfX_train), dfy_train)) 


0.9876742962378321


In [103]:
# Accuracy on test set
print(dclf.score(dv.transform(dfX_test), dfy_test))

0.8685609050249934


In [104]:
# Save model via joblib
decisiontreModel = open("decisiontreemodel.pkl","wb")
joblib.dump(dclf,decisiontreModel)
decisiontreModel.close

<function BufferedWriter.close>

In [None]:
# dctreeModel = open("namesdetectormodel.pkl","wb")
# pickle.dump(dclf,dctreeModel)
# dctreeModel.close()