In [104]:
import numpy as np

In [105]:
import pandas as pd

In [106]:
#ML Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer

In [107]:
#Load our data
df = pd.read_csv('names_dataset.csv')

In [108]:
df.head()

Unnamed: 0,Index,Name,Gender
0,0,Achinthya,Female
1,1,Adele,Female
2,2,Adishani,Female
3,3,Adithi,Female
4,4,Agrani,Female


In [109]:
df.size

5739

In [110]:
#Data cleaning
#Checking for column name consistency
df.columns

Index(['Index', 'Name', 'Gender'], dtype='object')

In [111]:
#Data Type
df.dtypes

Index      int64
Name      object
Gender    object
dtype: object

In [112]:
#Numer of female names
df[df.Gender == 'Female'].size

2736

In [113]:
#Numer of male names
df[df.Gender == 'Male'].size

3003

In [114]:
df_names = df

In [115]:
df_names.Gender.replace({'Female':0, 'Male':1}, inplace=True)

In [116]:
df_names.Gender.unique()

array([0, 1], dtype=int64)

In [117]:
df_names.dtypes

Index      int64
Name      object
Gender     int64
dtype: object

In [118]:
Xfeatures = df_names['Name']

In [119]:
#Feature Extraction
cv = CountVectorizer(encoding='utf-8')
X = cv.fit_transform(Xfeatures.values.astype('U'))

In [120]:
cv.get_feature_names()

['abeetha',
 'abilash',
 'abinada',
 'abinuka',
 'abises',
 'achinthya',
 'achira',
 'adam',
 'adeekshana',
 'adeepa',
 'adele',
 'adesh',
 'adishani',
 'aditha',
 'adithi',
 'agash',
 'agathisi',
 'agbow',
 'agrani',
 'ahansa',
 'aharsha',
 'ahasika',
 'akarsha',
 'akeesha',
 'aken',
 'akenya',
 'akeshia',
 'akethya',
 'akidu',
 'akila',
 'akith',
 'akithma',
 'akshadi',
 'alanah',
 'alen',
 'aliyah',
 'aloka',
 'amalka',
 'amalmi',
 'amameth',
 'amandi',
 'amasha',
 'amaya',
 'amayuru',
 'ameesha',
 'amelia',
 'amilka',
 'aminda',
 'amindi',
 'anagi',
 'anannya',
 'andrew',
 'andreyana',
 'anidu',
 'anoshi',
 'anudha',
 'anudima',
 'anuditha',
 'anuhansi',
 'anuja',
 'anujitha',
 'anuki',
 'anusara',
 'anusas',
 'anushima',
 'anuthmi',
 'apsara',
 'apurva',
 'araddya',
 'araliya',
 'arayan',
 'areli',
 'arithaka',
 'arkarshi',
 'arosh',
 'aroshika',
 'arshinsa',
 'arundi',
 'asel',
 'ashal',
 'ashani',
 'ashaya',
 'ashel',
 'ashennya',
 'ashinsa',
 'ashvika',
 'ashvini',
 'asindu',
 

In [121]:
from sklearn.model_selection import train_test_split

In [122]:
#Features
X
#Labels
y = df_names.Gender

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state =42)

In [124]:
#Naive Bayes Classifier 
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.509493670886076

In [125]:
#Accuracy of our model 
print("Accuracy of Model", clf.score(X_test, y_test)*100, '%')

Accuracy of Model 50.9493670886076 %


In [126]:
#Accuracy of our model 
print("Accuracy of Model", clf.score(X_train, y_train)*100, '%')

Accuracy of Model 98.43871975019516 %


In [127]:
#sample prediction
sample_name = ['Nethuni']
vect = cv.transform(sample_name).toarray()

In [128]:
vect

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [129]:
clf.predict(vect)

array([1], dtype=int64)

In [130]:
#sample prediction2
sample_name1 = ['Dinesh']
vect1 = cv.transform(sample_name1).toarray()
vect1

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [131]:
from sklearn import metrics

In [132]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score,f1_score

In [133]:
#print(metrics.confusion_matrix(X_train, y_train))

In [134]:
def genderPredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print('Female')
    else:
        print('Male')


In [135]:
genderPredictor("Ridhmali")

Male


In [136]:
name_list = ["Alex","Shalisha", "Susitha", "Nirmal"]
for n in name_list:
    print (genderPredictor1(n))

Male
None
Female
None
Male
None
Male
None


In [137]:
#Using a custom function for feature analysis
#By Analogy that most of the female names ends in 'A'  or  'I' or has the sound of 'A'
def features(name):
    name=name.lower()
    return{
        'first-letter': name[0], #first letter
        'first2-letters': name[0:2], #first 2 letters
        'first3-letters': name[0:3], #first3 letters
        'last-letter': name[-1], #last letter
        'last2-letters': name[-2:], #last2 letters
        'last3-letters': name[-3:], #last3 letters
    }

In [138]:
#Vectorize the features function
features = np.vectorize(features)
print(features(['Kasuni', 'Navanjana', 'Santhusha', 'Bhathiya']))

[{'first-letter': 'k', 'first2-letters': 'ka', 'first3-letters': 'kas', 'last-letter': 'i', 'last2-letters': 'ni', 'last3-letters': 'uni'}
 {'first-letter': 'n', 'first2-letters': 'na', 'first3-letters': 'nav', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'ana'}
 {'first-letter': 's', 'first2-letters': 'sa', 'first3-letters': 'san', 'last-letter': 'a', 'last2-letters': 'ha', 'last3-letters': 'sha'}
 {'first-letter': 'b', 'first2-letters': 'bh', 'first3-letters': 'bha', 'last-letter': 'a', 'last2-letters': 'ya', 'last3-letters': 'iya'}]


In [139]:
#Extract the features for the dataset
df_X = features(df_names['Name'])
df_y = df_names['Gender']

In [140]:
from sklearn.feature_extraction import DictVectorizer
corpus = features(['කසුනි','සන්තුෂ්'])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)

  (0, 0)	1.0
  (0, 2)	1.0
  (0, 4)	1.0
  (0, 7)	1.0
  (0, 8)	1.0
  (0, 10)	1.0
  (1, 1)	1.0
  (1, 3)	1.0
  (1, 5)	1.0
  (1, 6)	1.0
  (1, 9)	1.0
  (1, 11)	1.0


In [141]:
dv.get_feature_names()

['first-letter=ක',
 'first-letter=ස',
 'first2-letters=කස',
 'first2-letters=සන',
 'first3-letters=කසු',
 'first3-letters=සන්',
 'last-letter=්',
 'last-letter=ි',
 'last2-letters=නි',
 'last2-letters=ෂ්',
 'last3-letters=ුනි',
 'last3-letters=ුෂ්']

In [142]:
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X,df_y, test_size=0.33, random_state=42)

In [143]:
dfX_train

array([{'first-letter': 'd', 'first2-letters': 'da', 'first3-letters': 'dak', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'ina'},
       {'first-letter': 'v', 'first2-letters': 'vi', 'first3-letters': 'vid', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'sna'},
       {'first-letter': 'v', 'first2-letters': 'vi', 'first3-letters': 'vis', 'last-letter': 'l', 'last2-letters': 'al', 'last3-letters': 'hal'},
       ...,
       {'first-letter': 'v', 'first2-letters': 'vi', 'first3-letters': 'vin', 'last-letter': 'a', 'last2-letters': 'ya', 'last3-letters': 'dya'},
       {'first-letter': 'n', 'first2-letters': 'ni', 'first3-letters': 'nis', 'last-letter': 'a', 'last2-letters': 'ya', 'last3-letters': 'kya'},
       {'first-letter': 'd', 'first2-letters': 'du', 'first3-letters': 'dul', 'last-letter': 'n', 'last2-letters': 'an', 'last3-letters': 'lan'}],
      dtype=object)

In [144]:
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<1281x831 sparse matrix of type '<class 'numpy.float64'>'
	with 7686 stored elements in Compressed Sparse Row format>

In [145]:
#Model building Using Dictionaries
from sklearn.tree import DecisionTreeClassifier
dclf= DecisionTreeClassifier()
my_xfeatures = dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)

DecisionTreeClassifier()

In [146]:
DecisionTreeClassifier()

DecisionTreeClassifier()

In [147]:
#Build Features and Transform them
sample_name_eg = ['Bhathiya']
transform_dv = dv.transform(features(sample_name_eg))

In [148]:
vect3 = transform_dv.toarray()

In [149]:
#Predicting Gender of name
#Male is 1, Female is 0
dclf.predict(vect3)

array([0], dtype=int64)

In [150]:
if dclf.predict(vect3) ==0:
    print('Female')
else:
    print('Male')

Female


In [151]:
#A function to do it - Watch 13.26
def genderPredictor1(a):
    test_name1 = [a]
    transform_dv = dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    if dclf.predict(vector) == 0:
        print('Female')
    else:
        print('Male')
    

In [152]:
random_name_list = ["ඇලෙක්ස්","ශලීෂා", "සුසිතා", "නිර්මාල්"]

In [153]:
for n in random_name_list:
    print (genderPredictor1(n))

Male
None
Male
None
Male
None
Male
None


In [154]:
##Accuracy of models descision tree Classifier works better than Naive bayes
#Accuracy on training set
print("Accuracy of Model", dclf.score(dv.transform(dfX_train), dfy_train))

Accuracy of Model 0.975800156128025


In [155]:
#Accuracy on test set
print("Accuracy of Model", dclf.score(dv.transform(dfX_test), dfy_test))

Accuracy of Model 0.8575949367088608


In [156]:
from sklearn.metrics import precision_score, recall_score,f1_score
dclf.recall_score(dv.transform(dfX_test), dfy_test)

AttributeError: 'DecisionTreeClassifier' object has no attribute 'recall_score'

In [157]:
#Saving our model
import joblib

In [240]:
decisiontreeModel = open("decisiontreemodel.pkl","wb")

In [241]:
joblib.dump(dclf, decisiontreeModel)

In [242]:
decisiontreeModel.close

<function BufferedWriter.close>

In [243]:
#Alternative model close
import pickle

In [244]:
dctreeModel = open("namesdetectoremodel.pkl","wb")
pickle.dump(dclf, dctreeModel)

In [245]:
dctreeModel.close()

In [246]:
NaiveBayesModel = open("naivebayesgendermodel.pkl","wb")

In [247]:
joblib.dump(clf, NaiveBayesModel)

In [248]:
NaiveBayesModel.close()