In [85]:
import numpy as np

In [86]:
import pandas as pd

In [87]:
#ML Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer

In [88]:
#Load our data
df = pd.read_csv('names_dataset_sinhala.csv')

In [89]:
df.head()

Unnamed: 0,Name,Gender
0,අචින්ත්‍යා,Female
1,අදිශානි,Female
2,අදිති,Female
3,අග්රානි,Female
4,අහංසා,Female


In [90]:
df.size

3676

In [91]:
#Data cleaning
#Checking for column name consistency
df.columns

Index(['Name', 'Gender'], dtype='object')

In [92]:
#Data Type
df.dtypes

Name      object
Gender    object
dtype: object

In [93]:
#Numer of female names
df[df.Gender == 'Female'].size

1802

In [94]:
#Numer of male names
df[df.Gender == 'Male'].size

1874

In [95]:
df_names = df

In [96]:
df_names.Gender.replace({'Female':0, 'Male':1}, inplace=True)

In [97]:
df_names.Gender.unique()

array([0, 1], dtype=int64)

In [98]:
df_names.dtypes

Name      object
Gender     int64
dtype: object

In [99]:
Xfeatures = df_names['Name']

In [100]:
#Feature Extraction
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [101]:
cv.get_feature_names()

['අක',
 'අග',
 'අගත',
 'අගශ',
 'අච',
 'අත',
 'අද',
 'අන',
 'අනග',
 'අනන',
 'අප',
 'අබ',
 'අභ',
 'අම',
 'අමන',
 'අමය',
 'අමල',
 'අය',
 'අයන',
 'අර',
 'අරයන',
 'අරල',
 'අශ',
 'අස',
 'අහ',
 'අහර',
 'අහස',
 'ආකර',
 'ආදම',
 'ආන',
 'ආය',
 'ආරද',
 'ආල',
 'ආශ',
 'ඇන',
 'ඉත',
 'ඉන',
 'ඉඳ',
 'ඉම',
 'ඉමන',
 'ඉමය',
 'ඉර',
 'ඉරෂ',
 'ඉල',
 'ඉෂ',
 'ඉස',
 'ඉසල',
 'ඊඩන',
 'ඊතන',
 'උත',
 'උද',
 'උප',
 'උම',
 'උව',
 'උවන',
 'උෂ',
 'ඌව',
 'එත',
 'එන',
 'එම',
 'එමල',
 'එර',
 'එරන',
 'එෂ',
 'එෂල',
 'එසන',
 'එසඳ',
 'ඒන',
 'ඔක',
 'ඔකඳ',
 'ඔන',
 'ඔම',
 'ඔමල',
 'ඔල',
 'ඔව',
 'ඔශ',
 'ඔෂ',
 'ඔස',
 'ඔසන',
 'ඕව',
 'ඕෂ',
 'ඕෂද',
 'කන',
 'කය',
 'කල',
 'කව',
 'ගග',
 'ගගන',
 'ගද',
 'ගන',
 'ගය',
 'ගයත',
 'ගර',
 'ගව',
 'චක',
 'චත',
 'චන',
 'චම',
 'චමත',
 'චමද',
 'චමල',
 'ජන',
 'ජය',
 'ජයන',
 'ජයම',
 'ජයව',
 'ජල',
 'ජස',
 'ටන',
 'ඩන',
 'ඩය',
 'ණද',
 'තක',
 'තජ',
 'තත',
 'තන',
 'තනක',
 'තනව',
 'තම',
 'තමල',
 'තමෂ',
 'තර',
 'තව',
 'තශ',
 'තෂ',
 'තස',
 'දක',
 'දන',
 'දනර',
 'දම',
 'දර',
 'දස',
 'දහම',
 'ධන',
 'නක',
 'නජ',
 '

In [102]:
from sklearn.model_selection import train_test_split

In [103]:
#Features
X
#Labels
y = df_names.Gender

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33)

In [105]:
#Naive Bayes Classifier 
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.47775947281713343

In [106]:
#Accuracy of our model 
print("Accuracy of Model", clf.score(X_test, y_test)*100, '%')

Accuracy of Model 47.77594728171334 %


In [107]:
#Accuracy of our model 
print("Accuracy of Model", clf.score(X_train, y_train)*100, '%')

Accuracy of Model 62.38830219333875 %


In [108]:
#sample prediction
sample_name = ['නෙතුනි']
vect = cv.transform(sample_name).toarray()

In [109]:
vect

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [110]:
clf.predict(vect)

array([1], dtype=int64)

In [111]:
#sample prediction2
sample_name1 = ['දිනේෂ්']
vect1 = cv.transform(sample_name1).toarray()

In [112]:
clf.predict(vect1)

array([1], dtype=int64)

In [113]:
def genderPredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print('Female')
    else:
        print('Male')


In [114]:
genderPredictor("රිද්මාලි")

Male


In [152]:
#Using a custom function for feature analysis
#By Analogy that most of the female names ends in 'A'  or  'I' or has the sound of 'A'
def features(name):
    return{
        'first-letter': name[0], #first letter
        'first2-letters': name[0:2], #first 2 letters
        'first3-letters': name[0:3], #first3 letters
        'last-letter': name[-1], #last letter
        'last2-letters': name[-2:], #last2 letters
        'last3-letters': name[-3:], #last3 letters
    }

In [153]:
#Vectorize the features function
features = np.vectorize(features)
print(features(['කසුනි', 'නවංජනා', 'සන්තුෂ්', 'භාතිය']))

[{'last-letter': 'ි', 'last2-letters': 'නි', 'last3-letters': 'ුනි'}
 {'last-letter': 'ා', 'last2-letters': 'නා', 'last3-letters': 'ජනා'}
 {'last-letter': '්', 'last2-letters': 'ෂ්', 'last3-letters': 'ුෂ්'}
 {'last-letter': 'ය', 'last2-letters': 'ිය', 'last3-letters': 'තිය'}]


In [154]:
#Extract the features for the dataset
df_X = features(df_names['Name'])
df_y = df_names['Gender']

In [155]:
from sklearn.feature_extraction import DictVectorizer
corpus = features(['කසුනි','සන්තුෂ්'])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)

  (0, 1)	1.0
  (0, 2)	1.0
  (0, 4)	1.0
  (1, 0)	1.0
  (1, 3)	1.0
  (1, 5)	1.0


In [156]:
dv.get_feature_names()

['last-letter=්',
 'last-letter=ි',
 'last2-letters=නි',
 'last2-letters=ෂ්',
 'last3-letters=ුනි',
 'last3-letters=ුෂ්']

In [157]:
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X,df_y, test_size=0.33, random_state=42)

In [158]:
dfX_train

array([{'last-letter': 'ි', 'last2-letters': 'දි', 'last3-letters': '්දි'},
       {'last-letter': 'ා', 'last2-letters': 'කා', 'last3-letters': 'ංකා'},
       {'last-letter': 'ක', 'last2-letters': 'ික', 'last3-letters': 'මික'},
       ...,
       {'last-letter': 'ි', 'last2-letters': 'ගි', 'last3-letters': 'ංගි'},
       {'last-letter': 'ර', 'last2-letters': 'වර', 'last3-letters': 'පවර'},
       {'last-letter': 'ෂ', 'last2-letters': '්ෂ', 'last3-letters': 'ල්ෂ'}],
      dtype=object)

In [159]:
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<1231x513 sparse matrix of type '<class 'numpy.float64'>'
	with 3693 stored elements in Compressed Sparse Row format>

In [160]:
#Model building Using Dictionaries
from sklearn.tree import DecisionTreeClassifier
dclf= DecisionTreeClassifier()
my_xfeatures = dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)

DecisionTreeClassifier()

In [161]:
DecisionTreeClassifier()

DecisionTreeClassifier()

In [162]:
#Build Features and Transform them
sample_name_eg = ['භාතිය']
transform_dv = dv.transform(features(sample_name_eg))

In [163]:
vect3 = transform_dv.toarray()

In [164]:
#Predicting Gender of name
#Male is 1, Female is 0
dclf.predict(vect3)

array([0], dtype=int64)

In [165]:
if dclf.predict(vect3) ==0:
    print('Female')
else:
    print('Male')

Female


In [166]:
#A function to do it - Watch 13.26
def genderPredictor1(a):
    test_name1 = [a]
    transform_dv = dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    if dclf.predict(vector) == 0:
        print('Female')
    else:
        print('Male')
    

In [167]:
random_name_list = ["ඇලෙක්ස්","ශලීෂා", "සුසිතා", "නිර්මාල්"]

In [168]:
for n in random_name_list:
    print (genderPredictor1(n))

Male
None
Female
None
Female
None
Male
None


In [169]:
##Accuracy of models descision tree Classifier works better than Naive bayes
#Accuracy on training set
print(dclf.score(dv.transform(dfX_train), dfy_train))

0.9731925264012997


In [170]:
#Accuracy on test set
print(dclf.score(dv.transform(dfX_test), dfy_test))

0.9456342668863262


In [171]:
#Saving our model
import joblib

In [172]:
decisiontreeModel = open("decisiontreemodel.pkl","wb")

In [173]:
joblib.dump(dclf, decisiontreeModel)

In [174]:
decisiontreeModel.close

<function BufferedWriter.close>

In [175]:
#Alternative model close
import pickle

In [176]:
dctreeModel = open("namesdetectoremodel.pkl","wb")
pickle.dump(dclf, dctreeModel)

In [177]:
dctreeModel.close()

In [178]:
NaiveBayesModel = open("naivebayesgendermodel.pkl","wb")

In [179]:
joblib.dump(clf, NaiveBayesModel)

In [180]:
NaiveBayesModel.close()