In [182]:
import numpy as np

In [183]:
import pandas as pd

In [184]:
#ML Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer

In [185]:
#Load our data
df = pd.read_csv('names_dataset_sinhala.csv')

In [186]:
df.head()

Unnamed: 0,Name,Gender
0,අචින්ත්‍යා,Female
1,අදිශානි,Female
2,අදිති,Female
3,අග්රානි,Female
4,අහංසා,Female


In [187]:
df.size

3676

In [188]:
#Data cleaning
#Checking for column name consistency
df.columns

Index(['Name', 'Gender'], dtype='object')

In [189]:
#Data Type
df.dtypes

Name      object
Gender    object
dtype: object

In [190]:
#Numer of female names
df[df.Gender == 'Female'].size

1802

In [191]:
#Numer of male names
df[df.Gender == 'Male'].size

1874

In [192]:
df_names = df

In [193]:
df_names.Gender.replace({'Female':0, 'Male':1}, inplace=True)

In [194]:
df_names.Gender.unique()

array([0, 1], dtype=int64)

In [195]:
df_names.dtypes

Name      object
Gender     int64
dtype: object

In [196]:
Xfeatures = df_names['Name']

In [197]:
#Feature Extraction
#cv = CountVectorizer()
#X = cv.fit_transform(Xfeatures)

#Using a custom function for feature analysis
#By Analogy that most of the female names ends in 'A'  or  'I' or has the sound of 'A'
def features(name):
    return{
        'first-letter': name[0], #first letter
        'first2-letters': name[0:2], #first 2 letters
        'first3-letters': name[0:3], #first3 letters
        'last-letter': name[-1], #last letter
        'last2-letters': name[-2:], #last2 letters
        'last3-letters': name[-3:], #last3 letters
    }
#Vectorize the features function
features = np.vectorize(features)
print(features(['කසුනි', 'නවංජනා', 'සන්තුෂ්', 'භාතිය']))

#Extract the features for the dataset
X = features(df_names['Name'])
y = df_names['Gender']


[{'first-letter': 'ක', 'first2-letters': 'කස', 'first3-letters': 'කසු', 'last-letter': 'ි', 'last2-letters': 'නි', 'last3-letters': 'ුනි'}
 {'first-letter': 'න', 'first2-letters': 'නව', 'first3-letters': 'නවං', 'last-letter': 'ා', 'last2-letters': 'නා', 'last3-letters': 'ජනා'}
 {'first-letter': 'ස', 'first2-letters': 'සන', 'first3-letters': 'සන්', 'last-letter': '්', 'last2-letters': 'ෂ්', 'last3-letters': 'ුෂ්'}
 {'first-letter': 'භ', 'first2-letters': 'භා', 'first3-letters': 'භාත', 'last-letter': 'ය', 'last2-letters': 'ිය', 'last3-letters': 'තිය'}]


In [198]:
from sklearn.feature_extraction import DictVectorizer
corpus = features(['කසුනි','සන්තුෂ්'])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)

  (0, 0)	1.0
  (0, 2)	1.0
  (0, 4)	1.0
  (0, 7)	1.0
  (0, 8)	1.0
  (0, 10)	1.0
  (1, 1)	1.0
  (1, 3)	1.0
  (1, 5)	1.0
  (1, 6)	1.0
  (1, 9)	1.0
  (1, 11)	1.0


In [199]:
dv.get_feature_names()

['first-letter=ක',
 'first-letter=ස',
 'first2-letters=කස',
 'first2-letters=සන',
 'first3-letters=කසු',
 'first3-letters=සන්',
 'last-letter=්',
 'last-letter=ි',
 'last2-letters=නි',
 'last2-letters=ෂ්',
 'last3-letters=ුනි',
 'last3-letters=ුෂ්']

In [200]:
from sklearn.model_selection import train_test_split

In [201]:
#Features
#X
#Labels
#y = df_names.Gender

In [202]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33)

In [203]:
X_train

array([{'first-letter': 'ර', 'first2-letters': 'රශ', 'first3-letters': 'රශෙ', 'last-letter': 'ා', 'last2-letters': 'රා', 'last3-letters': 'ෙරා'},
       {'first-letter': 'ර', 'first2-letters': 'රො', 'first3-letters': 'රොශ', 'last-letter': 'ි', 'last2-letters': 'නි', 'last3-letters': '්නි'},
       {'first-letter': 'ස', 'first2-letters': 'සත', 'first3-letters': 'සත්', 'last-letter': 'ි', 'last2-letters': 'දි', 'last3-letters': 'ාදි'},
       ...,
       {'first-letter': 'ත', 'first2-letters': 'තේ', 'first3-letters': 'තේන', 'last-letter': 'ක', 'last2-letters': 'ුක', 'last3-letters': 'නුක'},
       {'first-letter': 'ම', 'first2-letters': 'මි', 'first3-letters': 'මිහ', 'last-letter': 'ය', 'last2-letters': 'ජය', 'last3-letters': 'ිජය'},
       {'first-letter': 'න', 'first2-letters': 'නි', 'first3-letters': 'නිර', 'last-letter': 'ා', 'last2-letters': 'කා', 'last3-letters': 'ිකා'}],
      dtype=object)

In [204]:
dv = DictVectorizer()
dv.fit_transform(X_train)

<1231x1409 sparse matrix of type '<class 'numpy.float64'>'
	with 7386 stored elements in Compressed Sparse Row format>

In [205]:
#Naive Bayes Classifier 
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
my_features = dv.transform(X_train)
clf.fit(my_features, y_train)


MultinomialNB()

In [181]:
#Accuracy of our model 
print("Accuracy of Model", clf.score(X_test, y_test)*100, '%')

TypeError: float() argument must be a string or a number, not 'dict'

In [62]:
#Accuracy of our model 
print("Accuracy of Model", clf.score(X_train, y_train)*100, '%')

Accuracy of Model 62.144597887896026 %


In [63]:
#sample prediction
sample_name = ['නෙතුනි']
vect = cv.transform(sample_name).toarray()

In [64]:
vect

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [65]:
clf.predict(vect)

array([1], dtype=int64)

In [66]:
#sample prediction2
sample_name1 = ['දිනේෂ්']
vect1 = cv.transform(sample_name1).toarray()

In [67]:
clf.predict(vect1)

array([1], dtype=int64)

In [68]:
def genderPredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print('Female')
    else:
        print('Male')


In [69]:
genderPredictor("රිද්මාලි")

Male


In [70]:
#Using a custom function for feature analysis
#By Analogy that most of the female names ends in 'A'  or  'I' or has the sound of 'A'
def features(name):
    return{
        'first-letter': name[0], #first letter
        'first2-letters': name[0:2], #first 2 letters
        'first3-letters': name[0:3], #first3 letters
        'last-letter': name[-1], #last letter
        'last2-letters': name[-2:], #last2 letters
        'last3-letters': name[-3:], #last3 letters
    }

In [71]:
#Vectorize the features function
features = np.vectorize(features)
print(features(['කසුනි', 'නවංජනා', 'සන්තුෂ්', 'භාතිය']))

[{'first-letter': 'ක', 'first2-letters': 'කස', 'first3-letters': 'කසු', 'last-letter': 'ි', 'last2-letters': 'නි', 'last3-letters': 'ුනි'}
 {'first-letter': 'න', 'first2-letters': 'නව', 'first3-letters': 'නවං', 'last-letter': 'ා', 'last2-letters': 'නා', 'last3-letters': 'ජනා'}
 {'first-letter': 'ස', 'first2-letters': 'සන', 'first3-letters': 'සන්', 'last-letter': '්', 'last2-letters': 'ෂ්', 'last3-letters': 'ුෂ්'}
 {'first-letter': 'භ', 'first2-letters': 'භා', 'first3-letters': 'භාත', 'last-letter': 'ය', 'last2-letters': 'ිය', 'last3-letters': 'තිය'}]


In [168]:
#Extract the features for the dataset
df_X = features(df_names['Name'])
df_y = df_names['Gender']

In [169]:
from sklearn.feature_extraction import DictVectorizer
corpus = features(['කසුනි','සන්තුෂ්'])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)

  (0, 0)	1.0
  (0, 2)	1.0
  (0, 4)	1.0
  (0, 7)	1.0
  (0, 8)	1.0
  (0, 10)	1.0
  (1, 1)	1.0
  (1, 3)	1.0
  (1, 5)	1.0
  (1, 6)	1.0
  (1, 9)	1.0
  (1, 11)	1.0


In [170]:
dv.get_feature_names()

['first-letter=ක',
 'first-letter=ස',
 'first2-letters=කස',
 'first2-letters=සන',
 'first3-letters=කසු',
 'first3-letters=සන්',
 'last-letter=්',
 'last-letter=ි',
 'last2-letters=නි',
 'last2-letters=ෂ්',
 'last3-letters=ුනි',
 'last3-letters=ුෂ්']

In [171]:
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X,df_y, test_size=0.33, random_state=42)

In [172]:
dfX_train

array([{'first-letter': 'න', 'first2-letters': 'නෙ', 'first3-letters': 'නෙස', 'last-letter': 'ි', 'last2-letters': 'දි', 'last3-letters': '්දි'},
       {'first-letter': 'බ', 'first2-letters': 'බි', 'first3-letters': 'බිල', 'last-letter': 'ා', 'last2-letters': 'කා', 'last3-letters': 'ංකා'},
       {'first-letter': 'න', 'first2-letters': 'නෂ', 'first3-letters': 'නෂ්', 'last-letter': 'ක', 'last2-letters': 'ික', 'last3-letters': 'මික'},
       ...,
       {'first-letter': 'ව', 'first2-letters': 'වි', 'first3-letters': 'විර', 'last-letter': 'ි', 'last2-letters': 'ගි', 'last3-letters': 'ංගි'},
       {'first-letter': 'ප', 'first2-letters': 'පව', 'first3-letters': 'පවර', 'last-letter': 'ර', 'last2-letters': 'වර', 'last3-letters': 'පවර'},
       {'first-letter': 'ද', 'first2-letters': 'දු', 'first3-letters': 'දුම', 'last-letter': 'ෂ', 'last2-letters': '්ෂ', 'last3-letters': 'ල්ෂ'}],
      dtype=object)

In [173]:
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<1231x1370 sparse matrix of type '<class 'numpy.float64'>'
	with 7386 stored elements in Compressed Sparse Row format>

In [174]:
#Model building Using Dictionaries
from sklearn.tree import DecisionTreeClassifier
dclf= DecisionTreeClassifier()
my_xfeatures = dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)

DecisionTreeClassifier()

In [175]:
DecisionTreeClassifier()

DecisionTreeClassifier()

In [176]:
#Build Features and Transform them
sample_name_eg = ['භාතිය']
transform_dv = dv.transform(features(sample_name_eg))

In [81]:
vect3 = transform_dv.toarray()

In [82]:
#Predicting Gender of name
#Male is 1, Female is 0
dclf.predict(vect3)

array([0], dtype=int64)

In [83]:
if dclf.predict(vect3) ==0:
    print('Female')
else:
    print('Male')

Female


In [45]:
#A function to do it - Watch 13.26
def genderPredictor1(a):
    test_name1 = [a]
    transform_dv = dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    if dclf.predict(vector) == 0:
        print('Female')
    else:
        print('Male')
    

In [46]:
random_name_list = ["ඇලෙක්ස්","ශලීෂා", "සුසිතා", "නිර්මාල්"]

In [47]:
for n in random_name_list:
    print (genderPredictor1(n))

Male
None
Male
None
Female
None
Male
None


In [48]:
##Accuracy of models descision tree Classifier works better than Naive bayes
#Accuracy on training set
print(dclf.score(dv.transform(dfX_train), dfy_train))

0.9967506092607636


In [49]:
#Accuracy on test set
print(dclf.score(dv.transform(dfX_test), dfy_test))

0.9472817133443163


In [50]:
#Saving our model
import joblib

In [51]:
decisiontreeModel = open("decisiontreemodel.pkl","wb")

In [52]:
joblib.dump(dclf, decisiontreeModel)

In [53]:
decisiontreeModel.close

<function BufferedWriter.close>

In [54]:
#Alternative model close
import pickle

In [55]:
dctreeModel = open("namesdetectoremodel.pkl","wb")
pickle.dump(dclf, dctreeModel)

In [56]:
dctreeModel.close()

In [57]:
NaiveBayesModel = open("naivebayesgendermodel.pkl","wb")

In [179]:
joblib.dump(clf, NaiveBayesModel)

In [180]:
NaiveBayesModel.close()