In [5]:
import numpy as np
import pandas as pd

In [6]:
#Machine Learning Packages used for feature extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
#Loading the data from a csv file to a dataframe
df = pd.read_csv('https://raw.githubusercontent.com/NiharaDeSilva/BA3006-Text-analytics/main/Assignment%201_BA3006/19880057/names_dataset_sinhala.csv')

In [8]:
#Print the top 5 records
df.head()

Unnamed: 0,Name,Gender
0,අචින්ත්‍යා,Female
1,අදිශානි,Female
2,අදිති,Female
3,අග්රානි,Female
4,අහංසා,Female


In [9]:
df.size

3676

In [10]:
#Checking for column name consistency
df.columns

Index(['Name', 'Gender'], dtype='object')

In [11]:
#Data Type
df.dtypes

Name      object
Gender    object
dtype: object

In [12]:
#Size of female names
df[df.Gender == 'Female'].size

1802

In [13]:
#Size of male names
df[df.Gender == 'Male'].size

1874

In [14]:
df_names = df

In [15]:
df_names.Gender.replace({'Female':0, 'Male':1}, inplace=True)

In [16]:
df_names.Gender.unique()

array([0, 1])

In [17]:
df_names.dtypes

Name      object
Gender     int64
dtype: object

In [18]:
Xfeatures = df_names['Name']

In [19]:
#Feature Extraction using CountVectorizer
cv = CountVectorizer(encoding='utf-8')
X = cv.fit_transform(Xfeatures)

In [55]:
#Print the features
cv.get_feature_names_out()

array(['අක', 'අග', 'අගත', 'අගශ', 'අච', 'අත', 'අද', 'අන', 'අනග', 'අනන',
       'අප', 'අබ', 'අභ', 'අම', 'අමන', 'අමය', 'අමල', 'අය', 'අයන', 'අර',
       'අරයන', 'අරල', 'අශ', 'අස', 'අහ', 'අහර', 'අහස', 'ආකර', 'ආදම', 'ආන',
       'ආය', 'ආරද', 'ආල', 'ආශ', 'ඇන', 'ඉත', 'ඉන', 'ඉඳ', 'ඉම', 'ඉමන',
       'ඉමය', 'ඉර', 'ඉරෂ', 'ඉල', 'ඉෂ', 'ඉස', 'ඉසල', 'ඊඩන', 'ඊතන', 'උත',
       'උද', 'උප', 'උම', 'උව', 'උවන', 'උෂ', 'ඌව', 'එත', 'එන', 'එම', 'එමල',
       'එර', 'එරන', 'එෂ', 'එෂල', 'එසන', 'එසඳ', 'ඒන', 'ඔක', 'ඔකඳ', 'ඔන',
       'ඔම', 'ඔමල', 'ඔල', 'ඔව', 'ඔශ', 'ඔෂ', 'ඔස', 'ඔසන', 'ඕව', 'ඕෂ',
       'ඕෂද', 'කන', 'කය', 'කල', 'කව', 'ගග', 'ගගන', 'ගද', 'ගන', 'ගය',
       'ගයත', 'ගර', 'ගව', 'චක', 'චත', 'චන', 'චම', 'චමත', 'චමද', 'චමල',
       'ජන', 'ජය', 'ජයන', 'ජයම', 'ජයව', 'ජල', 'ජස', 'ටන', 'ඩන', 'ඩය',
       'ණද', 'තක', 'තජ', 'තත', 'තන', 'තනක', 'තනව', 'තම', 'තමල', 'තමෂ',
       'තර', 'තව', 'තශ', 'තෂ', 'තස', 'දක', 'දන', 'දනර', 'දම', 'දර', 'දස',
       'දහම', 'ධන', 'නක', 'නජ', 'නත', 'නතන', 'නද', 'නන', 'නම', 'නය', 'නර

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
#Features
X
#Labels
y = df_names.Gender

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state =42)

In [24]:
#1st Classifier - Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
#instantiate a Multinomial Naive Bayes model
clf = MultinomialNB()
#train the model
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.5123558484349259

In [25]:
#Accuracy of the model for test data
print("Accuracy of Model", clf.score(X_test, y_test)*100, '%')

Accuracy of Model 51.23558484349259 %


In [26]:
#Accuracy of the model for training data
print("Accuracy of Model", clf.score(X_train, y_train)*100, '%')

Accuracy of Model 64.74411047928513 %


In [27]:
#sample prediction 1
sample_name = ['නෙතුනි']
vect = cv.transform(sample_name).toarray()
vect

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [28]:
clf.predict(vect)

array([0])

In [29]:
#sample prediction2
sample_name1 = ['දිනේෂ්']
vect1 = cv.transform(sample_name1).toarray()
vect1

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [30]:
def genderPredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print('Female')
    else:
        print('Male')


In [31]:
genderPredictor("රිද්මාලි")

Female


In [33]:
name_list = ["ඇලෙක්ස්","ශලීෂා", "සුසිතා", "නිර්මාල්"]
for n in name_list:
    print (genderPredictor(n))

Female
None
Female
None
Female
None
Female
None


In [34]:
#Using a custom function for feature analysis
#By Analogy that most of the female names ends in 'ආ'' sound or 'ඉ' sound
def features(name):
    name=name.lower()
    return{
        'first-letter': name[0], #first letter
        'first2-letters': name[0:2], #first 2 letters
        'first3-letters': name[0:3], #first3 letters
        'last-letter': name[-1], #last letter
        'last2-letters': name[-2:], #last2 letters
        'last3-letters': name[-3:], #last3 letters
    }

In [35]:
#Vectorize the features function
features = np.vectorize(features)
print(features(['කසුනි', 'නවංජනා', 'සන්තුෂ්', 'භාතිය']))

[{'first-letter': 'ක', 'first2-letters': 'කස', 'first3-letters': 'කසු', 'last-letter': 'ි', 'last2-letters': 'නි', 'last3-letters': 'ුනි'}
 {'first-letter': 'න', 'first2-letters': 'නව', 'first3-letters': 'නවං', 'last-letter': 'ා', 'last2-letters': 'නා', 'last3-letters': 'ජනා'}
 {'first-letter': 'ස', 'first2-letters': 'සන', 'first3-letters': 'සන්', 'last-letter': '්', 'last2-letters': 'ෂ්', 'last3-letters': 'ුෂ්'}
 {'first-letter': 'භ', 'first2-letters': 'භා', 'first3-letters': 'භාත', 'last-letter': 'ය', 'last2-letters': 'ිය', 'last3-letters': 'තිය'}]


In [36]:
#Extract the features for the dataset
df_X = features(df_names['Name'])
df_y = df_names['Gender']

In [37]:
from sklearn.feature_extraction import DictVectorizer
corpus = features(['කසුනි','සන්තුෂ්'])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)

  (0, 0)	1.0
  (0, 2)	1.0
  (0, 4)	1.0
  (0, 7)	1.0
  (0, 8)	1.0
  (0, 10)	1.0
  (1, 1)	1.0
  (1, 3)	1.0
  (1, 5)	1.0
  (1, 6)	1.0
  (1, 9)	1.0
  (1, 11)	1.0


In [56]:
dv.get_feature_names_out()

array(['first-letter=අ', 'first-letter=ආ', 'first-letter=ඉ', ...,
       'last3-letters=\u200dයා', 'last3-letters=\u200dරා',
       'last3-letters=\u200dරි'], dtype=object)

In [39]:
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X,df_y, test_size=0.33, random_state=42)

In [40]:
dfX_train

array([{'first-letter': 'න', 'first2-letters': 'නෙ', 'first3-letters': 'නෙස', 'last-letter': 'ි', 'last2-letters': 'දි', 'last3-letters': '්දි'},
       {'first-letter': 'බ', 'first2-letters': 'බි', 'first3-letters': 'බිල', 'last-letter': 'ා', 'last2-letters': 'කා', 'last3-letters': 'ංකා'},
       {'first-letter': 'න', 'first2-letters': 'නෂ', 'first3-letters': 'නෂ්', 'last-letter': 'ක', 'last2-letters': 'ික', 'last3-letters': 'මික'},
       ...,
       {'first-letter': 'ව', 'first2-letters': 'වි', 'first3-letters': 'විර', 'last-letter': 'ි', 'last2-letters': 'ගි', 'last3-letters': 'ංගි'},
       {'first-letter': 'ප', 'first2-letters': 'පව', 'first3-letters': 'පවර', 'last-letter': 'ර', 'last2-letters': 'වර', 'last3-letters': 'පවර'},
       {'first-letter': 'ද', 'first2-letters': 'දු', 'first3-letters': 'දුම', 'last-letter': 'ෂ', 'last2-letters': '්ෂ', 'last3-letters': 'ල්ෂ'}],
      dtype=object)

In [41]:
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<1231x1370 sparse matrix of type '<class 'numpy.float64'>'
	with 7386 stored elements in Compressed Sparse Row format>

In [42]:
#Model building Using Dictionaries
#2nd Classifier - DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
#instantiate a Decision Tree Classifier model
dclf= DecisionTreeClassifier()
my_xfeatures = dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)

In [43]:
DecisionTreeClassifier()

In [44]:
#Build Features and Transform them
sample_name_eg = ['භාතිය']
transform_dv = dv.transform(features(sample_name_eg))
vect3 = transform_dv.toarray()

In [45]:
#Predicting Gender of name
#Male is 1, Female is 0
dclf.predict(vect3)
if dclf.predict(vect3) ==0:
    print('Female')
else:
    print('Male')

Female


In [46]:
#A function to predict gender
def genderPredictorDT(a):
    test_name1 = [a]
    transform_dv = dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    if dclf.predict(vector) == 0:
        print('Female')
    else:
        print('Male')


In [47]:
random_name_list = ["ඇලෙක්ස්","ශලීෂා", "සුසිතා", "නිර්මාල්"]

In [48]:
for n in random_name_list:
    print (genderPredictorDT(n))

Male
None
Male
None
Female
None
Male
None


In [49]:
#Accuracy on training set
print("Accuracy of Model", dclf.score(dv.transform(dfX_train), dfy_train))

Accuracy of Model 0.9967506092607636


In [50]:
#Accuracy on test set
print("Accuracy of Model", dclf.score(dv.transform(dfX_test), dfy_test))

Accuracy of Model 0.942339373970346


In [51]:
##Accuracy of models descision tree Classifier works better than Naive bayes

In [52]:
#Saving our model
import joblib

In [53]:
decisiontreeModel = open("decisiontreemodel.pkl","wb")
joblib.dump(dclf, decisiontreeModel)
decisiontreeModel.close()

In [54]:
NaiveBayesModel = open("naivebayesgendermodel.pkl","wb")
joblib.dump(clf, NaiveBayesModel)
NaiveBayesModel.close()