<a href="https://colab.research.google.com/github/VATheOld/GenderPredictByName/blob/main/ProjectAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Machine Learning package

In [6]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Enter Vietnamese or english name:

In [4]:
choice=input("Enter 1 for input Vietnamese name, 2 for English name:")
if choice=="1":
    name=input("Enter Vietnamese name:")
    name=name.title()
    componentName=name.split(" ")
    middleName=componentName[1:len(componentName)-1]
    firstName=componentName[len(componentName)-1:]
    name=middleName+firstName
    inputname=" ".join(name)
    print("inname:",inputname)
elif choice=="2":
    name=input("Enter English name:")
    componentName=name.split(" ")
    print(componentName)
    for component in range(len(componentName)):
        firstName=componentName[0]
    print(firstName)

Enter 1 for input Vietnamese name, 2 for English name:1
Enter Vietnamese name:Trần Thị Việt Cung
inname: Thị Việt Cung


Import datasets:

**Eng name code**

Get column name and gender and change M to 0, F to 1

In [8]:
co_l=["Name","Gender"]
engNamelist=pd.read_csv('/content/drive/My Drive/AI_Project/Eng_name_gender_dataset.csv',usecols=co_l)
engNamelist.Gender.replace({"M":0,"F":1},inplace=True)
print(engNamelist)

             Name  Gender
0           James       0
1            John       0
2          Robert       0
3         Michael       0
4         William       0
...           ...     ...
528249   Zyleigh        1
528250     Zylie        1
528251    Zylpha        1
528252       Zyna       1
528253  Zyra-Mae        1

[528254 rows x 2 columns]


In [9]:
def features(name):
    name = name.lower()
    return {
        'first-letter': name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter': name[-1],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
    }

In [10]:
features = np.vectorize(features)
print(features(["Anna", "Hannah", "Peter","John","Vladmir","Mohammed"]))

[{'first-letter': 'a', 'first2-letters': 'an', 'first3-letters': 'ann', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna'}
 {'first-letter': 'h', 'first2-letters': 'ha', 'first3-letters': 'han', 'last-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah'}
 {'first-letter': 'p', 'first2-letters': 'pe', 'first3-letters': 'pet', 'last-letter': 'r', 'last2-letters': 'er', 'last3-letters': 'ter'}
 {'first-letter': 'j', 'first2-letters': 'jo', 'first3-letters': 'joh', 'last-letter': 'n', 'last2-letters': 'hn', 'last3-letters': 'ohn'}
 {'first-letter': 'v', 'first2-letters': 'vl', 'first3-letters': 'vla', 'last-letter': 'r', 'last2-letters': 'ir', 'last3-letters': 'mir'}
 {'first-letter': 'm', 'first2-letters': 'mo', 'first3-letters': 'moh', 'last-letter': 'd', 'last2-letters': 'ed', 'last3-letters': 'med'}]


In [11]:
# Extract the features for the dataset
df_X = features(engNamelist['Name'])
print(df_X)

[{'first-letter': 'j', 'first2-letters': 'ja', 'first3-letters': 'jam', 'last-letter': 's', 'last2-letters': 'es', 'last3-letters': 'mes'}
 {'first-letter': 'j', 'first2-letters': 'jo', 'first3-letters': 'joh', 'last-letter': 'n', 'last2-letters': 'hn', 'last3-letters': 'ohn'}
 {'first-letter': 'r', 'first2-letters': 'ro', 'first3-letters': 'rob', 'last-letter': 't', 'last2-letters': 'rt', 'last3-letters': 'ert'}
 ...
 {'first-letter': 'z', 'first2-letters': 'zy', 'first3-letters': 'zyl', 'last-letter': ' ', 'last2-letters': 'a ', 'last3-letters': 'ha '}
 {'first-letter': 'z', 'first2-letters': 'zy', 'first3-letters': 'zyn', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'yna'}
 {'first-letter': 'z', 'first2-letters': 'zy', 'first3-letters': 'zyr', 'last-letter': ' ', 'last2-letters': 'e ', 'last3-letters': 'ae '}]


In [12]:
df_y = engNamelist['Gender']

In [13]:
from sklearn.feature_extraction import DictVectorizer
 
corpus = features(["Mike", "Julia"])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)

  (0, 1)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (0, 7)	1.0
  (0, 9)	1.0
  (0, 10)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (1, 6)	1.0
  (1, 8)	1.0
  (1, 11)	1.0


In [14]:
dv.get_feature_names()



['first-letter=j',
 'first-letter=m',
 'first2-letters=ju',
 'first2-letters=mi',
 'first3-letters=jul',
 'first3-letters=mik',
 'last-letter=a',
 'last-letter=e',
 'last2-letters=ia',
 'last2-letters=ke',
 'last3-letters=ike',
 'last3-letters=lia']

In [15]:
# Train Test Split
from sklearn.model_selection import train_test_split
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X, df_y, test_size=0.33, random_state=42)
print(dfX_train.shape, dfX_test.shape, dfy_train.shape, dfy_test.shape)

(353930,) (174324,) (353930,) (174324,)


In [16]:
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<353930x13720 sparse matrix of type '<class 'numpy.float64'>'
	with 2123580 stored elements in Compressed Sparse Row format>

In [17]:
# Model building Using DecisionTree

from sklearn.tree import DecisionTreeClassifier
 
dclf = DecisionTreeClassifier()
my_xfeatures =dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)
test_xfeatures =dv.transform(dfX_test)
y_pred=dclf.predict(test_xfeatures)
print(y_pred)

[0 1 1 ... 1 1 1]


In [18]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
print(confusion_matrix(dfy_test, y_pred))
print(classification_report(dfy_test, y_pred))
print('Accuracy:', str(accuracy_score(dfy_test, y_pred)))
print('Accuracy:', str(accuracy_score(y_pred,dfy_test)))

[[61298 12275]
 [14706 86045]]
              precision    recall  f1-score   support

           0       0.81      0.83      0.82     73573
           1       0.88      0.85      0.86    100751

    accuracy                           0.85    174324
   macro avg       0.84      0.84      0.84    174324
weighted avg       0.85      0.85      0.85    174324

Accuracy: 0.8452249833643102
Accuracy: 0.8452249833643102


In [19]:
# Accuracy on training set
print(dclf.score(dv.transform(dfX_train), dfy_train)) 

0.9197185884214393


In [20]:
# Accuracy on test set
print(dclf.score(dv.transform(dfX_test), dfy_test))

0.8452249833643102


In [21]:
sample_name3 = ["Susan","Emily","Hans","John","Marco","Sayla","Mamon"]

In [22]:
# Fuction to predict using Decision Tree Model
def engNameGenderPredict(name):
    n=[name]
    vector=dv.transform(features(n)).toarray()
    result=dclf.predict(vector)
    if result == 1:
        print("Female")
    elif result == 0:
        print("Male")

In [23]:
for n in sample_name3:
     engNameGenderPredict(n)

Female
Female
Male
Male
Male
Female
Female


In [None]:
#Test input
engName=firstName
print(engName)
engNameGenderPredict(engName)

Jeanne
Female


In [None]:
# save model
import pickle
dctreeModel = open("DecisionTree_Model.pkl","wb")
pickle.dump(dclf,dctreeModel)
dctreeModel.close()

In [None]:
# load the model

import pickle
pickled_model = pickle.load(open('/content/drive/My Drive/AI_Project/DecisionTree_Model.pkl', 'rb'))
pickled_model.predict(dv.transform(dfX_test))
result = pickled_model.score(dv.transform(dfX_test),dfy_test)
print(result)

0.84528234781212


**Vietnam Name code**

Get Vietnamese dataset and set Male as 0, Female as 1

In [24]:
col=['Name','Gender']
VietNamelist=pd.read_csv('/content/drive/My Drive/AI_Project/Vietnamese_Name_gender.csv',usecols=col)
VietNamelist.Gender.replace({"M":0,"F":1},inplace=True)
print(VietNamelist)

               Name Gender
0        phương anh      1
1        phương mai      1
2        phương nam      0
3        quang minh      0
4        quang vinh      0
...             ...    ...
75616       Tiến Sỹ      0
75617     Thị Huyền      1
75618  Keaut Vannet      0
75619       Thị Huê      1
75620      Xuân Đại      0

[75621 rows x 2 columns]


In [25]:
VietNamelist['Name']=VietNamelist['Name'].str.lower()
print(VietNamelist['Name'])

0          phương anh
1          phương mai
2          phương nam
3          quang minh
4          quang vinh
             ...     
75616         tiến sỹ
75617       thị huyền
75618    keaut vannet
75619         thị huê
75620        xuân đại
Name: Name, Length: 75621, dtype: object


In [31]:
Xfeature=VietNamelist['Name']
cv=CountVectorizer()
X=cv.fit_transform(VietNamelist['Name'].values.astype(str))

In [27]:
cv.get_feature_names()



['adel',
 'ai',
 'an',
 'anh',
 'anna',
 'ao',
 'arăn',
 'as',
 'asia',
 'ba',
 'bac',
 'bach',
 'ban',
 'bane',
 'bang',
 'bao',
 'bas',
 'bau',
 'bay',
 'be',
 'ben',
 'benl',
 'bi',
 'bia',
 'bin',
 'binh',
 'biên',
 'biết',
 'biền',
 'biển',
 'biểu',
 'biện',
 'bla',
 'blanh',
 'bli',
 'bling',
 'blong',
 'bo',
 'bon',
 'bonl',
 'bou',
 'bovănhương',
 'boy',
 'brah',
 'brin',
 'brinh',
 'brus',
 'brèm',
 'brê',
 'brìn',
 'brìng',
 'brích',
 'brơ',
 'brưi',
 'brưu',
 'brậm',
 'brị',
 'brịp',
 'brịu',
 'brổih',
 'brớs',
 'brởs',
 'bu',
 'bune',
 'bunthămmạ',
 'buôi',
 'buôl',
 'buồn',
 'by',
 'byai',
 'byali',
 'byil',
 'byã',
 'bài',
 'bàng',
 'bào',
 'bày',
 'bá',
 'bách',
 'bái',
 'báo',
 'bás',
 'bát',
 'báte',
 'báu',
 'bân',
 'bâng',
 'bâu',
 'bây',
 'bão',
 'bèo',
 'bé',
 'bê',
 'bên',
 'bìn',
 'bình',
 'bí',
 'bích',
 'bính',
 'bòi',
 'bô',
 'bôi',
 'bôn',
 'bông',
 'bùi',
 'búp',
 'bă',
 'băn',
 'băng',
 'bůng',
 'bơ',
 'bơs',
 'bưu',
 'bươl',
 'bươne',
 'bước',
 'bường',
 '

In [30]:
Y=VietNamelist.Gender.values.astype(str)

Split dataset into 70% training, 30% test

In [32]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.30,random_state=42)
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(52934, 2717) (22687, 2717) (52934,) (22687,)


Using SVM to train data

In [33]:
from sklearn.svm import SVC
svm=SVC()
svm.fit(X_train,Y_train)
svm.score(X_test,Y_test)
y_pred=svm.predict(X_test)

Check accuracy

In [34]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))
print('Accuracy:', str(accuracy_score(Y_test, y_pred)))

[[11254   385]
 [  625 10423]]
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     11639
           1       0.96      0.94      0.95     11048

    accuracy                           0.96     22687
   macro avg       0.96      0.96      0.96     22687
weighted avg       0.96      0.96      0.96     22687

Accuracy: 0.9554811125314057


In [35]:
print("Accuracy of Model on test set",svm.score(X_test,Y_test)*100,"%")
print("Accuracy of Model on train set",svm.score(X_train,Y_train)*100,"%")

Accuracy of Model on test set 95.54811125314056 %
Accuracy of Model on train set 96.86401934484452 %


try sample

In [36]:
test=[inputname]
vect=cv.transform(test).toarray()
result=svm.predict(vect)
print(inputname)
if result=="0":
  print("Male")
elif result=="1": 
  print("Female")

Thị Việt Cung
Female


In [37]:
sample_name3 = ["Hà Nguyễn","Văn Nam","Trà My","Việt Anh","Trung Hiếu","Phương Hà","Phương Mi","Nhật Trường","Phương Vy","Đại nam","vi en","Hòa Tiến","Mai Anh"]
vect3 = cv.transform(sample_name3).toarray()

In [38]:
result=svm.predict(vect3)
print(result)
for n in result:
  if n == "1":
        print("Female")
  elif n=="0":
        print("Male")

['1' '0' '1' '0' '0' '1' '1' '0' '1' '0' '1' '0' '1']
Female
Male
Female
Male
Male
Female
Female
Male
Female
Male
Female
Male
Female


save model SVM

In [None]:
# save the model to drive
import pickle
SVM_Model=open("SVM_model.pkl","wb")
pickle.dump(svm,SVM_Model)
SVM_Model.close()

Load model SVM

In [None]:
# load the model 
import pickle
pickled_model = pickle.load(open('/content/drive/My Drive/AI_Project/SVM_model.pkl', 'rb'))
pickled_model.predict(X_test)
result = pickled_model.score(X_test, Y_test)
print(result)

0.9554811125314057
