In [55]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

In [56]:
# Load the names dataset; the file is decoded as Windows-1252 (cp1252).
# NOTE(review): absolute, machine-specific Windows path — the notebook will not
# run elsewhere unless this points at a local copy of combine.csv.
df = pd.read_csv("C:/Bangkit/ML/Datasets/Names/combine.csv", encoding='cp1252')

In [57]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,name,gender
0,hafizhan shidqi,m
1,gandhi wibowo,m
2,aldio mahendra purwandrarto,m
3,benny putra,m
4,vicky vernando dasta,m


In [58]:
# Total number of cells in the frame (rows x columns), not the number of rows.
df.size

14652

In [59]:
# Column labels present in the loaded frame.
df.columns

Index(['name', 'gender'], dtype='object')

In [60]:
# Per-column dtypes as inferred by read_csv.
df.dtypes

name      object
gender    object
dtype: object

In [61]:
# Count missing values per column.
# A single isnull() is required: chaining it twice tests the boolean mask itself
# for nulls, which is never null, so the doubled form always reports 0.
df.isnull().sum()

name      0
gender    0
dtype: int64

In [62]:
# Number of female names
# len() counts rows; DataFrame.size counts rows x columns and doubles the figure.
len(df[df.gender == 'f'])

3433

In [63]:
# Number of male names
# len() counts rows; DataFrame.size counts rows x columns and doubles the figure.
len(df[df.gender == 'm'])

3893

In [64]:
# Bind a second name to the same DataFrame object (plain assignment, no copy is made).
df_names = df

In [65]:
# Encode the label column numerically: f -> 0, m -> 1.
# A new frame is derived from `df` instead of calling replace(..., inplace=True)
# on the `df_names.gender` accessor: pandas warns that chained in-place
# replacement stops working in 3.0. Deriving from `df` also leaves the raw
# labels untouched, so this cell gives the same result every time it is run.
# map() turns any label other than 'f'/'m' into NaN, and astype("int64") then
# fails loudly instead of letting an unexpected label slip through.
df_names = df.assign(gender=df["gender"].map({'f': 0, 'm': 1}).astype("int64"))


In [66]:
# Preview the frame after label encoding.
df_names.head()

Unnamed: 0,name,gender
0,hafizhan shidqi,1
1,gandhi wibowo,1
2,aldio mahendra purwandrarto,1
3,benny putra,1
4,vicky vernando dasta,1


In [67]:
# Distinct label values present in the gender column.
df_names.gender.unique()

array([1, 0], dtype=int64)

In [68]:
# Same distinct-value check, using bracket indexing instead of attribute access.
df_names['gender'].unique()

array([1, 0], dtype=int64)

In [69]:
# Per-column dtypes of the encoded frame.
df_names.dtypes

name      object
gender     int64
dtype: object

In [70]:
# Select the name column as the model input.
Xfeatures = df_names['name']

In [71]:
# Bag-of-words encoding of the names with a default CountVectorizer.
# astype('U') casts every value to a unicode string before vectorizing.
# NOTE(review): the vocabulary is fit on all names, before the train/test split
# made further down — confirm this is intended.
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures.values.astype('U'))

In [72]:
# (number of names, vocabulary size) of the sparse count matrix.
X.shape

(7326, 7366)

In [73]:
# Persist the fitted CountVectorizer to gender_vectorizer.pkl.
# The context manager flushes and closes the file even if joblib.dump raises.
# The name `gender_vectorizer` is kept because a later cell still calls close()
# on it, which is a harmless no-op on an already-closed file.
with open("gender_vectorizer.pkl", "wb") as gender_vectorizer:
    joblib.dump(cv, gender_vectorizer)

In [74]:
# Close the file object bound to gender_vectorizer.pkl.
gender_vectorizer.close()

In [75]:
# Vocabulary learned by the CountVectorizer (one entry per output column).
cv.get_feature_names_out()

array(['aam', 'aan', 'aang', ..., 'zuriel', 'zurmiati', 'zuwardi'],
      dtype=object)

In [76]:
# Target vector: the gender column of df_names.
y=df_names.gender

In [77]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [78]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the bag-of-words features,
# then display its mean accuracy on the held-out split.
clf = MultinomialNB().fit(X_train, y_train)
clf.score(X_test, y_test)

0.73806275579809

In [79]:
# Held-out accuracy as a percentage.
print(f"Accuracy of Model-> {clf.score(X_test, y_test) * 100} %")

Accuracy of Model-> 73.806275579809 %


In [80]:
# Training-set accuracy as a percentage, for comparison with the held-out figure.
print(f"Accuracy of Model-> {clf.score(X_train, y_train) * 100} %")

Accuracy of Model-> 98.15699658703072 %


In [81]:
# Prediction

In [82]:
# Spot-check a single name with the Naive Bayes model.
# CountVectorizer lower-cases input by default, so the capitalised sample is
# matched against the lower-case vocabulary.
sample_names = ["Suchitra"]
vect = cv.transform(sample_names).toarray()
clf.predict(vect)

array([1], dtype=int64)

In [83]:
# Second single-name spot-check.
# NOTE(review): the recorded prediction is 1 (male) — worth checking whether this
# name appears in the fitted vocabulary at all.
sample_names2 = ["Natasha"]
vect = cv.transform(sample_names2).toarray()
clf.predict(vect)

array([1], dtype=int64)

In [84]:
# Batch prediction over several sample names at once.
sample_names3 = ['Chandu','Suchitra','Nasha','Puja','Kabir','Joseph','Virat']
vect = cv.transform(sample_names3).toarray()
clf.predict(vect)

array([1, 1, 1, 0, 1, 1, 1], dtype=int64)

In [85]:
def gender_predictor(name):
    """Return "Female" or "Male" for a single name using the Naive Bayes model.

    The name is encoded with the notebook-level CountVectorizer ``cv`` and
    classified by ``clf``; a predicted label of 0 maps to "Female", any other
    label to "Male".
    """
    encoded = cv.transform([name]).toarray()
    return "Female" if clf.predict(encoded) == 0 else "Male"

In [86]:
# Print a readable prediction for every sample name.
for sample in sample_names3:
    print(sample, "->", gender_predictor(sample))

Chandu -> Male
Suchitra -> Male
Nasha -> Male
Puja -> Female
Kabir -> Male
Joseph -> Male
Virat -> Male


In [87]:
# Custom feature analysis
# Heuristic: many female names end in "a" or "i", or end with an "a" sound,
# so leading and trailing character n-grams are used as features.

def features(name):
    """Extract prefix and suffix character features from a name.

    Parameters
    ----------
    name : any
        The name to featurise. It is coerced with ``str()`` and lower-cased.

    Returns
    -------
    dict
        Six string features: the first 1, 2 and 3 characters and the last
        1, 2 and 3 characters of the lower-cased name.

    Notes
    -----
    Slices are used for every feature (rather than ``name[0]`` / ``name[-1]``),
    so an empty name yields empty-string features instead of raising
    ``IndexError``. Results for non-empty names are unchanged.
    """
    name = str(name).lower()
    return {
        'first-letter': name[:1],
        'first2-letter': name[:2],
        'first3-letter': name[:3],
        'last-letter': name[-1:],
        'last2-letter': name[-2:],
        'last3-letter': name[-3:],
    }


In [88]:
# Wrap `features` so it maps element-wise over a sequence of names.
# The isinstance guard keeps the cell idempotent: re-running it would otherwise
# wrap the already-vectorized function again. otypes=[object] states the result
# dtype up front, so NumPy does not need a probe call to infer it and an empty
# input is accepted.
if not isinstance(features, np.vectorize):
    features = np.vectorize(features, otypes=[object])
features(['Chandu','Suchitra','Nasha','Puja','Kabir','Joseph','Virat'])

array([{'first-letter': 'c', 'first2-letter': 'ch', 'first3-letter': 'cha', 'last-letter': 'u', 'last2-letter': 'du', 'last3-letter': 'ndu'},
       {'first-letter': 's', 'first2-letter': 'su', 'first3-letter': 'suc', 'last-letter': 'a', 'last2-letter': 'ra', 'last3-letter': 'tra'},
       {'first-letter': 'n', 'first2-letter': 'na', 'first3-letter': 'nas', 'last-letter': 'a', 'last2-letter': 'ha', 'last3-letter': 'sha'},
       {'first-letter': 'p', 'first2-letter': 'pu', 'first3-letter': 'puj', 'last-letter': 'a', 'last2-letter': 'ja', 'last3-letter': 'uja'},
       {'first-letter': 'k', 'first2-letter': 'ka', 'first3-letter': 'kab', 'last-letter': 'r', 'last2-letter': 'ir', 'last3-letter': 'bir'},
       {'first-letter': 'j', 'first2-letter': 'jo', 'first3-letter': 'jos', 'last-letter': 'h', 'last2-letter': 'ph', 'last3-letter': 'eph'},
       {'first-letter': 'v', 'first2-letter': 'vi', 'first3-letter': 'vir', 'last-letter': 't', 'last2-letter': 'at', 'last3-letter': 'rat'}],
      dtype=object)

In [89]:
# Character-feature dicts for every name in the dataset.
df_X = features(df_names['name'])

In [90]:
# Labels aligned with df_X.
df_y = df_names['gender']

In [91]:
# Small demonstration of DictVectorizer on two names: fit it on their feature
# dicts, then encode the same dicts as a sparse matrix.
corpus = features(["Chandu", "Julia"])
dv = DictVectorizer()
dv.fit(corpus)
# NOTE(review): `tranformed` is a misspelling of "transformed"; the spelling is
# kept because a later cell refers to this name.
tranformed  = dv.transform(corpus)

In [92]:
# Show the non-zero (row, column) entries of the demo encoding.
print(tranformed)

  (0, 0)	1.0
  (0, 2)	1.0
  (0, 4)	1.0
  (0, 7)	1.0
  (0, 8)	1.0
  (0, 11)	1.0
  (1, 1)	1.0
  (1, 3)	1.0
  (1, 5)	1.0
  (1, 6)	1.0
  (1, 9)	1.0
  (1, 10)	1.0


In [93]:
# Column names produced by the demo DictVectorizer ("<feature>=<value>").
dv.get_feature_names_out()

array(['first-letter=c', 'first-letter=j', 'first2-letter=ch',
       'first2-letter=ju', 'first3-letter=cha', 'first3-letter=jul',
       'last-letter=a', 'last-letter=u', 'last2-letter=du',
       'last2-letter=ia', 'last3-letter=lia', 'last3-letter=ndu'],
      dtype=object)

In [94]:
# 80/20 split of the character-feature dicts and labels, seeded for reproducibility.
dfX_train, dfX_test, dfy_train ,dfy_test = train_test_split(df_X, df_y, test_size = 0.2, random_state=42) 

In [95]:
# Inspect the training feature dicts.
dfX_train

array([{'first-letter': 'u', 'first2-letter': 'un', 'first3-letter': 'unn', 'last-letter': 'a', 'last2-letter': 'na', 'last3-letter': 'nna'},
       {'first-letter': 'w', 'first2-letter': 'wa', 'first3-letter': 'war', 'last-letter': 'h', 'last2-letter': 'ah', 'last3-letter': 'nah'},
       {'first-letter': 'm', 'first2-letter': 'mu', 'first3-letter': 'muh', 'last-letter': 'a', 'last2-letter': 'ra', 'last3-letter': 'tra'},
       ...,
       {'first-letter': 's', 'first2-letter': 'sr', 'first3-letter': 'sri', 'last-letter': 'i', 'last2-letter': 'ti', 'last3-letter': 'ati'},
       {'first-letter': 'u', 'first2-letter': 'uj', 'first3-letter': 'uja', 'last-letter': 'h', 'last2-letter': 'ah', 'last3-letter': 'yah'},
       {'first-letter': 's', 'first2-letter': 'si', 'first3-letter': 'sit', 'last-letter': 'i', 'last2-letter': 'wi', 'last3-letter': 'iwi'}],
      dtype=object)

In [96]:
# Rebind `dv` to a fresh DictVectorizer and fit it on the training feature dicts
# (this replaces the vectorizer fitted on the two-name demo corpus).
# The encoded matrix is only displayed here, not stored.
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<5860x2949 sparse matrix of type '<class 'numpy.float64'>'
	with 35160 stored elements in Compressed Sparse Row format>

In [97]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the encoded character features.
# random_state is pinned (same seed as the train/test splits) because
# scikit-learn permutes the candidate features at every split, so an unseeded
# tree can differ between runs even on identical data.
dclf = DecisionTreeClassifier(random_state=42)
x_features = dv.transform(dfX_train)
dclf.fit(x_features, dfy_train)

In [98]:
# Spot-check one name with the decision tree: featurise, encode with `dv`, predict.
sample_name_eg = ["Puja"]
transform_dv = dv.transform(features(sample_name_eg)).toarray()
dclf.predict(transform_dv)

array([0], dtype=int64)

In [99]:
# Classify one more example with the decision tree and print a readable label
# (0 -> "Female", otherwise "Male").
name_eg1 = ["Chioma"]
transform_dv_1 = dv.transform(features(name_eg1)).toarray()
vect_1 = dclf.predict(transform_dv_1)
print("Female" if vect_1 == 0 else "Male")

Female


In [100]:
# Open decisiontree.pkl for binary writing; the handle receives the pickled tree.
decistion_tree = open("decisiontree.pkl","wb")

In [101]:
# Serialise the fitted tree into the file opened in the previous cell, then
# close the handle. Without the close() the handle stays open for the life of
# the kernel and buffered bytes are not guaranteed to reach decisiontree.pkl.
try:
    joblib.dump(dclf, decistion_tree)
finally:
    decistion_tree.close()

In [102]:
import pickle

# Second copy of the decision tree, written with the standard-library pickle
# module. The context manager closes the file even if pickle.dump raises.
# NOTE(review): the DictVectorizer `dv` that encodes inputs for this model is not
# persisted in any of the cells visible here — confirm consumers can rebuild it.
with open("name_detector_model.pkl", "wb") as decistion_tree_01:
    pickle.dump(dclf, decistion_tree_01)

In [103]:
# Persist the Naive Bayes classifier to naivebayes.pkl.
# The context manager closes the file even if joblib.dump raises.
with open("naivebayes.pkl", "wb") as naive_bayes:
    joblib.dump(clf, naive_bayes)