# Understanding Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

In [None]:
df = pd.read_csv("/content/Names_dataset.csv")

In [None]:
df.head()

Unnamed: 0,name,gender
0,alfiya,f
1,ardwin,m
2,henryka,f
3,preeti,f
4,jamaro,m


In [None]:
df.size

250462

In [None]:
df.columns

Index(['name', 'gender'], dtype='object')

In [None]:
df.dtypes

name      object
gender    object
dtype: object

# Data Preprocessing

In [None]:
# Checking for missing value
df.isnull().isnull().sum()

name      0
gender    0
dtype: int64

In [None]:
# Number of female names
df[df['gender']=='f'].size

151942

In [None]:
# Number of male names
df[df['gender']=='m'].size

98520

In [None]:
# Copying a new dataset
df_names = df

# One-Hot Encoding

In [None]:
# Replacing all f to 0 and m to 1
# Numerical data
df_names['gender'].replace({'f':0, 'm':1}, inplace=True)

In [None]:
df_names.head()

Unnamed: 0,name,gender
0,alfiya,0
1,ardwin,1
2,henryka,0
3,preeti,0
4,jamaro,1


In [None]:
df_names['gender'].unique()

array([0, 1])

In [None]:
df_names.dtypes

name      object
gender     int64
dtype: object

# Data Preprocessing (Vectorization)

In [None]:
# Change Categorical to Numerical format (Vector/Matrix form)
Xfeatures = df_names['name']

In [None]:
cv = CountVectorizer() #Object
X = cv.fit_transform(Xfeatures.values.astype('U'))

In [None]:
X

<125231x101785 sparse matrix of type '<class 'numpy.int64'>'
	with 141358 stored elements in Compressed Sparse Row format>

In [None]:
X.shape

(125231, 101785)

In [None]:
cv.get_feature_names_out()

array(['aaban', 'aabha', 'aabid', ..., 'सर', 'सलम', 'हन'], dtype=object)

**Data Splitting**

In [None]:
y = df_names['gender']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Naive Bayse Classifier

In [None]:
# Import Naive Bayes
from sklearn.naive_bayes import MultinomialNB

In [None]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

0.9913758684021401

In [None]:
clf.score(X_test, y_test)

0.7104643270651175

In [None]:
print("Accurary of Model(Training) --> {} %".format(clf.score(X_train, y_train)*100))
print("Accurary of Model(Testing) --> {} %".format(clf.score(X_test, y_test)*100))

Accurary of Model(Training) --> 99.13758684021401 %
Accurary of Model(Testing) --> 71.04643270651175 %


\# Prediction

In [None]:
sample_names = ["Sarah"]
vect_sample = cv.transform(sample_names).toarray()
clf.predict(vect_sample)

array([0])

In [None]:
sample_names2 = ["Johnson"]
vect_sample = cv.transform(sample_names2).toarray()
clf.predict(vect_sample)

array([1])

In [None]:
sample_names3 = ["Natasha"]
vect_sample = cv.transform(sample_names3).toarray()
clf.predict(vect_sample)

array([0])

In [None]:
sample_names4 = ["Lisa", "Luiz", "Jose", "Rose", "Sam", "Chandu"]
vect_sample = cv.transform(sample_names4).toarray()
clf.predict(vect_sample)

array([0, 1, 1, 0, 1, 1])

# Build a function

In [None]:
# Building a function
def gender_predictor(name):
  test_name = [name]
  vector = cv.transform(test_name).toarray()
  if clf.predict(vector) == 0:
    return "Female"
  else:
    return "Male"

In [None]:
gender_predictor("Maya")

'Female'

# Deploy

In [None]:
for i in sample_names4:
  print(i, "-->", gender_predictor(i))

Lisa --> Female
Luiz --> Male
Jose --> Male
Rose --> Female
Sam --> Male
Chandu --> Male
