## Gender Classification with Naive Bayes using First Names

In [32]:
# import all the required libraries
import numpy as np
import pandas as pd

In [33]:
# Read the dataset from a path relative to the notebook.
# The frame has two columns, Name and Target; later cells count Target == 0 as girls
# and Target == 1 as boys.
df = pd.read_csv("../assets/data/gender.csv")

In [34]:
# Summarize columns, dtypes and non-null counts of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1294 entries, 0 to 1293
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    1294 non-null   object
 1   Target  1294 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 20.3+ KB


In [35]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,Name,Target
0,Yash,1
1,Prit,1
2,Meet,1
3,Drashti,0
4,Saloni,0


In [79]:
# Split data into a train set and a test set: the last `test_size` fraction of rows is held out.
test_size = 0.1

num_rows = len(df)
train_rows = int(num_rows * (1-test_size))

# Positional split: the first `train_rows` rows train the model, the rest evaluate it.
# NOTE(review): rows are taken in file order with no shuffling - confirm the CSV is not
# ordered in a way that biases the held-out tail.
train_df, test_df = df.iloc[:train_rows], df.iloc[train_rows:]

for label, subset in (("Training set size:", train_df), ("Test set size:", test_df)):
    print(label, len(subset))

Training set size: 1164
Test set size: 130


In [80]:
# Class sizes in the training split (Target 0 is reported as girls, Target 1 as boys)
targets = train_df['Target']
female_count = int((targets == 0).sum())
male_count = int((targets == 1).sum())

print(f"Number of girls: {female_count}")
print(f"Number of boys: {male_count}")

Number of girls: 591
Number of boys: 573


In [81]:
# Names can generally be distinguished by the last character and bigrams. 
def create_dict(name, dir):
    """Add the features of one name to a feature-count dictionary.

    The features are the name's final character plus every pair of adjacent
    characters (bigram). Counts are accumulated in place.

    Args:
        name: The name to extract features from. An empty name has no
            features and leaves the counts untouched.
        dir: Dict mapping feature string -> occurrence count. It is updated
            in place. (The parameter name shadows the builtin ``dir``; it is
            kept so existing callers keep working.)

    Returns:
        The same ``dir`` object, after the update.
    """
    # An empty name has no last character; indexing name[-1] would raise IndexError.
    if not name:
        return dir

    # Last character
    last_char = name[-1]
    dir[last_char] = dir.get(last_char, 0) + 1

    # Bigrams: pair each character with its successor
    for first, second in zip(name, name[1:]):
        bigram = first + second
        dir[bigram] = dir.get(bigram, 0) + 1

    return dir


In [82]:
# Build the per-class feature counts from the train set

girls_dir = {}
boys_dir = {}

# Each lower-cased name contributes its last character and its bigrams (see create_dict)
# to the dictionary of its class: Target == 1 goes to boys, anything else to girls.
for raw_name, target in zip(train_df.iloc[:, 0], train_df.iloc[:, 1]):
    class_counts = boys_dir if target == 1 else girls_dir
    create_dict(raw_name.lower(), class_counts)  # updates class_counts in place

In [83]:
# Size of the feature vocabulary used in the smoothing denominator:
# 26 single letters (last character) plus 26 * 26 letter pairs (bigrams).
n_letters = 26
total_features = n_letters + n_letters ** 2
print(total_features)

702


In [89]:
# Total number of feature occurrences recorded for each class
total_boys_count, total_girls_count = (
    sum(counts.values()) for counts in (boys_dir, girls_dir)
)
print(total_boys_count)
print(total_girls_count)

3352
3411


In [90]:
def test_gender(name):
    name = name.lower()
    girls_prob = 0
    boys_prob = 0
    
    # Performing Laplace Smoothing for each parameter
    K = 1

    # Last Character
    girls_prob += np.log((girls_dir.get(name[-1], 0) + K)/(total_girls_count + K*total_features))
    boys_prob += np.log((boys_dir.get(name[-1], 0) + K)/(total_boys_count + K*total_features)) 

    # Bigrams
    for i in range(0, len(name)-1):
        key = name[i] + name[i+1]
        girls_prob += np.log((girls_dir.get(key, 0)+ K)/(total_girls_count + K*total_features))
        boys_prob += np.log((boys_dir.get(key, 0)+ K)/(total_boys_count + K*total_features))  

    if (girls_prob > boys_prob):
        return "girl", girls_prob, boys_prob
    else:
        return "boy", girls_prob, boys_prob

In [91]:
# Accuracy helper (works on any frame laid out as name column, then target column)
def get_accuracy(test):
    """Count misclassifications and build a 2x2 confusion matrix.

    Args:
        test: DataFrame whose first column holds names and whose second column
            holds the true label (0 = girl, 1 = boy).

    Returns:
        Tuple ``(miss, performance_matrix)``. ``performance_matrix[t][p]`` is the
        number of rows with true label ``t`` predicted as ``p`` (0 = girl,
        1 = boy); ``miss`` is the number of rows where the two differ.
    """
    predicted_column = {"girl": 0, "boy": 1}
    performance_matrix = [[0, 0], [0, 0]]
    miss = 0

    for name, gender in zip(test.iloc[:, 0], test.iloc[:, 1]):
        pred, _girls_prob, _boys_prob = test_gender(name)

        # Rows with a label or prediction outside the two known classes are not counted.
        if gender not in (0, 1) or pred not in predicted_column:
            continue

        row = int(gender)
        col = predicted_column[pred]
        performance_matrix[row][col] += 1
        if row != col:
            miss += 1

    return miss, performance_matrix

In [92]:
# Evaluate on the held-out test set
miss, performance_matrix = get_accuracy(test_df)
n_test = len(test_df)

print(miss, n_test)
print("test accuracy: ", 1 - miss / n_test)
print("Confusion Matrix:", performance_matrix)

15 130
test accuracy:  0.8846153846153846
Confusion Matrix: [[77, 11], [4, 38]]


In [None]:
# Testing with our own examples: print the prediction and both class scores for each name
for name in ("Raavan", "Mandodari"):
    gender, girls_prob, boys_prob = test_gender(name)
    print("I am sure " + name + " is a " + gender + ".")
    print(girls_prob)
    print(boys_prob)

I am sure Raavan is a boy.
-29.4727117399137
-26.656246203469784
I am sure Mandodari is a girl.
-46.64303289816588
-48.36583309977995
