## Gender Classification with Naive Bayes using First Names

In [1]:
# import all the required libraries
import numpy as np
import pandas as pd

In [2]:
# Read the dataset
df = pd.read_csv("../assets/data/gender.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296 entries, 0 to 1295
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    1296 non-null   object
 1   Target  1296 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 20.4+ KB


In [4]:
df.head()

Unnamed: 0,Name,Target
0,Yash,1
1,Prit,1
2,Meet,1
3,Drashti,0
4,Saloni,0


In [5]:
female_count = df[df['Target'] == 0].shape[0]
male_count = df[df['Target'] == 1].shape[0]  

print(f"Number of girls: {female_count}")
print(f"Number of boys: {male_count}")

Number of girls: 679
Number of boys: 617


In [6]:
# Split data in train and test set
test_size = 0.1

num_rows = len(df)

train_rows = int(num_rows * (1-test_size))

# Create separate DataFrames for each set
train_df = df.iloc[:train_rows]
test_df = df.iloc[train_rows:]

print("Training set size:", len(train_df))
print("Test set size:", len(test_df))

Training set size: 1166
Test set size: 130


In [7]:
# Creating probability dictionary from train set

girls_dir = {}
boys_dir = {}

# Names can generally be distinguished by the last character, use of vowels, bigrams and trigrams. Hence we consider features such as vowels, bigrams, trigrams etc.
for i in range(len(train_df)):
    name = train_df.iloc[i][0]
    gender = train_df.iloc[i][1]

    if gender == 0:
    # Last character
        key = name[-1]
        girls_dir[key] = girls_dir.get(key, 0) + 1

    # Bigrams
    for i in range(0, len(name)-1):
        key = name[i:i+2]
        girls_dir[key] = girls_dir.get(key, 0) + 1

    # # Trigrams
    # for i in range(0, len(name)-2):
    #     key = name[i:i+3]
    #     girls_dir[key] = girls_dir.get(key, 0) + 1
    
    # # Quadgrams
    # for i in range(0, len(name)-3):
    #     key = name[i:i+4]
    #     girls_dir[key] = girls_dir.get(key, 0) + 1
    
    else:
    # Last character
        key = name[-1]
        boys_dir[key] = boys_dir.get(key, 0) + 1

    # Bigrams
    for i in range(0, len(name)-1):
        key = name[i:i+2]
        boys_dir[key] = boys_dir.get(key, 0) + 1

    # # Trigrams
    # for i in range(0, len(name)-2):
    #     key = name[i:i+3]
    #     boys_dir[key] = boys_dir.get(key, 0) + 1
    
    # # Quadgrams
    # for i in range(0, len(name)-3):
    #     key = name[i:i+4]
    #     boys_dir[key] = boys_dir.get(key, 0) + 1

In [8]:
total_boys_count = sum(boys_dir.values())
print(total_boys_count)
total_girls_count = sum(girls_dir.values())
print(total_girls_count)

6778
6203


In [9]:
def test_gender(name):
    name = name.lower()
    girls_prob = 0
    boys_prob = 0
    
    
    # Last Character
    # performing laplace smoothing for each
    girls_prob += (girls_dir.get(name[-1], 0)+ 1)/(total_girls_count + len(girls_dir))
    boys_prob += (boys_dir.get(name[-1], 0)+ 1)/(total_boys_count + len(boys_dir))  

    # Bigrams
    for i in range(0, len(name)-1):
        key = name[i] + name[i+1]
        girls_prob += (girls_dir.get(key, 0)+ 1)/(total_girls_count + len(girls_dir))
        boys_prob += (boys_dir.get(key, 0)+ 1)/(total_boys_count + len(boys_dir))  
    
    # # Trigrams
    # for i in range(0, len(name)-2):
    #     key = name[i] + name[i+1] + name[i+2]
    #     girls_prob += (girls_dir.get(key, 0)+ 1)/(total_girls_count + len(girls_dir))
    #     boys_prob += (boys_dir.get(key, 0)+ 1)/(total_boys_count + len(boys_dir))  
    
    # # Quadgrams
    # for i in range(0, len(name)-3):
    #     key = name[i] + name[i+1] + name[i+2] + name[i+3]
    #     girls_prob += (girls_dir.get(key, 0)+ 1)/(total_girls_count + len(girls_dir))
    #     boys_prob += (boys_dir.get(key, 0)+ 1)/(total_boys_count + len(boys_dir))  

    if (girls_prob > boys_prob):
        return "girl"
    else:
        return "boy"

In [10]:
# Train Accuracy
def get_accuracy(test):
    miss = 0
    performance_matrix = [[0, 0], [0, 0]]

    for i in range(len(test)):
        name = test.iloc[i][0]
        gender = test.iloc[i][1]

        pred = test_gender(name)
        if (gender == 0 and pred == "girl"):
            performance_matrix[0][0] += 1
        elif (gender == 0 and pred == "boy"):
            performance_matrix[0][1] += 1
            miss += 1
        elif (gender == 1 and pred == "girl"):
            performance_matrix[1][0] += 1
            miss += 1
        elif (gender == 1 and pred == "boy"):
            performance_matrix[1][1] += 1

    return miss, performance_matrix

In [11]:
miss, performance_matrix = get_accuracy(test_df)

print(miss, len(test_df))
print("test accuracy: ", 1 - miss/(len(test_df)))
print("Confusion Matrix:", performance_matrix)

23 130
test accuracy:  0.823076923076923
Confusion Matrix: [[75, 13], [10, 32]]


In [14]:
# Testing with our own example

name = "Raavan"
gender = test_gender(name)
print("I am sure " + name + " is a " + gender + ".")

name = "Mandodari"
gender = test_gender(name)
print("I am sure " + name + " is a " + gender + ".")

I am sure Raavan is a boy.
I am sure Mandodari is a girl.
