In [1]:
# import all the required libraries
import numpy as np
import pandas as pd
from Levenshtein import distance

In [2]:
# Load the names dataset from its CSV file into a DataFrame
dataset_path = "../assets/data/gender.csv"
df = pd.read_csv(dataset_path)

In [3]:
# Summarise the frame: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296 entries, 0 to 1295
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    1296 non-null   object
 1   Target  1296 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 20.4+ KB


In [4]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,Name,Target
0,Yash,1
1,Prit,1
2,Meet,1
3,Drashti,0
4,Saloni,0


In [5]:
# Class balance: count the rows for each Target value
# (Target == 0 is reported as girls, Target == 1 as boys).
target_column = df['Target']
female_count = len(df[target_column == 0])
male_count = len(df[target_column == 1])

print(f"Number of girls: {female_count}")
print(f"Number of boys: {male_count}")

Number of girls: 679
Number of boys: 617


In [6]:
# Partition the rows into train / validation / test subsets.
# Rows are taken in file order; no shuffling is applied here.
train_size = 0.8
test_size = 0.1
val_size = 0.1

num_rows = len(df)

# Train and validation counts are truncated to whole rows. The test subset
# receives every remaining row, so test_size is not used in the arithmetic.
train_rows = int(num_rows * train_size)
val_rows = int(num_rows * val_size)
test_rows = num_rows - train_rows - val_rows

# Slice boundaries: [0, val_start) -> train, [val_start, test_start) -> val,
# [test_start, end) -> test
val_start = train_rows
test_start = train_rows + val_rows
train_df = df.iloc[:val_start]
val_df = df.iloc[val_start:test_start]
test_df = df.iloc[test_start:]

for split_label, split_df in (
    ("Training", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    print(f"{split_label} set size:", len(split_df))


Training set size: 1036
Validation set size: 129
Test set size: 131


In [7]:
def get_accuracy(train_df, test_df, k):
    """Score a k-nearest-neighbour classifier that compares names by edit distance.

    For every row of ``test_df`` the edit distance to every name in
    ``train_df`` is computed with ``distance``; the labels of the ``k``
    closest training names are put to a majority vote and the winning label
    is compared with the row's own label.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Reference data. The first column is read as the name and the second
        column as the label (columns are addressed by position).
    test_df : pandas.DataFrame
        Data to classify, with the same two-column layout.
    k : int
        Number of nearest neighbours that vote. Must be at least 1.

    Returns
    -------
    float
        Fraction of ``test_df`` rows whose predicted label equals the stored
        label, or NaN when ``test_df`` has no rows.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``train_df`` has no rows.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Extract both columns once, by position. Indexing a row Series with
    # ``row[0]`` relies on the deprecated positional fallback of
    # Series.__getitem__ and also rebuilds a Series on every access.
    train_names = train_df.iloc[:, 0].tolist()
    train_labels = train_df.iloc[:, 1].tolist()
    if not train_names:
        raise ValueError("train_df must contain at least one row")

    test_names = test_df.iloc[:, 0].tolist()
    test_labels = test_df.iloc[:, 1].tolist()
    if not test_names:
        # Accuracy is undefined without samples; avoid a ZeroDivisionError.
        return float("nan")

    correct = 0
    for test_name, true_label in zip(test_names, test_labels):
        scored = [
            (distance(test_name, train_name), label)
            for train_name, label in zip(train_names, train_labels)
        ]
        # Sort on distance only; list.sort is stable, so equally distant
        # names keep their training-set order.
        scored.sort(key=lambda pair: pair[0])

        votes = [label for _, label in scored[:k]]
        # Majority vote over the labels of the k closest names.
        prediction = max(set(votes), key=votes.count)
        if prediction == true_label:
            correct += 1

    return correct / len(test_names)

In [8]:
# Test-set columns; they do not change between iterations and are not
# referenced by the sweep below.
test_names = test_df.iloc[:, 0]
test_labels = test_df.iloc[:, 1]

# Try every odd neighbour count from 1 to 21 and report validation accuracy
for k in range(1, 22, 2):
    val_accuracy = get_accuracy(train_df, val_df, k)
    print(f'Accuracy for k={k}: {val_accuracy}')

Accuracy for k=1: 0.7674418604651163
Accuracy for k=3: 0.8217054263565892
Accuracy for k=5: 0.8372093023255814
Accuracy for k=7: 0.8527131782945736
Accuracy for k=9: 0.8527131782945736
Accuracy for k=11: 0.8604651162790697
Accuracy for k=13: 0.8449612403100775
Accuracy for k=15: 0.8604651162790697
Accuracy for k=17: 0.8682170542635659
Accuracy for k=19: 0.8604651162790697
Accuracy for k=21: 0.8837209302325582


In [13]:
# Final evaluation: the neighbour pool is the train and validation rows
# combined, and accuracy is measured on the test split.
K = 9
train_merged_df = pd.concat(
    objs=[train_df, val_df],
    ignore_index=True,
    sort=False,
)
accuracy = get_accuracy(train_df=train_merged_df, test_df=test_df, k=K)
print(f"Accuracy on test set: {accuracy}")

Accuracy on test set: 0.8244274809160306


In [84]:
def predict(name, k=9):
    """Print a gender guess for ``name`` together with its nearest training names.

    The edit distance between ``name`` and every name in the notebook-level
    ``train_df`` is computed with ``distance``; the labels of the ``k``
    closest names are put to a majority vote. A winning label of 1 is
    reported as a boy, anything else as a girl.

    Parameters
    ----------
    name : str
        Name to classify. It is passed to ``distance`` unchanged; no case
        normalisation is performed here.
        NOTE(review): if the training names are capitalised, lower-case
        queries pay an extra edit for the first letter -- confirm whether
        that is intended.
    k : int, optional
        Number of nearest neighbours that vote (default 9). Must be at
        least 1.

    Returns
    -------
    None
        The result is printed, not returned.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``train_df`` has no rows.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Extract both columns once, by position. Indexing a row Series with
    # ``row[0]`` relies on the deprecated positional fallback of
    # Series.__getitem__ and also rebuilds a Series on every access.
    train_names = train_df.iloc[:, 0].tolist()
    train_labels = train_df.iloc[:, 1].tolist()
    if not train_names:
        raise ValueError("train_df must contain at least one row")

    scored = [
        (distance(name, train_name), train_name, label)
        for train_name, label in zip(train_names, train_labels)
    ]
    # Sort on distance only; list.sort is stable, so equally distant names
    # keep their training-set order.
    scored.sort(key=lambda entry: entry[0])

    neighbors = scored[:k]
    votes = [neighbor[2] for neighbor in neighbors]
    # Majority vote over the labels of the k closest names.
    prediction = max(set(votes), key=votes.count)

    if prediction == 1:
        print("I am sure " + name + " is a boy.")
    else:
        print("I am sure " + name + " is a girl.")

    print("Names with minimum edit distance are:", [neighbor[1] for neighbor in neighbors])

In [85]:
# Try the classifier on a couple of names of our own choosing
for sample_name in ("chandan", "chandanbala"):
    predict(sample_name)

I am sure chandan is a boy.
Names with minimum edit distance are: ['Chandani', 'Vandan', 'Chintan', 'Shantanu', 'Nanda', 'Chand', 'Ranjan']
I am sure chandanbala is a girl.
Names with minimum edit distance are: ['Chandani', 'Shantanu', 'Vandna', 'Chitranjali', 'Vandan', 'Sanjana', 'Chanchal']
