## Gender Classification with SVMs using First Names

In [None]:
# import all the required libraries
import hashlib
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Derive a reproducible, user-specific seed: hash the username with SHA-256,
# read the digest as a big-endian integer and fold it into the 32-bit range.
username = "prit.kanadiya"
_digest = hashlib.sha256(username.encode()).digest()
seed = int.from_bytes(_digest, "big") % (2**32)
print(seed)
# Seed Python's built-in RNG so later random operations are repeatable
random.seed(seed)

In [None]:
# Load the dataset and normalise the "Name" column to lowercase in one chain
df = pd.read_csv("../assets/data/gender.csv").assign(
    Name=lambda frame: frame["Name"].str.lower()
)
print(df.head())

In [None]:
# Compare the class sizes to check that the dataset is approximately balanced.
# Target == 0 marks girls and Target == 1 marks boys.
sub_df = df[df["Target"] == 0]
print(f"Number of girls in dataset: {len(sub_df)}")
sub_df = df[df["Target"] == 1]
print(f"Number of boys in dataset: {len(sub_df)}")

In [None]:
# Convert each name to a feature vector
X = []
y = []

# Feature layout (index ranges into every vector):
#   [0, 26)        counts of each letter a-z (unigrams)
#   [26, 702)      counts of each two-letter sequence (bigrams)
#   [702, 18278)   counts of each three-letter sequence (trigrams)
#   18278          1 if the last character of the name is a vowel, else 0
#   18279          constant bias input, always 1
# We only look at the last character for the vowel feature because in Indian
# names the last character being a vowel can be useful.
num_feats = 26 + 26*26 + 26*26*26 + 1 + 1  # 26 + 676 + 17576 + is_vowel + bias = 18280 features

_ORD_A = ord("a")
_VOWELS = frozenset("aeiou")


def _ngram_rank(ngram):
    """Return the base-26 rank of `ngram` among n-grams of the same length.

    Returns None when any character lies outside a-z, so the caller can skip
    the n-gram instead of writing to an unrelated (or wrapped-around) index.
    """
    rank = 0
    for ch in ngram:
        digit = ord(ch) - _ORD_A
        if not 0 <= digit < 26:
            return None
        rank = rank * 26 + digit
    return rank


def name_to_vector(name):
    """Encode a first name as a (num_feats, 1) column vector of n-gram counts.

    The name is lowercased first. N-grams containing a character outside a-z
    (spaces, digits, hyphens, accented letters, ...) are ignored. An empty
    name yields a vector with only the bias entry set.
    """
    name = name.lower()

    vec = np.zeros(num_feats)
    vec[num_feats - 1] = 1   # Bias input is always 1

    # Binary feature: does the name end in a vowel? (guarded for empty names)
    if name and name[-1] in _VOWELS:
        vec[num_feats - 2] = 1

    # Unigrams, bigrams and trigrams occupy consecutive segments of the
    # vector; the segment for n-grams of length n holds 26**n slots.
    offset = 0
    for n in (1, 2, 3):
        for start in range(len(name) - n + 1):
            rank = _ngram_rank(name[start:start + n])
            if rank is not None:
                vec[offset + rank] += 1
        offset += 26 ** n

    return vec.reshape(-1, 1)


# Iterate the two columns directly instead of doing a positional row lookup
# for every example.
for name, target in zip(df["Name"], df["Target"]):
    X.append(name_to_vector(name))
    # -1 represents girl (Target == 0), 1 represents boy
    y.append(-1 if target == 0 else 1)

# Printing sample feature vector and label
print(X[0].shape)
print(y[0])

In [None]:
# Hold out a random 10% of the examples for testing; train on the rest
test_size = 0.1

# Pair each feature vector with its label so a single shuffle keeps them aligned
data = list(zip(X, y))
random.shuffle(data)

# The leading (1 - test_size) fraction of the shuffled pairs is the training set
split_index = int(len(data) * (1 - test_size))
train_data, test_data = data[:split_index], data[split_index:]

# Unzip the pairs back into parallel feature / label sequences
X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)

# Stack into arrays; labels are reshaped into column vectors of shape (n, 1)
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

print(f'Number of training examples: {len(X_train)}')
print(f'Number of test examples: {len(X_test)}')

In [None]:
class SVM():
    def __init__(self, num_feats, mode):
        self.num_feats = num_feats
        self.mode = mode
        self.W = np.zeros((self.num_feats, 1))
        self.b = np.zeros((1, 1))    # In SVMs, we deal with bias separately
    
    def fit(self, X, y):
        if (self.mode = "hard-margin"):
            