## Gender Classification with SVMs using First Names

In [28]:
# import all the required libraries
import numpy as np
import pandas as pd

In [29]:
# Load the name/gender table from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="../assets/data/gender.csv")

In [30]:
# Summarise the frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1298 entries, 0 to 1297
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    1298 non-null   object
 1   Target  1298 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 20.4+ KB


In [31]:
# Preview the first rows (head() defaults to five)
df.head()

Unnamed: 0,Name,Target
0,Yash,1
1,Prit,1
2,Meet,1
3,Drashti,0
4,Saloni,0


In [32]:
# Convert each name to a feature vector
X = []
y = []

# Feature layout (index ranges inside every vector):
#   [0, 26)        one-hot of the last character of the name
#   [26, 702)      bigram counts      (26*26 slots)
#   [702, 18278)   trigram counts     (26*26*26 slots)
#   18278          constant bias feature, always 1
ALPHABET_SIZE = 26
BIGRAM_OFFSET = ALPHABET_SIZE
TRIGRAM_OFFSET = BIGRAM_OFFSET + ALPHABET_SIZE**2
num_feats = TRIGRAM_OFFSET + ALPHABET_SIZE**3 + 1  # 26 + 676 + 17576 + 1 = 18279


def name_to_features(name):
    """Encode a first name as a (num_feats, 1) column vector.

    The name is stripped and lower-cased. Only the ASCII letters a-z
    contribute features; any n-gram containing another character is skipped,
    so it can never produce a negative or out-of-range index.

    Parameters
    ----------
    name : str
        The first name to encode. An empty string yields a vector with only
        the bias feature set.

    Returns
    -------
    numpy.ndarray
        Float array of shape (num_feats, 1).
    """
    vec = np.zeros(num_feats)
    vec[num_feats - 1] = 1  # bias feature

    # Map each character to 0..25, or None for anything outside a-z
    codes = [
        ord(ch) - ord('a') if 'a' <= ch <= 'z' else None
        for ch in name.strip().lower()
    ]

    # Last character of the name
    if codes and codes[-1] is not None:
        vec[codes[-1]] += 1

    # All bigrams
    for first, second in zip(codes, codes[1:]):
        if first is None or second is None:
            continue
        vec[BIGRAM_OFFSET + first * ALPHABET_SIZE + second] += 1

    # All trigrams; the offset skips both the unigram and the bigram ranges
    for first, second, third in zip(codes, codes[1:], codes[2:]):
        if first is None or second is None or third is None:
            continue
        vec[TRIGRAM_OFFSET
            + first * ALPHABET_SIZE**2
            + second * ALPHABET_SIZE
            + third] += 1

    return vec.reshape(-1, 1)


# Iterate the columns directly instead of calling df.iloc[i] per row
for name, target in zip(df['Name'], df['Target']):
    X.append(name_to_features(name))
    # -1 represents girl (Target == 0), 1 represents boy
    y.append(-1 if target == 0 else 1)

# Printing sample feature vector and label
print(X[0].shape)
print(y[0])


(18279, 1)
1


In [33]:
# Split data into train, validation and test sets.
# The split is positional (file order); the data is not shuffled here.
train_size = 0.8
test_size = 0.1
val_size = 0.1

# The three fractions must describe the whole dataset
assert abs(train_size + val_size + test_size - 1.0) < 1e-9, \
    "train_size + val_size + test_size must equal 1"

data = list(zip(X, y))

# Layout: [ train | val | test ]; the test set takes whatever remains
split_index_train = int(len(data) * (1 - test_size - val_size))
split_index_val = split_index_train + int(len(data) * val_size)

train_data = data[:split_index_train]
val_data = data[split_index_train:split_index_val]
test_data = data[split_index_val:]

X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)
X_val, y_val = zip(*val_data)

X_train, X_test, X_val = np.array(X_train), np.array(X_test), np.array(X_val)
y_train, y_test, y_val = np.array(y_train), np.array(y_test), np.array(y_val)

# Labels as column vectors, one row per example.
# Each split reshapes its own labels so X_val and y_val stay aligned.
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)

print(f'Number of training examples: {len(X_train)}')
print(f'Number of test examples: {len(X_test)}')
print(f'Number of val examples: {len(X_val)}')

Number of training examples: 1038
Number of test examples: 131
Number of val examples: 129


In [34]:
# Initialise the SVM parameters: a zero weight column vector and a scalar bias.
# Unlike the perceptron, the bias term is kept separate from W here.
W, b = np.zeros((num_feats, 1)), 0

In [53]:
def train(X, y, W, b, learning_rate=0.01, num_iterations=1000, tr_lamda=0.01):
    """Train a linear soft-margin SVM with stochastic sub-gradient descent.

    For every sample the per-example objective

        tr_lamda * ||W||^2 + max(0, 1 - y_i * (W.T x_i + b))

    is decreased by one sub-gradient step:

    * margin satisfied (y_i * (W.T x_i + b) >= 1): only the regulariser
      contributes, so W shrinks by ``learning_rate * 2 * tr_lamda * W``;
    * margin violated: the hinge term also contributes, giving
      ``W -= learning_rate * (2 * tr_lamda * W - y_i * x_i)`` and
      ``b += learning_rate * y_i``.

    Parameters
    ----------
    X : numpy.ndarray
        Samples, shape (m, n, 1) or (m, n).
    y : array-like
        Labels in {-1, +1}, shape (m, 1) or (m,).
    W : numpy.ndarray
        Initial weights with n elements (e.g. shape (n, 1)). Not modified.
    b : float
        Initial bias.
    learning_rate : float
        Step size of each update.
    num_iterations : int
        Number of full passes (epochs) over the samples. Cost grows linearly
        with ``num_iterations * m * n``.
    tr_lamda : float
        L2 regularisation strength.

    Returns
    -------
    (numpy.ndarray, float)
        Trained weights of shape (n, 1) and the trained bias.
    """
    X = np.asarray(X, dtype=float)
    m = X.shape[0]

    # Work on flat views: one row per sample, one 1-D weight vector.
    features = X.reshape(m, -1)
    labels = np.asarray(y, dtype=float).reshape(m)
    weights = np.array(W, dtype=float).reshape(-1)  # np.array copies: caller's W is untouched
    bias = float(b)

    # W - lr * 2 * lamda * W  ==  (1 - 2 * lr * lamda) * W ; constant for the whole run
    shrink = 1.0 - 2.0 * learning_rate * tr_lamda

    for _ in range(num_iterations):
        for i in range(m):
            x_i = features[i]
            y_i = labels[i]
            # Evaluate the margin with the weights from before this step
            margin = y_i * (np.dot(weights, x_i) + bias)

            # The regularisation gradient applies in both cases
            weights *= shrink
            if margin < 1:
                # The hinge loss is active: move towards classifying x_i correctly
                weights += learning_rate * y_i * x_i
                bias += learning_rate * y_i

    return weights.reshape(-1, 1), float(bias)

In [54]:
# Run train() on the training split with its default hyperparameters.
# W and b are rebound to the returned values, so re-running this cell
# starts from the previous result rather than from the initial parameters.
W, b = train(X_train, y_train, W, b)