## Gender Classification with SVMs using First Names

In [3]:
# import all the required libraries
import numpy as np
import pandas as pd

In [4]:
# Read the dataset
df = pd.read_csv("../assets/data/gender.csv")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1298 entries, 0 to 1297
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    1298 non-null   object
 1   Target  1298 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 20.4+ KB


In [6]:
df.head()

Unnamed: 0,Name,Target
0,Yash,1
1,Prit,1
2,Meet,1
3,Drashti,0
4,Saloni,0


In [7]:
# Convert each name to feature vector
X = []
y = []

# Considering the last character and bigrams in name
num_feats = 702    # 26 letters + 26*26 bigrams = 702 features

for i in range(len(df)):
  name = df.iloc[i]['Name']
  target = df.iloc[i]['Target']

  vec = np.zeros(num_feats)

  # Consider last character
  key = name[-1]
  if (ord(key) < 97 or ord(key) > 122):
    continue
  vec[ord(key)-97] +=1

  # Consider all bigrams
  for i in range(len(name)-1):
    bigram = name[i:i+2]
    # Skipping whitespace and extra characters if any
    if (ord(bigram[0]) < 97 or ord(bigram[0]) > 122 or ord(bigram[0]) < 97 or ord(bigram[1]) > 122):
      continue
    vec[26 + (ord(bigram[0]) - 97)*26 + (ord(bigram[1]) - 97)] += 1

  X.append(vec)

  if target == 0:
    # 0 represents girl
    y.append(-1)
  else:
    # 1 represents boy
    y.append(1)

# Printing sample feature vector and label
print(X[0].shape)
print(y[0])

(702,)
1


In [8]:
# Split data into train and test sets
train_size = 0.8
test_size = 0.1
val_size = 0.1

data = list(zip(X, y))

# Split for training set
split_index_train = int(len(data) * (1 - test_size - val_size))
# Split for val set
split_index_val = int(len(data)*val_size)

train_data = data[:split_index_train]
val_data = data[split_index_train:split_index_train+split_index_val]
test_data = data[split_index_train+split_index_val:]

X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)
X_val, y_val = zip(*val_data)

X_train, X_test, X_val = np.array(X_train), np.array(X_test), np.array(X_val)
y_train, y_test, y_val = np.array(y_train), np.array(y_test), np.array(y_val)

y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)

print(f'Number of training examples: {X_train.shape, y_train.shape}')
print(f'Number of test examples: {X_test.shape, y_test.shape}')
print(f'Number of val examples: {X_val.shape, y_val.shape}')

Number of training examples: ((1036, 702), (1036, 1))
Number of test examples: ((131, 702), (131, 1))
Number of val examples: ((129, 702), (129, 1))


In [58]:
# Defining weights and bias for SVM. Unlike perceptron, we can not absorb the bias term in W.
W = np.zeros((num_feats, 1))
b = 0

In [59]:
# If C is very small, it will allow some points to fall inside the margin or classify them incorrectly in order to increase the margin. One the other hand if C is very large, it will try to classify each and every point correctly, but will produce a thin margin.
def train(X, y, W, b, learning_rate=0.1, C=0.5, num_iterations=1000):
    m = X.shape[0]

    for iteration in range(num_iterations):
        z = y * (np.dot(X, W) + b)

        # margin_mask is set to true for all z < 1
        margin_mask = z < 1

        # Calculate gradients
        dW = (1/m)*((2*W) + C*(1-np.dot(X.T, y*margin_mask)))
        db = (1/m)*np.sum((C*margin_mask*(-y)))

        if (iteration%100 == 0):
            hinge_loss =  C*np.sum(1-np.dot(X.T, y*margin_mask))
            regularization_loss = np.sum(W * W)
            total_loss = hinge_loss + regularization_loss
            print(f'Iteration: {round(iteration, 4)}, Total Loss: {round(total_loss, 4)}, Hinge Loss: {round(hinge_loss, 4)}, Regularization Loss: {round(regularization_loss, 4)}')


        W = W - learning_rate*dW
        b = b - learning_rate*db

    return W, b


In [60]:
W, b = train(X_train, y_train, W, b)

Iteration: 0, Total Loss: 363.5, Hinge Loss: 363.5, Regularization Loss: 0.0
Iteration: 100, Total Loss: 109.112, Hinge Loss: 107.0, Regularization Loss: 2.112
Iteration: 200, Total Loss: 230.1356, Hinge Loss: 225.0, Regularization Loss: 5.1356
Iteration: 300, Total Loss: 285.4427, Hinge Loss: 278.5, Regularization Loss: 6.9427
Iteration: 400, Total Loss: 339.6516, Hinge Loss: 332.0, Regularization Loss: 7.6516
Iteration: 500, Total Loss: 332.038, Hinge Loss: 324.0, Regularization Loss: 8.038
Iteration: 600, Total Loss: 332.8731, Hinge Loss: 324.5, Regularization Loss: 8.3731
Iteration: 700, Total Loss: 338.2068, Hinge Loss: 329.5, Regularization Loss: 8.7068
Iteration: 800, Total Loss: 337.0718, Hinge Loss: 328.0, Regularization Loss: 9.0718
Iteration: 900, Total Loss: 263.4637, Hinge Loss: 254.0, Regularization Loss: 9.4637


In [61]:
def get_accuracy(X, y, W, b):
    m = X.shape[0]
    print(f"Total number of test samples: {m}")

    z = y * (np.dot(X, W) + b)  # Compute all z values at once

    # mask is set to true for all misses i.e. z < 0
    mask = z < 0
    miss = np.sum(mask)

    print(f"Number of misclassified samples: {miss}")

    # Calculate and return accuracy
    accuracy = (m - miss) / m
    return accuracy

val_accuracy = get_accuracy(X_val, y_val, W, b)
print(f"Accuracy on Validation set: {val_accuracy}")


Total number of test samples: 129
Number of misclassified samples: 14
Accuracy on test set: 0.8914728682170543


In [62]:
# Reset parameters for training again on combined dataset
W = np.zeros((num_feats, 1))
b = 0

In [63]:
# Training on both train and validation dataset
X_train_val = np.concatenate((X_train, X_val))
y_train_val = np.concatenate((y_train, y_val))
W, b = train(X_train_val, y_train_val, W, b)

Iteration: 0, Total Loss: 375.5, Hinge Loss: 375.5, Regularization Loss: 0.0
Iteration: 100, Total Loss: 58.114, Hinge Loss: 56.0, Regularization Loss: 2.114
Iteration: 200, Total Loss: 243.1362, Hinge Loss: 238.0, Regularization Loss: 5.1362
Iteration: 300, Total Loss: 289.4351, Hinge Loss: 282.5, Regularization Loss: 6.9351
Iteration: 400, Total Loss: 320.1193, Hinge Loss: 312.5, Regularization Loss: 7.6193
Iteration: 500, Total Loss: 307.9687, Hinge Loss: 300.0, Regularization Loss: 7.9687
Iteration: 600, Total Loss: 329.7222, Hinge Loss: 321.5, Regularization Loss: 8.2222
Iteration: 700, Total Loss: 348.4962, Hinge Loss: 340.0, Regularization Loss: 8.4962
Iteration: 800, Total Loss: 261.7987, Hinge Loss: 253.0, Regularization Loss: 8.7987
Iteration: 900, Total Loss: 292.1235, Hinge Loss: 283.0, Regularization Loss: 9.1235


In [64]:
accuracy = get_accuracy(X_test, y_test, W, b)
print(f"Accuracy on test set: {accuracy}")

Total number of test samples: 131
Number of misclassified samples: 19
Accuracy on test set: 0.8549618320610687


In [65]:
def predict(name):
    # Convert name to feature vector
    name = name.lower()

    vec = np.zeros(num_feats)

    key = name[-1]
    vec[ord(key)-97] +=1

    # Consider all bigrams
    for i in range(len(name)-1):
        bigram = name[i:i+2]
        if (ord(bigram[0]) < 97 or ord(bigram[0]) > 122 or ord(bigram[0]) < 97 or ord(bigram[1]) > 122):
            continue
        vec[26 + (ord(bigram[0]) - 97)*26 + (ord(bigram[1]) - 97)] += 1

    z = (np.dot(vec, W) + b) 

    if z > 0:
        print("I am sure " + name + " is a boy.")
    elif z < 0:
        print("I am sure " + name + " is a girl.")

In [68]:
# Testing with our own example
predict("Chandan")
predict("Chandanbala")

I am sure chandan is a boy.
I am sure chandanbala is a girl.
