## Gender Classification with Logistic Regression using First Names

In [3]:
# import all the required libraries
import random
import numpy as np

In [4]:
# Read the dataset
def _read_names(path):
    """Return the lower-cased names stored one per line in the text file at `path`."""
    # The context manager closes the handle once the file has been read;
    # the previous version left both files open for the life of the kernel.
    with open(path, "r") as names_file:
        # Strip only the line terminator. Slicing off the last character
        # unconditionally truncates the final name whenever the file does
        # not end with a newline.
        return [line.rstrip("\n").lower() for line in names_file]


girls = _read_names("../assets/data/girls.txt")
boys = _read_names("../assets/data/boys.txt")

print(len(boys), len(girls))
print(girls)
print(boys)

222 280
['aaradhya', 'adah', 'adhira', 'alisha', 'amoli', 'anaisha', 'ananya', 'anika', 'anushka', 'asmee', 'avni', 'carina', 'chara', 'drishti', 'ela', 'eshika', 'geetika', 'gulika', 'hiya', 'hiral', 'ira', 'ishana', 'ishita', 'jeevika', 'kaia', 'kashvi', 'keya', 'kimaya', 'krisha', 'larisa', 'laasya', 'mahika', 'mayra', 'mehar', 'mirai', 'mishka', 'naitee', 'navya', 'nyra', 'nehrika', 'neysa', 'pavati', 'prisha', 'ryka', 'rebecca', 'saanvi', 'sahana', 'sai', 'saisha', 'saira', 'saloni', 'shanaya', 'shrishti', 'sneha', 'turvi', 'taahira', 'taara', 'tanvi', 'viti', 'zara', 'aagya', 'aaina', 'aas', 'akaljeet', 'amanroop', 'anika', 'birva', 'bisanpreet', 'charanpreet', 'dilreet', 'ekkam', 'faal', 'gurleen', 'gurmeet', 'heer', 'harleen', 'harveen', 'ikamroop', 'isha', 'ishmeet', 'katiya', 'mehr', 'nihaara', 'paakhi', 'parminder', 'simrat', 'sukhdeep', 'sukhleen', 'shirina', 'tavleen', 'ami', 'askini', 'anvi', 'bandhini', 'bansari', 'charmi', 'chavi', 'charul', 'drisna', 'dhara', 'dhruvi',

In [6]:
# Convert each name to feature vector
X = []
y = []

# Considering all single-letter counts and all bigram counts
num_feats = 702    # 26 letters + 26*26 bigrams = 702 features


def _is_lower_ascii(ch):
    """Return True when the single character `ch` is one of 'a'..'z'."""
    return "a" <= ch <= "z"


def _name_to_vector(name):
    """Encode `name` as a (num_feats, 1) column of letter and bigram counts.

    Indices 0..25 count the letters 'a'..'z'. Index 26 + 26*i + j counts the
    bigram made of letter i followed by letter j. Whitespace and any other
    character outside 'a'..'z' is skipped, as is every bigram containing one.
    """
    vec = np.zeros(num_feats)

    # Consider all letters
    for letter in name:
        if _is_lower_ascii(letter):
            vec[ord(letter) - 97] += 1

    # Consider all bigrams
    for first, second in zip(name, name[1:]):
        # BOTH characters must be letters. The earlier check tested the first
        # character twice, so a bigram such as "a " or "a-" slipped through
        # and incremented an unrelated (wrapped-around) feature index.
        if _is_lower_ascii(first) and _is_lower_ascii(second):
            vec[26 + (ord(first) - 97) * 26 + (ord(second) - 97)] += 1

    return vec.reshape(-1, 1)


# Label convention: 0 = girl, 1 = boy
for girl in girls:
    X.append(_name_to_vector(girl))
    y.append(0)

for boy in boys:
    X.append(_name_to_vector(boy))
    y.append(1)

# Printing sample feature vector and label
print(X[0].shape)
print(y[0])

(702, 1)
0


In [7]:
# Split training, validation and testing data

test_size = 0.1
val_size = 0.1
split_seed = 42    # fixed so Restart & Run All reproduces the same split

data = list(zip(X, y))
# Shuffle with a dedicated, seeded generator: the split (and therefore every
# reported loss/accuracy) is repeatable, and the global `random` state is
# left untouched.
random.Random(split_seed).shuffle(data)

# First 80% -> train, next 10% -> validation, final 10% -> test
split_index1 = int(len(data) * (1 - test_size - val_size))
split_index2 = int(len(data) * (1 - test_size))
train_data = data[:split_index1]
val_data = data[split_index1:split_index2]
test_data = data[split_index2:]


# Unzip the (features, label) pairs back into parallel sequences
X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)
X_val, y_val = zip(*val_data)

X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(y_train), np.array(y_test)
X_val, y_val = np.array(X_val), np.array(y_val)

# Labels become column vectors so they line up with the model's (m, 1) output
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)

print(f'Number of training examples: {len(X_train)}')
print(f'Number of validation examples: {len(X_val)}')
print(f'Number of test examples: {len(X_test)}')

Number of training examples: 401
Number of validation examples: 50
Number of test examples: 51


In [8]:
def sigmoid(z):
    """Element-wise logistic function 1 / (1 + exp(-z)), computed stably.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy scalar or ndarray
        Values in [0, 1] with the same shape as `z`.
    """
    z = np.asarray(z)
    # exp(-|z|) is always in (0, 1], so it can never overflow, whereas
    # exp(-z) overflows (RuntimeWarning) once z drops below about -709.
    e = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # np.where returns a 0-d array for scalar input; unwrap it so scalar
    # callers still get a plain numpy scalar back.
    return result[()] if result.ndim == 0 else result

In [9]:
def forward(X, W, b):
    """Return hard 0/1 predictions for the samples in `X`.

    The linear score `X @ W + b` is passed through `sigmoid`; a sample is
    labelled 1 when the resulting probability is strictly above 0.5,
    otherwise 0.
    """
    probabilities = sigmoid(np.dot(X, W) + b)
    above_threshold = probabilities > 0.5
    return above_threshold.astype(int)

In [10]:
def loss(y, y_hat):
    """Summed (not averaged) binary cross-entropy between `y` and `y_hat`.

    A small epsilon is added inside each logarithm so that predictions of
    exactly 0 or 1 do not produce log(0).
    """
    epsilon = 1e-10
    positive_term = y * np.log(y_hat + epsilon)
    negative_term = (1 - y) * np.log(1 - y_hat + epsilon)
    return -np.sum(positive_term + negative_term)

In [11]:
# Model parameters start at zero: one weight per feature (column vector) and a scalar bias
W, b = np.zeros((num_feats, 1)), 0

In [12]:
def train(X, y, W, b, learning_rate=0.5, num_iterations=100000):
    """Fit logistic-regression parameters with full-batch gradient descent.

    Parameters
    ----------
    X : array-like
        Samples along the first axis; any trailing axes are flattened into
        one feature axis. A 1-D input is treated as a single sample.
    y : array-like
        One 0/1 label per sample, as a flat sequence or an (m, 1) column.
    W : ndarray of shape (n_features, 1)
        Starting weights.
    b : float
        Starting bias.
    learning_rate : float
        Gradient-descent step size.
    num_iterations : int
        Number of full-batch updates to run.

    Returns
    -------
    (W, b)
        The updated weights and bias.

    The summed cross-entropy loss is printed every 10000 iterations.
    """
    X = np.asarray(X)
    # Flatten explicitly instead of np.squeeze: squeeze also removes the
    # sample axis when there is exactly one sample, which made `m` equal to
    # the feature count.
    X = X.reshape(1, -1) if X.ndim == 1 else X.reshape(X.shape[0], -1)
    # Force labels into a column so `a - y` cannot broadcast to (m, m).
    y = np.asarray(y).reshape(-1, 1)
    m = X.shape[0]

    for i in range(num_iterations):
        z = np.dot(X, W) + b
        a = sigmoid(z)
        if (i % 10000 == 0):
            print("Loss at iteration", i, "is", loss(y, a))
        # Gradient of the cross-entropy loss with respect to z
        dz = a - y
        dW = (1 / m) * np.dot(dz.T, X)    # shape (1, n_features)
        db = (1 / m) * np.sum(dz)
        W = W - (learning_rate*(dW.T))
        b = b - (learning_rate * db)

    return W, b

In [13]:
# Fit on the training split, starting from the current W and b and using train()'s default settings
W, b = train(X=X_train, y=y_train, W=W, b=b)

Loss at iteration 0 is 277.95201932433804
Loss at iteration 10000 is 25.88342276697327
Loss at iteration 20000 is 15.596852405467057
Loss at iteration 30000 is 11.289980859870088
Loss at iteration 40000 is 8.874396297559354
Loss at iteration 50000 is 7.3184707459093
Loss at iteration 60000 is 6.229763246596679
Loss at iteration 70000 is 5.424320476077932
Loss at iteration 80000 is 4.803932184057068
Loss at iteration 90000 is 4.311225749066947


In [17]:
# Tuning hyperparameters using validation set
def validate(X, y, W, b):
    """Return the fraction of validation samples that `forward` labels correctly.

    Parameters
    ----------
    X : array-like
        Samples along the first axis; trailing axes are flattened into one
        feature axis. A 1-D input is treated as a single sample.
    y : array-like
        One 0/1 label per sample, as a flat sequence or an (m, 1) column.
    W, b
        Model weights and bias, passed through to `forward`.

    The sample count and the number of correct predictions are printed.
    """
    X = np.asarray(X)
    # np.squeeze would also drop the sample axis for a single sample, which
    # made `m` equal to the feature count; flatten explicitly instead.
    X = X.reshape(1, -1) if X.ndim == 1 else X.reshape(X.shape[0], -1)
    # Column labels line up element-wise with forward()'s (m, 1) output.
    y = np.asarray(y).reshape(-1, 1)
    m = X.shape[0]
    # This function reports on the validation split, so label it as such.
    print(f"Total number of validation samples: {m}")

    Z = forward(X, W, b)
    equal_elements = np.sum(Z == y)
    print(f"Number of correct predictions: {equal_elements}")
    return equal_elements/m

# Score the current parameters on the validation split (not the test split)
val_accuracy = validate(X_val, y_val, W, b)
print(f"Accuracy on validation set: {val_accuracy}")
# Print the learned parameters so their magnitudes can be inspected
print(W)
print(b)

# We observed that a high learning rate (e.g., 10) can cause the loss to decrease rapidly. However, these rapidly learned weights tend to be large and may not be optimal for the problem. While a lower learning rate (e.g., 0.5) might lead to a slower decrease in loss during training, it allows the model to learn smaller, more feasible weights.

In [18]:
# Continue training from the current W and b, this time on the validation split only
W, b = train(X_val, y_val, W, b, learning_rate=0.5, num_iterations=1000)

Loss at iteration 0 is 90.56930572485507


In [19]:
def get_accuracy(X, y, W, b):
    """Return the fraction of samples in `X` that `forward` labels correctly.

    Parameters
    ----------
    X : array-like
        Samples along the first axis; trailing axes are flattened into one
        feature axis. A 1-D input is treated as a single sample.
    y : array-like
        One 0/1 label per sample, as a flat sequence or an (m, 1) column.
    W, b
        Model weights and bias, passed through to `forward`.

    The sample count and the number of correct predictions are printed.
    """
    X = np.asarray(X)
    # np.squeeze would also drop the sample axis for a single sample, which
    # made `m` equal to the feature count; flatten explicitly instead.
    X = X.reshape(1, -1) if X.ndim == 1 else X.reshape(X.shape[0], -1)
    # Column labels line up element-wise with forward()'s (m, 1) output.
    y = np.asarray(y).reshape(-1, 1)
    m = X.shape[0]
    print(f"Total number of test samples: {m}")

    Z = forward(X, W, b)
    equal_elements = np.sum(Z == y)
    print(f"Number of correct predictions: {equal_elements}")
    return equal_elements/m

# Evaluate the current parameters on the held-out test split
accuracy = get_accuracy(X=X_test, y=y_test, W=W, b=b)
print(f"Accuracy on test set: {accuracy}")
        

Total number of test samples: 51
Number of correct predictions: 43
Accuracy on test set: 0.8431372549019608


In [20]:
def predict(name):
    """Print the model's gender guess for a single first name.

    The name is lower-cased and encoded as `num_feats` counts: indices 0..25
    count the letters 'a'..'z', and index 26 + 26*i + j counts the bigram
    made of letter i followed by letter j. Characters outside 'a'..'z', and
    any bigram containing one, are ignored. The vector is classified with
    `forward` using the module-level `W` and `b`; label 1 is reported as a
    boy and label 0 as a girl.
    """
    # Convert name to feature vector
    name = name.lower()

    vec = np.zeros(num_feats)

    def is_lower_ascii(ch):
        return "a" <= ch <= "z"

    # Consider all letters
    for letter in name:
        # Skipping whitespace and extra characters
        if is_lower_ascii(letter):
            vec[ord(letter) - 97] += 1

    # Consider all bigrams
    for first, second in zip(name, name[1:]):
        # BOTH characters must be letters. The earlier check tested the first
        # character twice, so a bigram such as "a " or "a-" slipped through
        # and incremented an unrelated (wrapped-around) feature index.
        if is_lower_ascii(first) and is_lower_ascii(second):
            vec[26 + (ord(first) - 97) * 26 + (ord(second) - 97)] += 1

    # forward() returns a one-element array for a single sample; unwrap it
    # to a plain int before branching.
    label = int(np.ravel(forward(vec, W, b))[0])

    if label == 1:
        print("I am sure " + name + " is a boy.")
    elif label == 0:
        print("I am sure " + name + " is a girl.")

In [22]:
# Testing with our own example
for sample_name in ("chandanbala", "dhruvi"):
    predict(sample_name)

I am sure chandanbala is a boy.
I am sure dhruvi is a girl.
