## Gender Classification with Logistic Regression using First Names

In [1]:
# import all the required libraries
import numpy as np
import pandas as pd
import random

In [2]:
# Read the dataset
df = pd.read_csv("../assets/data/gender.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296 entries, 0 to 1295
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    1296 non-null   object
 1   Target  1296 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 20.4+ KB


In [4]:
df.head()

Unnamed: 0,Name,Target
0,Yash,1
1,Prit,1
2,Meet,1
3,Drashti,0
4,Saloni,0


In [5]:
# Convert each name to feature vector
X = []
y = []

# Considering the last character and bigrams in name
num_feats = 702    # 26 letters + 26*26 bigrams = 702 features

for i in range(len(df)):
  name = df.iloc[i]['Name']
  target = df.iloc[i]['Target']

  vec = np.zeros(num_feats)

  # Consider last character
  key = name[-1]
  if (ord(key) < 97 or ord(key) > 122):
    continue
  vec[ord(key)-97] +=1

  # Consider all bigrams
  for i in range(len(name)-1):
    bigram = name[i:i+2]
    # Skipping whitespace and extra characters if any
    if (ord(bigram[0]) < 97 or ord(bigram[0]) > 122 or ord(bigram[0]) < 97 or ord(bigram[1]) > 122):
      continue
    vec[26 + (ord(bigram[0]) - 97)*26 + (ord(bigram[1]) - 97)] += 1

  vec = vec.reshape(-1, 1)
  X.append(vec)

  if target == 0:
    # 0 represents girl
    y.append(0)
  else:
    # 1 represents boy
    y.append(1)

# Printing sample feature vector and label
print(X[0].shape)
print(y[0])

(702, 1)
1


In [6]:
# Split data into train, val and test sets
test_size = 0.1
val_size = 0.1

data = list(zip(X, y))
random.shuffle(data)

split_index1 = int(len(data) * (1 - test_size - val_size))
split_index2 = int(len(data) * (1 - test_size))
train_data = data[:split_index1]
val_data = data[split_index1:split_index2]
test_data = data[split_index2:]

X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)
X_val, y_val = zip(*val_data)

X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(y_train), np.array(y_test)
X_val, y_val = np.array(X_val), np.array(y_val)

y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)

print(f'Number of training examples: {len(X_train)}')
print(f'Number of validation examples: {len(X_val)}')
print(f'Number of test examples: {len(X_test)}')

Number of training examples: 1035
Number of validation examples: 129
Number of test examples: 130


In [7]:
# Computes the sigmoid function
def sigmoid(z):
    return (1.0 / (1 + np.exp(-z)))

In [8]:
# Computes a forward pass
def forward(X, W, b):
    z = sigmoid(np.dot(X, W) + b)
    return (z > 0.5).astype(int)

In [9]:
# Computes the binary cross-entropy loss
def loss(y, y_hat):
    epsilon = 1e-10
    return -np.sum(y * np.log(y_hat + epsilon) + (1 - y) * np.log(1 - y_hat + epsilon))

In [10]:
# Trains the logistic regression model using gradient descent
def train(X, y, W, b, learning_rate=0.5, num_iterations=100000):
    X = np.squeeze(X)
    m = X.shape[0]

    for i in range(num_iterations):
        z = np.dot(X, W) + b
        a = sigmoid(z)
        if (i % 1000 == 0):
            print("Loss at iteration", i, "is", loss(y, a))
        dz = a - y
        dW = (1 / m) * np.dot(dz.T, X)
        db = (1 / m) * np.sum(dz)
        W = W - (learning_rate*(dW.T))
        b = b - (learning_rate * db)
    
    return W, b


In [11]:
# Computes the accuracy of the model
def get_accuracy(X, y, W, b):
    X = np.squeeze(X)
    m = X.shape[0]
    print(f"Total number of samples: {m}")

    Z = forward(X, W, b)
    equal_elements = np.sum(Z == y)
    print(f"Number of correct predictions: {equal_elements}")
    return equal_elements/m

In [12]:
# Computes the confusion matrix for the model
def confusion_matrix(X, y, W, b):
    X = np.squeeze(X)
    m = X.shape[0]

    Z = forward(X, W, b)

    cf = [[0, 0], [0, 0]]
    
    for i in range(len(Z)):
        if Z[i] == 0 and y[i] == 0:
            cf[0][0] += 1
        elif Z[i] == 0 and y[i] == 1:
            cf[0][1] += 1
        elif Z[i] == 1 and y[i] == 0:
            cf[1][0] += 1
        elif Z[i] == 1 and y[i] == 1:
            cf[1][1] += 1

    return cf

In [13]:
# Defining weights and bias
W = np.zeros((num_feats, 1))
b = 0

In [14]:
# Tuning hyperparameters using validation set
W, b = train(X_train, y_train, W, b, 0.5, 10000)

val_accuracy = get_accuracy(X_val, y_val, W, b)
print(f"Accuracy on validation set: {val_accuracy}")

# Reset parameters for changing hyperparameters and tuning again
W = np.zeros((num_feats, 1))
b = 0

Loss at iteration 0 is 717.4073316725433
Loss at iteration 1000 is 258.5933679186586
Loss at iteration 2000 is 229.116389005167
Loss at iteration 3000 is 213.13796425070836
Loss at iteration 4000 is 202.4472696832884
Loss at iteration 5000 is 194.5724858752591
Loss at iteration 6000 is 188.43493831925002
Loss at iteration 7000 is 183.46779353926917
Loss at iteration 8000 is 179.33688881309138
Loss at iteration 9000 is 175.82893593060663
Total number of samples: 129
Number of correct predictions: 118
Accuracy on validation set: 0.9147286821705426


In [15]:
# Training on both train and validation dataset
X_train_val = np.concatenate((X_train, X_val))
y_train_val = np.concatenate((y_train, y_val))
W, b = train(X_train_val, y_train_val, W, b, 0.5, 10000)

Loss at iteration 0 is 806.8233179389763
Loss at iteration 1000 is 288.0327909929472
Loss at iteration 2000 is 255.2542295556753
Loss at iteration 3000 is 237.6714069557526
Loss at iteration 4000 is 226.02842361307182
Loss at iteration 5000 is 217.50340801530194
Loss at iteration 6000 is 210.877239163816
Loss at iteration 7000 is 205.5201105230677
Loss at iteration 8000 is 201.06633458759146
Loss at iteration 9000 is 197.28504311434168


In [16]:
accuracy = get_accuracy(X_test, y_test, W, b)
print(f"Accuracy on test set: {accuracy}")

Total number of samples: 130
Number of correct predictions: 111
Accuracy on test set: 0.8538461538461538


In [17]:
cf = confusion_matrix(X_test, y_test, W, b)
print(cf)

[[56, 7], [12, 55]]


In [18]:
def predict(name):
    # Convert name to feature vector
    name = name.lower()

    vec = np.zeros(num_feats)

    # Consider all letters
    for letter in name:
    # Skipping whitespace and extra characters
        if (ord(letter) < 97 or ord(letter) > 122):
            continue
        vec[ord(letter) - 97] += 1

    # Consider all bigrams
    for i in range(len(name)-1):
        bigram = name[i:i+2]
        if (ord(bigram[0]) < 97 or ord(bigram[0]) > 122 or ord(bigram[0]) < 97 or ord(bigram[1]) > 122):
            continue
        vec[26 + (ord(bigram[0]) - 97)*26 + (ord(bigram[1]) - 97)] += 1

    # vec = vec.reshape(-1, 1)

    z = forward(vec, W, b)

    if z == 1:
        print("I am sure " + name + " is a boy.")
    elif z == 0:
        print("I am sure " + name + " is a girl.")

In [19]:
# Testing with our own example
predict("Preet")
predict("Preeti")

I am sure preet is a boy.
I am sure preeti is a girl.
