## Gender Classification with Perceptron using First Names

In [155]:
# import all the required libraries
import random
import numpy as np

In [156]:
# Read the dataset
f = open("data/girls.txt", "r")
girls = []
for girl in f:
    girls.append(girl[:-1].lower())

f = open("data/boys.txt", "r")
boys = []
for boy in f:
    boys.append(boy[:-1].lower())

print(len(boys), len(girls))
print(girls)
print(boys)

222 280
['aaradhya', 'adah', 'adhira', 'alisha', 'amoli', 'anaisha', 'ananya', 'anika', 'anushka', 'asmee', 'avni', 'carina', 'chara', 'drishti', 'ela', 'eshika', 'geetika', 'gulika', 'hiya', 'hiral', 'ira', 'ishana', 'ishita', 'jeevika', 'kaia', 'kashvi', 'keya', 'kimaya', 'krisha', 'larisa', 'laasya', 'mahika', 'mayra', 'mehar', 'mirai', 'mishka', 'naitee', 'navya', 'nyra', 'nehrika', 'neysa', 'pavati', 'prisha', 'ryka', 'rebecca', 'saanvi', 'sahana', 'sai', 'saisha', 'saira', 'saloni', 'shanaya', 'shrishti', 'sneha', 'turvi', 'taahira', 'taara', 'tanvi', 'viti', 'zara', 'aagya', 'aaina', 'aas', 'akaljeet', 'amanroop', 'anika', 'birva', 'bisanpreet', 'charanpreet', 'dilreet', 'ekkam', 'faal', 'gurleen', 'gurmeet', 'heer', 'harleen', 'harveen', 'ikamroop', 'isha', 'ishmeet', 'katiya', 'mehr', 'nihaara', 'paakhi', 'parminder', 'simrat', 'sukhdeep', 'sukhleen', 'shirina', 'tavleen', 'ami', 'askini', 'anvi', 'bandhini', 'bansari', 'charmi', 'chavi', 'charul', 'drisna', 'dhara', 'dhruvi',

In [157]:
# Convert each name to feature vector
X = []
y = []

# Considering all single letters, bigrams and bias term
num_feats = 703    # 26 letters + 26^2 bigrams + 1 bias term = 703 features

for girl in girls:
  vec = np.zeros(num_feats)
  vec[num_feats-1] = 1   # Initialize bias term as 1

  # Consider all single letters
  for letter in girl:
    # Skipping whitespace and extra characters
    if (ord(letter) < 97 or ord(letter) > 122):
      continue
    vec[ord(letter) - 97] += 1

  # Consider all bigrams
  for i in range(len(girl)-1):
    bigram = girl[i:i+2]
    if (ord(bigram[0]) < 97 or ord(bigram[0]) > 122 or ord(bigram[0]) < 97 or ord(bigram[1]) > 122):
      continue
    vec[26 + (ord(bigram[0]) - 97)*26 + (ord(bigram[1]) - 97)] += 1

  vec = vec.reshape(-1, 1)
  X.append(vec)
  y.append(-1)

for boy in boys:
  vec = np.zeros(num_feats)
  vec[num_feats-1] = 1
  for letter in boy:
    if (ord(letter) < 97 or ord(letter) > 122):
      continue
    vec[ord(letter) - 97] += 1
  
  for i in range(len(boy)-1):
    bigram = boy[i:i+2]
    if (ord(bigram[0]) < 97 or ord(bigram[0]) > 122 or ord(bigram[0]) < 97 or ord(bigram[1]) > 122):
      continue
    vec[26 + (ord(bigram[0]) - 97)*26 + (ord(bigram[1]) - 97)] += 1

  vec = vec.reshape(-1, 1)
  X.append(vec)
  y.append(1)

# Printing sample feature vector and label
print(X[0].shape)
print(y[0])

(703, 1)
-1


In [158]:
# Split training and test data

test_size = 0.1
random.seed(0)

data = list(zip(X, y))
random.shuffle(data)

split_index = int(len(data) * (1 - test_size))
train_data = data[:split_index]
test_data = data[split_index:]

X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)

X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(y_train), np.array(y_test)

y_train = y_train.reshape(-1, 1)
y_test = X_test.reshape(-1, 1)

print(f'Number of training examples: {len(X_train)}')
print(f'Number of test examples: {len(X_test)}')

Number of training examples: 451
Number of test examples: 51


In [164]:
# Defining weights for perceptron
W = np.zeros((703, 1))

In [165]:
def fit(X, y, W):
    m = X.shape[0]

    num_iterations = 0

    # Continue till labels are classified
    while True:
        miss = 0
        for j in range(m):
            Z = y[j]*(np.dot(W.T, X[j]))
            if (Z[0][0] <= 0):
                W = W + y[j]*X[j]
                miss += 1
        
        print(f"Iteration {num_iterations}: {miss} missclassifications")
        num_iterations += 1

        if (miss == 0):
            break
        
    return W

In [166]:
W = fit(X_train, y_train, W)

Iteration 0: 222 missclassifications
Iteration 1: 149 missclassifications
Iteration 2: 124 missclassifications
Iteration 3: 117 missclassifications
Iteration 4: 93 missclassifications
Iteration 5: 100 missclassifications
Iteration 6: 96 missclassifications
Iteration 7: 93 missclassifications
Iteration 8: 71 missclassifications
Iteration 9: 78 missclassifications
Iteration 10: 50 missclassifications
Iteration 11: 62 missclassifications
Iteration 12: 54 missclassifications
Iteration 13: 65 missclassifications
Iteration 14: 58 missclassifications
Iteration 15: 54 missclassifications
Iteration 16: 59 missclassifications
Iteration 17: 52 missclassifications
Iteration 18: 48 missclassifications
Iteration 19: 44 missclassifications
Iteration 20: 53 missclassifications
Iteration 21: 51 missclassifications
Iteration 22: 33 missclassifications
Iteration 23: 48 missclassifications
Iteration 24: 50 missclassifications
Iteration 25: 49 missclassifications
Iteration 26: 30 missclassifications
Iterat

In [167]:
def get_accuracy(X, y, W):
    m = X.shape[0]

    miss = 0
    for j in range(m):
        Z = y[j]*(np.dot(W.T, X[j]))
        if (Z[0][0] <= 0):
            miss += 1

    return (m-miss)/m

accuracy = get_accuracy(X_test, y_test, W)
print(accuracy)
        

0.09803921568627451


In [176]:
def predict(name):
    # Convert name to feature vector

    name = name.lower()
    
    # Considering all single letters, bigrams and bias term
    num_feats = 703    # 26 letters + 26^2 bigrams + 1 bias term = 703 features

    vec = np.zeros(num_feats)
    vec[num_feats-1] = 1   # Initialize bias term as 1

    # Consider all single letters
    for letter in name:
        # Skipping whitespace and extra characters
        if (ord(letter) < 97 or ord(letter) > 122):
            continue
        vec[ord(letter) - 97] += 1

    # Consider all bigrams
    for i in range(len(name)-1):
        bigram = name[i:i+2]
        if (ord(bigram[0]) < 97 or ord(bigram[0]) > 122 or ord(bigram[0]) < 97 or ord(bigram[1]) > 122):
            continue
        vec[26 + (ord(bigram[0]) - 97)*26 + (ord(bigram[1]) - 97)] += 1

    vec = vec.reshape(-1, 1)
    
    z = np.dot(W.T, vec)

    if z > 0:
        print("I am sure " + name + " is a boy.")
    elif z < 0:
        print("I am sure " + name + " is a girl.")
    else:
        print("I am not sure if " + name + " is a boy or a girl.")

In [208]:
# Testing with our own example
predict("Prit")
predict("Pritika")

I am sure prit is a boy.
I am sure pritika is a girl.
