## Gender Classification with Kernelized Perceptron using First Names

In [35]:
# import all the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [36]:
# Read the dataset
df = pd.read_csv("../../assets/data/gender.csv")

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1294 entries, 0 to 1293
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    1294 non-null   object
 1   Target  1294 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 20.3+ KB


In [38]:
df.head()

Unnamed: 0,Name,Target
0,Yash,1
1,Prit,1
2,Meet,1
3,Drashti,0
4,Saloni,0


In [39]:
# Convert each name to feature vector
X = []
y = []

# Considering the last character, unigrams, and bigrams in name
num_feats = 26 + 26*26 + 1  # 26 letters + 26*26 bigrams + 1 bias

for i in range(len(df)):
    name = df.iloc[i]['Name']
    target = df.iloc[i]['Target']
    name = name.lower()

    vec = np.zeros(num_feats)
    vec[num_feats-1] = 1   # Initialize bias term as 1

    # Consider last character of name
    vec[ord(name[-1]) - 97] += 1

    # Consider all bigrams
    for j in range(len(name)-1):
        bigram = name[j:j+2]
        index = 26 + (ord(bigram[0]) - 97)*26 + (ord(bigram[1]) - 97)
        vec[index] += 1

    # # Consider all trigrams
    # for k in range(len(name)-2):
    #     trigram = name[k:k+3]
    #     index = 26*26 + (ord(trigram[0]) - 97)*26*26 + (ord(trigram[1]) - 97)*26 + (ord(trigram[2]) - 97)
    #     vec[index] += 1

    X.append(vec)

    if target == 0:
        # -1 represents girl
        y.append(-1)
    else:
        # 1 represents boy
        y.append(1)

# Printing sample feature vector and label
print(X[0].shape)
print(y[0])


(703,)
1


In [40]:
# Split data into train and test sets
test_size = 0.1

data = list(zip(X, y))

split_index = int(len(data) * (1 - test_size))
train_data = data[:split_index]
test_data = data[split_index:]

X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)

X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(y_train), np.array(y_test)

y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

print(f'Number of training examples: {len(X_train)}')
print(f'Number of test examples: {len(X_test)}')

Number of training examples: 1164
Number of test examples: 130


In [41]:
# Defining linear kernel function
def linear_kernel(x1, x2):
    return np.dot(x1, x2)

In [42]:
# Defining polynomial kernel function
def polynomial_kernel(x, y, p=3):
    return (1 + np.dot(x, y)) ** p

In [43]:
# Defining radial basis function kernel function
def rbf_kernel(x, y, sigma=5.0):
    return np.exp(-(np.linalg.norm(x-y)**2) / (2 * (sigma**2)))

In [44]:
# Defining the kernel matrix
num_samples = len(X_train)
K = np.zeros((num_samples,num_samples))

for i in range(num_samples):
    for j in range(num_samples):
        K[i][j] = rbf_kernel(X_train[i], X_train[j])

In [45]:
# Defining the alpha vector
alpha = np.zeros((num_samples,1))

In [46]:
def fit(X, y, alpha, K, threshold=250):
    m = X.shape[0]
    num_iterations = 0
    missclassifications = []  # List to store missclassifications at each iteration

    # Continue till all examples are classified
    while True:
        miss = 0

        for i in range(m):
            ksum = 0
            for j in range(m):
                ksum += alpha[j]*K[i][j]

            Z = y[i]*ksum

            if (Z[0] <= 0):
                miss += 1
                alpha[i] += y[i]
        
        missclassifications.append(miss)
        
        if (num_iterations % 10 == 0):
            print(f"Iteration {num_iterations}: {miss} missclassifications")
            
        num_iterations += 1

        if (miss == 0):
            print(f"Iteration {num_iterations}: Converged!")
            break
            
        if (num_iterations > threshold):
            print("Algorithm did not converge!")
            break
    
    return alpha, missclassifications

In [47]:
alpha, missclassifications = fit(X_train, y_train, alpha, K)

Iteration 0: 263 missclassifications
Iteration 10: 118 missclassifications
Iteration 20: 92 missclassifications
Iteration 30: 76 missclassifications
Iteration 40: 84 missclassifications
Iteration 50: 60 missclassifications
Iteration 60: 44 missclassifications
Iteration 70: 56 missclassifications
Iteration 80: 48 missclassifications
Iteration 90: 29 missclassifications
Iteration 100: 25 missclassifications
Iteration 110: 22 missclassifications
Iteration 120: 30 missclassifications
Iteration 130: 20 missclassifications
Iteration 140: 34 missclassifications
Iteration 150: 10 missclassifications
Iteration 160: 6 missclassifications
Iteration 166: Converged!


In [None]:
# Visualizing missclassifications vs num_iterations
plt.plot(range(1, len(missclassifications) + 1), missclassifications)
plt.xlabel('Number of Iterations')
plt.ylabel('Missclassifications')
plt.title('Missclassifications vs Number of Iterations')
plt.grid(True)
plt.show()

In [32]:
def get_accuracy(X, y, X_train, alpha):
    m = X.shape[0]
    n = X_train.shape[0]
    print(f"Total number of test samples: {m}")

    miss = 0
    for i in range(m):
        ksum = 0
        for j in range(n):
            ksum += alpha[j]*linear_kernel(X[i], X_train[j])
            
        Z = y[i]*ksum
        
        if (Z[0] <= 0):
            miss += 1
    
    print(f"Number of missclassified samples: {miss}")

    return (m-miss)/m

accuracy = get_accuracy(X_test, y_test, X_train, alpha)
print(f"Accuracy on test set: {accuracy}")

Total number of test samples: 130
Number of missclassified samples: 18
Accuracy on test set: 0.8615384615384616


In [33]:
print(alpha)

[[1.]
 [0.]
 [0.]
 ...
 [3.]
 [0.]
 [1.]]


In [34]:
for i in range(len(alpha)):
    if (alpha[i] != 0):
        print(df.iloc[i]['Name'])

Yash
Drashti
Hinal
Jay
Darshana
Hardik
Janvi
Ronak
Naman
Khyati
Sikha
Minal
Milan
Kaushik
Smit
Ravina
Hetal
Pooja
Komal
Mihir
kunjal
Mona
Bhavya
Bhavika
Hetavi
Raghav
Gopal
Sonal
Kashyap
Dhruv
Ishva
Denish
Abhi
Jenny
Dhruvi
Margi
Kajal
Mital
Nevil
Krishna
Dhavni
Hemanshi
Shyla
Amar
Diya
Ananya
Aditi
Nikhil
Ashwin
Shaila
Salina
Devi
Dhara
Rajiv
Bhanu
Manisha
Harshvi
Anshu
Shanti
Anuradha
Dev
Devendra
Janak
Mohan
Shivani
Balaraj
Sanjay
Sonam
shakshi
Neetu
Snehal
Vaishaki
Salman
Jethalal
Iyer
Pranjali
Hrutik
Pravin
Prince
Prem
Anand
Ritu
Kadambari
Champa
Ashu
Hanuman
Anirudh
Nilesha
Siddhi
Gulam
Shiv
Deep
Pankaj
Sumit
Preeti
Himmat
Nishit
shashank
Ruby
Purvi
Heet
Heer
Eram
Sayandip
Anshul
Harsha
Harsh
Krishma
Savita
Rajan
Noopur
Mushk
Doly
Viren
Amita
Dimpal
Alisha
Zaid
Misbah
Jinal
Nilam
Omkar
Lalita
Narendra
Shenaz
Kiran
Palak
Saroj
Shivam
Ramnik
Manoj
Anshuman
Namira
Raavi
Ravi
Div
Taral
Sarfarz
Ali
Yogita
Rajendra
devki
Nisha
Sita
Mahendra
Sachin
Ravindra
Ambati
Bhawati
Heet
Zeel
Aksh