## Gender Classification with Decision Trees using First Names

In [1]:
# import all the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the names dataset; the path is relative to the notebook's working directory
DATA_PATH = "../assets/data/gender.csv"
df = pd.read_csv(DATA_PATH)

FileNotFoundError: [Errno 2] No such file or directory: '../assets/data/gender.csv'

In [11]:
# Summarise column dtypes and non-null counts of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1294 entries, 0 to 1293
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    1294 non-null   object
 1   Target  1294 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 20.3+ KB


In [12]:
# Peek at the first rows to see the raw Name / Target values
df.head()

Unnamed: 0,Name,Target
0,Yash,1
1,Prit,1
2,Meet,1
3,Drashti,0
4,Saloni,0


In [13]:
# Lowercase every name: the feature encoding further down only recognises 'a'-'z'
lowercased_names = df['Name'].str.lower()
df['Name'] = lowercased_names
df.head()

Unnamed: 0,Name,Target
0,yash,1
1,prit,1
2,meet,1
3,drashti,0
4,saloni,0


In [None]:
# Draw bootstrap resamples of the data: each one has len(df) rows sampled
# with replacement.
num_datasets = 10
SEED = 42    # base seed so the resamples are reproducible across kernel restarts

# Each resample gets its own seed (SEED + i) so the resamples differ from one
# another while the whole collection stays deterministic.
datasets = [
    df.sample(n=len(df), replace=True, random_state=SEED + i)
    for i in range(num_datasets)
]

In [30]:
# Convert each name to feature vector
X = []
y = []

# Considering the last character and bigrams in name
num_feats = 702    # 26 letters + 26*26 bigrams = 702 features

ALPHABET_SIZE = 26
ORD_A = ord('a')


def _letter_index(ch):
    """Return 0-25 for the characters 'a'-'z', or None for anything else."""
    idx = ord(ch) - ORD_A
    return idx if 0 <= idx < ALPHABET_SIZE else None


# Iterate over the two columns directly instead of df.iloc[i][...], which
# performs two row lookups per example.
for name, target in zip(df['Name'], df['Target']):
    # Rows whose last character is not a lowercase letter are dropped entirely;
    # an empty name has no last character, so it is dropped as well.
    if not name:
        continue
    last_idx = _letter_index(name[-1])
    if last_idx is None:
        continue

    vec = np.zeros(num_feats)

    # Feature block 1 (indices 0-25): one-hot of the last character
    vec[last_idx] += 1

    # Feature block 2 (indices 26-701): count of every adjacent letter pair
    for first_ch, second_ch in zip(name, name[1:]):
        first_idx = _letter_index(first_ch)
        second_idx = _letter_index(second_ch)
        # Skipping whitespace and extra characters if any
        if first_idx is None or second_idx is None:
            continue
        vec[ALPHABET_SIZE + first_idx * ALPHABET_SIZE + second_idx] += 1

    # Stored as a column vector of shape (num_feats, 1)
    X.append(vec.reshape(-1, 1))

    # 0 represents girl, anything else is treated as 1 (boy)
    y.append(0 if target == 0 else 1)

# Printing sample feature vector and label
print(X[0].shape)
print(y[0])

(702, 1)
1


In [41]:
# Split data into train, val and test sets (contiguous slices, in the order
# the examples were built)
test_size = 0.1
val_size = 0.1

data = list(zip(X, y))

# Slice boundaries: [0, split_index1) train, [split_index1, split_index2) val,
# [split_index2, end) test
split_index1 = int(len(data) * (1 - test_size - val_size))
split_index2 = int(len(data) * (1 - test_size))

train_data, val_data, test_data = (
    data[:split_index1],
    data[split_index1:split_index2],
    data[split_index2:],
)


def _pairs_to_frames(pairs):
    """Turn a list of (feature_vector, label) pairs into (features_df, labels_df)."""
    feature_vecs, labels = zip(*pairs)
    # Every vector was stored with shape (num_feats, 1); squeeze removes the
    # trailing axis so each example becomes one DataFrame row.
    features_df = pd.DataFrame(np.array(feature_vecs).squeeze())
    labels_df = pd.DataFrame(np.array(labels))
    return features_df, labels_df


X_train, y_train = _pairs_to_frames(train_data)
X_val, y_val = _pairs_to_frames(val_data)
X_test, y_test = _pairs_to_frames(test_data)

print(f'Number of training examples: {len(X_train)}')
print(f'Number of validation examples: {len(X_val)}')
print(f'Number of test examples: {len(X_test)}')

Number of training examples: 1034
Number of validation examples: 129
Number of test examples: 130


In [64]:
def calculate_entropy(y1, y2):
    """Return the size-weighted average entropy of two label partitions.

    Each partition's entropy is computed over the two classes 1 (male) and
    0 (female) using the natural logarithm, so a 50/50 partition scores
    ln(2) and a pure partition scores approximately 0.

    Parameters
    ----------
    y1, y2 : array-like supporting boolean-mask indexing (e.g. numpy arrays)
        Labels that fall on each side of a candidate split.

    Returns
    -------
    float
        (len(y1) * H(y1) + len(y2) * H(y2)) / (len(y1) + len(y2)).
        An empty partition contributes 0; if both are empty the result is 0.0.
    """
    # Added inside the log so that a zero proportion does not produce log(0).
    epsilon = 1e-10

    def _partition_entropy(labels):
        """Entropy of one partition; defined as 0 for an empty partition."""
        count = len(labels)
        if count == 0:
            # Nothing to measure, and dividing by the count would fail.
            return 0.0
        p_male = len(labels[labels == 1]) / count
        p_female = len(labels[labels == 0]) / count
        return -(p_male*np.log(p_male + epsilon) + p_female*np.log(p_female + epsilon))

    total_len = len(y1) + len(y2)
    if total_len == 0:
        return 0.0

    entropy1 = _partition_entropy(y1)
    entropy2 = _partition_entropy(y2)

    return ((len(y1)/total_len)*entropy1 + (len(y2)/total_len)*entropy2)

In [198]:
def get_best_split(X, y, depth):
    """Find the (column, threshold) pair that minimises weighted child entropy.

    For every column, the midpoints between consecutive distinct sorted values
    are tried as thresholds. Rows with value <= threshold go left, the rest go
    right. The first candidate with the lowest entropy wins.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per example.
    y : array-like indexable by a boolean mask
        Labels, in the same row order as X.
    depth : int
        Not used by the search; kept so existing callers keep working.

    Returns
    -------
    tuple
        (best_split_dim, best_split_val, X_left, y_left, X_right, y_right,
        best_entropy). If no column has two distinct values there is nothing
        to split on: best_entropy is float('inf'), all rows are returned on
        the left and the right side is empty.
    """
    best_entropy = float('inf')
    best_split_val = 0
    best_split_dim = 0

    for col in X.columns:
        column = X[col]
        # Sorted distinct values of this column
        distinct_vals = column.drop_duplicates().sort_values().reset_index(drop=True)

        # Test all split values for given column
        for i in range(len(distinct_vals) - 1):
            split_val = (distinct_vals.iloc[i] + distinct_vals.iloc[i + 1]) / 2

            left_mask = column <= split_val
            right_mask = column > split_val

            # Only the labels are needed to score a candidate; the feature
            # rows are sliced once, after the winner is known.
            entropy = calculate_entropy(y[left_mask], y[right_mask])

            if entropy < best_entropy:
                best_entropy = entropy
                best_split_val = split_val
                best_split_dim = col

    if best_entropy == float('inf'):
        # No candidate threshold existed, so do not index X by the placeholder
        # column 0 (it may not exist, and the partition would be arbitrary).
        return best_split_dim, best_split_val, X, y, X.iloc[0:0], y[:0], best_entropy

    # Splitting data based on best value
    left_mask = X[best_split_dim] <= best_split_val
    right_mask = X[best_split_dim] > best_split_val

    X_left = X[left_mask]
    y_left = y[left_mask]

    X_right = X[right_mask]
    y_right = y[right_mask]

    return best_split_dim, best_split_val, X_left, y_left, X_right, y_right, best_entropy


In [199]:
# Internal (decision) node of the tree
class Node:
    """A split on feature `dim` at threshold `val` with two child subtrees."""

    def __init__(self, dim, val, left, right):
        # Which feature is tested and the threshold it is compared against
        self.dim, self.val = dim, val
        # Child subtrees: `left` and `right` branches of the split
        self.left, self.right = left, right

# Terminal node of the tree
class Leaf:
    """A leaf that simply stores the value handed to it in `data`."""

    def __init__(self, data):
        self.data = data

In [215]:
# Building the decision tree
def BuildTree(X_train, y_train, K, max_depth, min_instances):
    """Recursively grow a binary decision tree.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature rows that reach this node.
    y_train : array-like of non-negative ints
        Labels for those rows; flattened with np.ravel before use.
    K : int
        Depth of the node being built (the root is called with 0).
    max_depth : int
        A node with K > max_depth becomes a leaf.
    min_instances : int
        A node holding at most this many rows becomes a leaf.

    Returns
    -------
    Node or Leaf
        A Leaf stores the most frequent label among the rows that reached it.

    Raises
    ------
    ValueError
        If called with no training examples.
    """
    y_train = np.ravel(y_train)
    if len(y_train) == 0:
        raise ValueError("BuildTree requires at least one training example")

    # Prediction used whenever this node ends up as a leaf
    most_frequent_class = np.bincount(y_train).argmax()

    # Pre-pruning: too few rows, or deeper than allowed
    if len(X_train) <= min_instances or K > max_depth:
        return Leaf(most_frequent_class)

    # A node whose labels are all identical cannot be improved by splitting
    if np.all(y_train == y_train[0]):
        return Leaf(most_frequent_class)

    split_dim, split_val, left_X, left_y, right_X, right_y, best_entropy = get_best_split(X_train, y_train, K)

    # No usable split (every feature is constant, or one side came back empty):
    # recursing would hand an empty set to a child, so stop here instead.
    if not np.isfinite(best_entropy) or len(left_y) == 0 or len(right_y) == 0:
        return Leaf(most_frequent_class)

    # A split with (near-)zero child entropy separates the classes almost
    # perfectly, so it is kept; the children become leaves on the next call.
    return Node(
        split_dim,
        split_val,
        BuildTree(left_X, left_y, K+1, max_depth, min_instances),
        BuildTree(right_X, right_y, K+1, max_depth, min_instances),
    )

In [216]:
# Print the tree in pre-order: a node first, then its left and right subtrees
def traverse_tree(node):
    """Print one line per node; anything that is neither Leaf nor Node is ignored."""
    # Leaves carry the predicted label
    if isinstance(node, Leaf):
        print(f"Gender: {node.data}")
        return

    if not isinstance(node, Node):
        return

    if (node.dim < 26):
        # Indices 0-25 encode the last character of the name
        print(f"character: {chr(65 + node.dim)} with value {node.val}")
    else:
        # Indices from 26 upward encode a bigram as 26 + first*26 + second
        first, second = divmod(node.dim - 26, 26)
        print(f"character: {chr(65 + first)}{chr(65 + second)} with value {node.val}")

    for child in (node.left, node.right):
        traverse_tree(child)

In [229]:
# Stopping criteria for tree growth
max_depth = 4
min_instances = 5

# Grow the tree on the training split, starting at depth 0, then print it
root = BuildTree(X_train, y_train, K=0, max_depth=max_depth, min_instances=min_instances)
traverse_tree(root)

character: A with value 0.5
character: I with value 0.5
character: AL with value 0.5
character: U with value 0.5
character: OO with value 0.5
Gender: 1
Gender: 0
character: BH with value 0.5
Gender: 0
Gender: 1
character: IN with value 0.5
character: VA with value 0.5
Gender: 1
Gender: 1
Gender: 0
character: JI with value 0.5
character: AN with value 0.5
character: RS with value 0.5
Gender: 0
Gender: 1
Gender: 0
Gender: 1
character: DR with value 0.5
character: AU with value 0.5
character: AP with value 0.5
character: HN with value 0.5
Gender: 0
Gender: 1
Gender: 1
Gender: 1
character: EN with value 0.5
Gender: 0
Gender: 1


In [230]:
# Predict the label of a single example
def evaluate_instance(X, root):
    """Walk the tree from `root` for feature row `X` and return the value stored
    in `.data` of the node where the walk ends.

    At every Node the feature X[node.dim] is compared with node.val:
    values <= the threshold follow the left branch, larger values the right.
    If `root` is not a Node the loop is skipped and root.data is returned.
    """
    node = root
    while isinstance(node, Node):
        node = node.left if X[node.dim] <= node.val else node.right
    return node.data

In [231]:
def evaluate(X, y, tree=None):
    """Score the decision tree on a labelled set.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows, one example per row.
    y : array-like
        True labels (0 or 1). Any shape is accepted, e.g. an (n, 1)
        DataFrame; it is flattened to 1-D.
    tree : Node or Leaf, optional
        Tree to evaluate. Defaults to the notebook-level `root`.

    Returns
    -------
    (accuracy, conf_matrix)
        accuracy is the fraction of correct predictions. conf_matrix is a 2x2
        nested list indexed as conf_matrix[predicted][actual].

    Raises
    ------
    ValueError
        If X and y do not contain the same number of examples.
    """
    if tree is None:
        tree = root

    # Flatten so that each label is a scalar: calling int() on a 1-element
    # array (as produced by an (n, 1) label frame) is deprecated in NumPy.
    y = np.ravel(np.asarray(y))
    if len(X) != len(y):
        raise ValueError(f"X has {len(X)} rows but y has {len(y)} labels")

    y_hat = np.zeros(y.shape)
    for i in range(len(X)):
        y_hat[i] = evaluate_instance(X.iloc[i], tree)

    # Confusion Matrix: rows are predictions, columns are true labels
    conf_matrix = [[0, 0], [0, 0]]
    for predicted, actual in zip(y_hat, y):
        conf_matrix[int(predicted)][int(actual)] += 1

    # Accuracy
    accuracy = np.sum(y == y_hat) / len(y)
    return accuracy, conf_matrix

In [232]:
# Score the tree on the validation split; this is the number used to tune
# the tree height and min_instances
val_accuracy, conf_matrix = evaluate(X=X_val, y=y_val)
print(f"Accuracy for validation set: {val_accuracy}")
print(conf_matrix)

Accuracy for validation set: 0.8992248062015504
[[61, 5], [8, 55]]


In [233]:
# Score the tree on the held-out test split
test_accuracy, conf_matrix = evaluate(X=X_test, y=y_test)
print(f"Accuracy for test set: {test_accuracy}")
print(conf_matrix)

Accuracy for test set: 0.8615384615384616
[[74, 4], [14, 38]]
