# Gender Classification from names

## Edit dataset

In [63]:
import pandas as pd

# Load the raw customer file; its fields are separated by semicolons
GenderData = pd.read_csv('KGGenderDataset.csv', sep=';')
GenderData.head()

Unnamed: 0,CustomerId,CustomerName,Gender
0,,GERALL OWEN,M
1,75.0,Mardatomi Maselta,M
2,80.0,Stefanus Suyono,M
3,697.0,Mohamad Iwan Cahyadi,M
4,780.0,HKF Official,F


In [64]:
# Discard the column that is not needed
GenderData = GenderData.drop(columns=['CustomerId'])

In [65]:
GenderData.head()  # preview the first rows of the current frame

Unnamed: 0,CustomerName,Gender
0,GERALL OWEN,M
1,Mardatomi Maselta,M
2,Stefanus Suyono,M
3,Mohamad Iwan Cahyadi,M
4,HKF Official,F


In [66]:
GenderData.dtypes  # dtype of every remaining column

CustomerName    object
Gender          object
dtype: object

In [67]:
GenderData.isna().sum() # count null values per column to see whether any exist

CustomerName    1
Gender          0
dtype: int64

In [68]:
GenderData = GenderData.dropna() # drop every row that contains a null value

In [69]:
GenderData.isna().sum() # check again that no null values remain

CustomerName    0
Gender          0
dtype: int64

In [70]:
# Function to split names
def split_name(name):
    """Lower-case a full name and spread its words over five fixed slots.

    Slots 1-4 receive the first four whitespace-separated words. The fifth
    slot receives every remaining word joined by single spaces.

    Slots with no word are filled with a single space ``' '`` (not an empty
    string). NOTE(review): presumably this keeps the fields from turning into
    NaN when the saved CSV is read back -- confirm before changing it.

    Parameters
    ----------
    name : str
        Full customer name.

    Returns
    -------
    pandas.Series
        Five strings: first, second, third, fourth and trailing name parts.
    """
    filler = ' '
    words = name.lower().split()

    # The first four words map one-to-one onto the first four slots
    head = words[:4]
    head = head + [filler] * (4 - len(head))

    # Everything from the fifth word on is collapsed into the last slot
    tail = ' '.join(words[4:]) if len(words) > 4 else filler

    return pd.Series(head + [tail])
        
# Apply function to the DataFrame: split_name returns a five-element Series per
# row, which is assigned to the five new name-part columns
GenderData[['FirstName', 'SecondName', 'ThirdName', 'FourthName', 'LastName']] = GenderData['CustomerName'].apply(split_name)

In [71]:
GenderData.head(40)  # inspect the first 40 rows with the new name-part columns

Unnamed: 0,CustomerName,Gender,FirstName,SecondName,ThirdName,FourthName,LastName
0,GERALL OWEN,M,gerall,owen,,,
1,Mardatomi Maselta,M,mardatomi,maselta,,,
2,Stefanus Suyono,M,stefanus,suyono,,,
3,Mohamad Iwan Cahyadi,M,mohamad,iwan,cahyadi,,
4,HKF Official,F,hkf,official,,,
5,Rizky Bramantyo,M,rizky,bramantyo,,,
6,Enya grace,F,enya,grace,,,
7,nhia pram,F,nhia,pram,,,
8,Sintya Handayani,F,sintya,handayani,,,
9,Aril Al Kautsar,M,aril,al,kautsar,,


In [72]:
GenderData.isna().sum() # count null values per column, including the new name-part columns

CustomerName    0
Gender          0
FirstName       0
SecondName      0
ThirdName       0
FourthName      0
LastName        0
dtype: int64

In [73]:
# Drop the CustomerName column
GenderData = GenderData.drop(columns=['CustomerName'])

In [74]:
GenderData.head(40)  # inspect the first 40 rows after dropping CustomerName

Unnamed: 0,Gender,FirstName,SecondName,ThirdName,FourthName,LastName
0,M,gerall,owen,,,
1,M,mardatomi,maselta,,,
2,M,stefanus,suyono,,,
3,M,mohamad,iwan,cahyadi,,
4,F,hkf,official,,,
5,M,rizky,bramantyo,,,
6,F,enya,grace,,,
7,F,nhia,pram,,,
8,F,sintya,handayani,,,
9,M,aril,al,kautsar,,


In [75]:
GenderData.to_csv("KGGenderDatasetClean.csv", index=False) # save the clean dataset without writing the row index

## Random Forest

In [15]:
import pandas as pd

# Reload the cleaned dataset (the file written by the to_csv call in the "Edit dataset" section)
GenderData = pd.read_csv('KGGenderDatasetClean.csv')
GenderData.head()

Unnamed: 0,Gender,FirstName,SecondName,ThirdName,FourthName,LastName
0,M,gerall,owen,,,
1,M,mardatomi,maselta,,,
2,M,stefanus,suyono,,,
3,M,mohamad,iwan,cahyadi,,
4,F,hkf,official,,,


### Preprocess data

In [16]:
from sklearn.model_selection import train_test_split

# Function to extract bi-grams from a name
def get_bigrams(name):
    """Return the overlapping two-character substrings of ``name`` as a tuple.

    A string shorter than two characters yields an empty tuple. Any value
    that is not a ``str`` yields the one-element tuple ``("",)``.
    """
    if not isinstance(name, str):
        return ("",)

    last_start = len(name) - 1  # a bigram needs one more character after its start
    return tuple(name[start:start + 2] for start in range(last_start))

# Apply bi-gram tokenization to all name columns
X = GenderData[['FirstName', 'SecondName', 'ThirdName', 'FourthName', 'LastName']].astype(str)
X = X.map(get_bigrams)  

# Convert to categorical type (each distinct bigram tuple becomes one category)
for col in X.columns:
    X[col] = X[col].astype("category")

# Encode y: M → 0, F → 1
y = GenderData['Gender'].map({'M': 0, 'F': 1})

# Splitting the dataset (75% train, 25% test).
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Print example data
print("Example of tokenized and categorized X_train:")
print(X_train.head())

print("\nExample of encoded y_train:")
print(y_train.head())


Example of tokenized and categorized X_train:
                  FirstName                            SecondName ThirdName  \
7140   (he, en, nd, dr, ra)      (ni, ik, ki, ij, ju, ul, lu, uw)        ()   
1800       (be, en, nn, ny)  (an, ng, gg, ga, ar, rb, bi, it, to)     (s.,)   
24787      (ha, as, sn, na)                                    ()        ()   
6105       (wa, aw, wa, an)          (se, et, ty, ya, aw, wa, an)        ()   
29028      (nu, ur, ru, ul)                      (ih, hs, sa, an)        ()   

      FourthName LastName  
7140          ()       ()  
1800          ()       ()  
24787         ()       ()  
6105          ()       ()  
29028         ()       ()  

Example of encoded y_train:
7140     0
1800     0
24787    1
6105     0
29028    1
Name: Gender, dtype: int64


### Algorithm

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
from sklearn.utils import resample
from collections import Counter
from random import sample

class CHAIDTree:
    """Decision tree over categorical features that chooses splits by chi-square test.

    An internal node is a dict ``{"feature": column, "children": {category: subtree}}``;
    a leaf is a class label. The contingency tables count the labels ``0`` and ``1``
    only, so the target is expected to be binary and encoded as 0/1.

    Parameters
    ----------
    min_samples_split : int
        A node holding fewer samples than this becomes a leaf.
    significance_level : float
        A feature is eligible for splitting only if its chi-square p-value is
        below this threshold.
    """

    def __init__(self, min_samples_split=10, significance_level=0.05):
        self.min_samples_split = min_samples_split
        self.significance_level = significance_level
        self.tree = None  # Tree structure will be stored here

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label in ``y``."""
        return Counter(y).most_common(1)[0][0]

    def fit(self, X, y):
        """Build the tree from feature frame ``X`` and label series ``y``."""
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y):
        """Recursively build a subtree; returns a node dict or a leaf label."""
        if len(y) < self.min_samples_split or len(set(y)) == 1:
            return self._majority_class(y)  # Too small or pure: majority class

        best_split = self._find_best_split(X, y)
        if best_split is None:
            return self._majority_class(y)  # No valid split found

        feature, groups = best_split
        node = {"feature": feature, "children": {}}

        for category, (X_sub, y_sub) in groups.items():
            node["children"][category] = self._build_tree(X_sub, y_sub)

        return node

    def _find_best_split(self, X, y):
        """Pick the feature whose categories are most significantly associated with ``y``.

        Returns ``(feature, {category: (X_subset, y_subset)})`` for the feature with
        the smallest p-value below ``significance_level``, or ``None`` when no
        feature qualifies.
        """
        best_feature = None
        best_p_value = float("inf")
        best_categories = None

        for feature in X.columns:
            column = X[feature]
            categories = []
            rows = []
            for cat in column.unique():
                y_cat = y[column == cat]
                if len(y_cat) == 0:
                    # e.g. a NaN "category": it never compares equal, so it selects no rows
                    continue
                categories.append(cat)
                rows.append([(y_cat == 0).sum(), (y_cat == 1).sum()])

            contingency_table = np.array(rows)

            # The chi-square test is undefined (chi2_contingency raises) when a row or
            # column total is zero, and uninformative with fewer than two categories.
            if (contingency_table.shape[0] < 2
                    or not contingency_table.sum(axis=0).all()
                    or not contingency_table.sum(axis=1).all()):
                continue

            chi2, p, _, _ = chi2_contingency(contingency_table, correction=False)

            if p < self.significance_level and p < best_p_value:
                best_feature = feature
                best_p_value = p
                best_categories = categories

        # Compare against None explicitly: a column label such as 0 is falsy but valid
        if best_feature is None:
            return None

        # Materialise the subsets once, only for the winning feature
        column = X[best_feature]
        best_groups = {}
        for cat in best_categories:
            mask = column == cat
            best_groups[cat] = (X[mask], y[mask])

        return best_feature, best_groups

    def _descend(self, sample, node, fallback):
        """Walk from ``node`` to a leaf; unseen categories resolve to ``fallback``."""
        while isinstance(node, dict):
            value = sample.get(node["feature"], None)
            node = node["children"].get(value, fallback)
        return node

    def predict_sample(self, sample, node, y):
        """Predict one row.

        ``sample`` is a row (anything with ``.get(feature, default)``), ``node`` is
        the subtree to start from, and ``y`` supplies the fallback label (its
        majority class) used when the row holds a category the tree never saw.
        """
        if not isinstance(node, dict):  # Leaf node
            return node
        return self._descend(sample, node, self._majority_class(y))

    def predict(self, X, y):
        """Predict every row of ``X``; ``y`` supplies the fallback majority class.

        The fallback is computed once per call rather than once per visited node.
        """
        if not isinstance(self.tree, dict):
            # The whole tree is a single leaf (or fit was never called)
            return np.array([self.tree for _ in range(len(X))])
        fallback = self._majority_class(y)
        return np.array([self._descend(sample, self.tree, fallback) for _, sample in X.iterrows()])


class RandomForestCHAID:
    """Bagged ensemble of ``CHAIDTree`` models with per-tree feature subsets.

    Parameters
    ----------
    n_trees : int
        Number of trees to train.
    sample_size : float
        Fraction of the training rows drawn (via ``resample``) for each tree.
    min_samples_split, significance_level
        Passed through to every ``CHAIDTree``.
    max_features : {"sqrt", "log2"}, int or other
        How many columns each tree sees; any other value means all columns.
    random_state : int or None
        Seed for the bootstrap and the feature selection. ``None`` (default)
        keeps the unseeded behaviour.
    """

    def __init__(self, n_trees=100, sample_size=0.8, min_samples_split=10, significance_level=0.05, max_features="sqrt", random_state=None):
        self.n_trees = n_trees
        self.sample_size = sample_size
        self.min_samples_split = min_samples_split
        self.significance_level = significance_level
        self.max_features = max_features  # Feature bagging method
        self.random_state = random_state
        self.trees = []
        self.selected_features = []  # Track selected features per tree
        self._train_labels = None  # Labels seen in fit(); fallback source at predict time

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on a resampled row set and a random column subset."""
        n_samples = int(self.sample_size * len(X))
        n_features = self._calculate_n_features(X)  # Determine number of features per tree

        # Start from a clean slate so calling fit() twice does not stack two forests
        self.trees = []
        self.selected_features = []
        # Keep the labels: each tree needs them at predict time for unseen categories
        self._train_labels = y

        columns = list(X.columns)
        rng = None if self.random_state is None else np.random.RandomState(self.random_state)

        for _ in range(self.n_trees):
            # Bootstrap sample the dataset (a shared RandomState advances between
            # trees, so every tree still gets a different sample)
            X_sample, y_sample = resample(X, y, n_samples=n_samples, random_state=rng)

            # Feature Bagging: Select a random subset of features
            if rng is None:
                selected_features = tuple(sample(columns, n_features))
            else:
                order = rng.permutation(len(columns))[:n_features]
                selected_features = tuple(columns[i] for i in order)
            self.selected_features.append(selected_features)

            # Train a CHAID Decision Tree using only the selected features
            tree = CHAIDTree(min_samples_split=self.min_samples_split, significance_level=self.significance_level)
            tree.fit(X_sample[list(selected_features)], y_sample)

            self.trees.append(tree)

    def _calculate_n_features(self, X):
        """ Determines number of features to use for feature bagging """
        if self.max_features == "sqrt":
            return max(1, int(np.sqrt(len(X.columns))))
        elif self.max_features == "log2":
            return max(1, int(np.log2(len(X.columns))))
        elif isinstance(self.max_features, int):
            return min(len(X.columns), self.max_features)
        else:
            return len(X.columns)  # Default: Use all features (no feature bagging)

    def predict(self, X):
        """Return the majority vote of all trees for every row of ``X``.

        Raises
        ------
        RuntimeError
            If the forest holds no trees (``fit`` has not been called).
        """
        if not self.trees:
            raise RuntimeError("RandomForestCHAID.predict() called before fit()")

        # Use the labels captured in fit() rather than a notebook-level global
        tree_preds = np.array([tree.predict(X[list(features)], self._train_labels)
                               for tree, features in zip(self.trees, self.selected_features)])

        # Majority Voting
        final_preds = [Counter(tree_preds[:, i]).most_common(1)[0][0] for i in range(X.shape[0])]
        return np.array(final_preds)


from sklearn.metrics import accuracy_score

# Build the forest; each tree sees sqrt(feature count) of the name-part columns
rf = RandomForestCHAID(max_features="sqrt", sample_size=0.8, n_trees=100)
rf.fit(X_train, y_train)

# Predict the held-out rows
y_pred = rf.predict(X_test)

# Report the share of held-out rows classified correctly
print("Random Forest CHAID Accuracy:", accuracy_score(y_test, y_pred))


## ANN

### Preprocess data

In [None]:
import pandas as pd

# Reload the cleaned dataset so this section starts from the saved file
GenderData = pd.read_csv('KGGenderDatasetClean.csv')
# NOTE(review): head() is not the last expression of this cell, so its result is not displayed
GenderData.head()

# Function to extract bi-grams from a name
def get_bigrams(name):
    """Tuple of every pair of adjacent characters in ``name``.

    Strings with fewer than two characters give ``()``; a non-string input
    gives ``("",)``.
    """
    if isinstance(name, str):
        # Pair each character with its right-hand neighbour
        return tuple(left + right for left, right in zip(name, name[1:]))
    return ("",)

# Imported here so this section also runs when the Random Forest section was skipped
from sklearn.model_selection import train_test_split

# Apply bi-gram tokenization to all name columns
X = GenderData[['FirstName', 'SecondName', 'ThirdName', 'FourthName', 'LastName']].astype(str)
X = X.map(get_bigrams)  

# Convert to categorical type (each distinct bigram tuple becomes one category)
for col in X.columns:
    X[col] = X[col].astype("category")

# Encode y: M → 0, F → 1
y = GenderData['Gender'].map({'M': 0, 'F': 1})

# Splitting the dataset (75% train, 25% test).
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Print example data
print("Example of tokenized and categorized X_train:")
print(X_train.head())

print("\nExample of encoded y_train:")
print(y_train.head())


### Algorithm (not finished yet)

In [None]:

# Define activation functions and their derivatives
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise."""
    return 1 / (1 + np.exp(-x))


def sigmoid_derivative(activated):
    """Sigmoid derivative expressed through the sigmoid's own output.

    ``NeuralNetwork.backward`` passes the already-activated output, so the
    argument is ``s = sigmoid(z)`` and the derivative is ``s * (1 - s)``.
    """
    return activated * (1 - activated)


def relu(x):
    """Rectified linear unit: ``max(0, x)`` element-wise."""
    return np.maximum(0, x)


def relu_derivative(activated):
    """ReLU derivative: 1 where the value is positive, otherwise 0.

    ``NeuralNetwork.backward`` passes the ReLU output; it is positive exactly
    where the pre-activation was positive, so the result is the same mask.
    """
    return np.where(activated > 0, 1.0, 0.0)


# Define the Neural Network class
class NeuralNetwork:
    """Feed-forward network with one ReLU hidden layer and a sigmoid output layer.

    Training is full-batch gradient descent on the squared error. Weights are
    drawn from a standard normal distribution; biases start at zero.

    Parameters
    ----------
    input_size : int
        Number of input features per sample.
    hidden_size : int
        Number of hidden units.
    output_size : int
        Number of output units.
    """

    def __init__(self, input_size, hidden_size, output_size):
        # Initialize weights
        self.weights_input_hidden = np.random.randn(input_size, hidden_size)
        self.weights_hidden_output = np.random.randn(hidden_size, output_size)

        # Initialize biases
        self.bias_hidden = np.zeros((1, hidden_size))
        self.bias_output = np.zeros((1, output_size))

    def _prepare_targets(self, y):
        """Return ``y`` as a float array shaped like the current network output.

        A 1-D target vector is promoted to a single column. Any remaining shape
        mismatch raises ``ValueError`` instead of being broadcast silently
        (a length-n vector against an (n, 1) output would otherwise produce an
        (n, n) error matrix).
        """
        targets = np.asarray(y, dtype=float)
        if targets.ndim == 1:
            targets = targets.reshape(-1, 1)
        if targets.shape != self.output.shape:
            raise ValueError(
                f"y has shape {targets.shape} but the network output has shape {self.output.shape}"
            )
        return targets

    def forward(self, X):
        """Run a forward pass and return the sigmoid activations of the output layer.

        Intermediate values are stored on the instance for use by ``backward``.
        """
        X = np.asarray(X, dtype=float)

        self.hidden_layer_input = np.dot(X, self.weights_input_hidden) + self.bias_hidden
        self.hidden_layer_output = relu(self.hidden_layer_input)

        self.output_layer_input = np.dot(self.hidden_layer_output, self.weights_hidden_output) + self.bias_output
        self.output = sigmoid(self.output_layer_input)

        return self.output

    def backward(self, X, y, learning_rate):
        """Back-propagate the error of the last ``forward`` call and update the parameters.

        ``forward`` must have been called with the same ``X`` beforehand.
        """
        X = np.asarray(X, dtype=float)
        y = self._prepare_targets(y)

        # Calculate the error
        output_error = y - self.output
        output_delta = output_error * sigmoid_derivative(self.output)

        hidden_error = output_delta.dot(self.weights_hidden_output.T)
        hidden_delta = hidden_error * relu_derivative(self.hidden_layer_output)

        # Update weights and biases (the error is target - output, hence "+=")
        self.weights_hidden_output += self.hidden_layer_output.T.dot(output_delta) * learning_rate
        self.bias_output += np.sum(output_delta, axis=0, keepdims=True) * learning_rate

        self.weights_input_hidden += X.T.dot(hidden_delta) * learning_rate
        self.bias_hidden += np.sum(hidden_delta, axis=0, keepdims=True) * learning_rate

    def train(self, X, y, epochs, learning_rate):
        """Run ``epochs`` full-batch updates, printing the mean squared error every 1000 epochs."""
        # Convert once up front instead of on every epoch
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        for epoch in range(epochs):
            self.forward(X)
            self.backward(X, y, learning_rate)

            if epoch % 1000 == 0:
                loss = np.mean(np.square(self._prepare_targets(y) - self.output))
                print(f'Epoch {epoch}, Loss: {loss}')

# Create and train the Neural Network
# NOTE(review): stop-gap numeric encoding so this unfinished cell can run end to end.
# The name-part columns hold categorical bigram tuples, which cannot be multiplied
# by a weight matrix. Here each column's integer category code is scaled to [0, 1].
# TODO: replace with a proper bigram feature encoding.
code_scale = np.array(
    [max(len(X_train[col].cat.categories) - 1, 1) for col in X_train.columns],
    dtype=float,
)
X_train_num = X_train.apply(lambda col: col.cat.codes).to_numpy(dtype=float) / code_scale
X_test_num = X_test.apply(lambda col: col.cat.codes).to_numpy(dtype=float) / code_scale

# The network has two output units, so the 0/1 labels are one-hot encoded
y_train_onehot = np.eye(2)[y_train.to_numpy(dtype=int)]

nn = NeuralNetwork(input_size=X_train_num.shape[1], hidden_size=5, output_size=2)
nn.train(X_train_num, y_train_onehot, epochs=10000, learning_rate=0.01)

# Test the Neural Network
predictions = nn.forward(X_test_num)
predicted_classes = np.argmax(predictions, axis=1)
# y_test already holds class indices (0/1), so no argmax is needed on it
true_classes = y_test.to_numpy()

# Calculate accuracy
accuracy = np.mean(predicted_classes == true_classes)
print(f'Accuracy: {accuracy * 100}%')
