In [None]:
import pandas as pd
import re
import torch
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the master table; later cells read its `name` and `gender` columns.
df = pd.read_csv('/content/CodeBotixMaster.csv')

def preprocess_name(name):
    """Extract a normalised first-name token from a raw name string.

    The input is lower-cased, stripped, has every '.' turned into a space, and
    loses every character that is not a-z, whitespace, apostrophe or hyphen.
    The first whitespace-separated token that is not a single character is
    returned, so leading initials such as "j." are skipped.

    Returns "" for null input, for input with no tokens left after cleaning,
    and when every token is a single character.
    """
    if pd.isnull(name):
        return ""

    cleaned = re.sub(
        r'[^a-z\s\'-]',
        '',
        name.lower().strip().replace('.', ' '),
    )

    # split() never yields empty tokens, so "not length 1" means "2+ characters".
    return next((token for token in cleaned.split() if len(token) != 1), "")

# Derive `first_name`: the first multi-character token of each cleaned `name` ("" when none).
df['first_name'] = df['name'].apply(preprocess_name)

In [None]:
# Labelled names for training; later cells read its `Name` and `Gender` columns.
train_df = pd.read_csv('/content/Gender_Data.csv')

# Normalise names to lower-case, whitespace-trimmed strings.
# NOTE(review): astype(str) turns a missing value into the literal string "nan" — confirm the file has none.
train_df['Name'] = train_df['Name'].astype(str).str.lower().str.strip()


In [None]:
# Character vocabulary: every distinct character seen in the training names,
# numbered from 1 in sorted order. Id 0 is never assigned to a character;
# encode_name uses it for padding and for characters outside the vocabulary.
chars = set("".join(train_df['Name']))
char2idx = dict(zip(sorted(chars), range(1, len(chars) + 1)))
idx2char = dict((position, symbol) for symbol, position in char2idx.items())
# +1 accounts for the reserved id 0.
vocab_size = 1 + len(char2idx)

def encode_name(name, max_len):
    """Encode `name` as a list of exactly `max_len` character ids.

    Each character is looked up in the module-level `char2idx`; characters not
    in the mapping become 0. Names longer than `max_len` are truncated, shorter
    ones are right-padded with 0.
    """
    encoded = [char2idx.get(symbol, 0) for symbol in name][:max_len]
    # A non-positive repeat count yields an empty list, so no padding is added
    # once the sequence already has max_len entries.
    padding = [0] * (max_len - len(encoded))
    return encoded + padding

# Every name is padded to the length of the longest training name.
max_len = max(map(len, train_df['Name']))
# One row of character ids per name.
X = np.array([encode_name(name, max_len) for name in train_df['Name']])
# Labels are taken as-is from the `Gender` column.
y = train_df['Gender'].values

In [None]:
class NameDataset(Dataset):
    """Tensor-backed dataset pairing encoded names with their labels.

    `X` is converted to a ``torch.long`` tensor and `y` to a ``torch.float32``
    tensor once, at construction time; items are then served by plain indexing.
    """

    def __init__(self, X, y):
        encoded = torch.tensor(X, dtype=torch.long)
        labels = torch.tensor(y, dtype=torch.float32)
        self.X = encoded
        self.y = labels

    def __len__(self):
        # Sample count is the size of the first dimension of X.
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        label = self.y[idx]
        return features, label

In [None]:
# Wrap the encoded names and labels so DataLoader can batch them.
dataset = NameDataset(X, y)

# 80/20 train/validation split; validation takes the remainder so the two
# sizes always sum to len(dataset).
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so the same rows land in train/val on every Restart & Run All.
# Without a generator the partition, and therefore the reported validation
# metrics, changes from run to run.
train_ds, val_ds = random_split(
    dataset, [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Only the training loader shuffles; validation order does not affect its metrics.
train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=256)

In [None]:
class LSTMClassifier(nn.Module):
    """Character-level LSTM that outputs one probability per input sequence.

    Pipeline: embedding (id 0 is the padding row) -> LSTM -> linear layer on the
    last layer's final hidden state -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table (padding id included).
        embed_dim: size of each character embedding.
        hidden_dim: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return probabilities of shape (batch,) for ids `x` of shape (batch, seq_len)."""
        x = self.embedding(x)
        # h_n stacks the final hidden state of each layer; h_n[-1] is the top layer's.
        # NOTE(review): padded positions are run through the LSTM as well, so this
        # state is taken after any trailing padding.
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])
        # Squeeze only the feature axis. A bare squeeze() would also drop the
        # batch axis when the batch holds a single row, producing a 0-d tensor
        # that no longer matches a (1,) target in the loss.
        return self.sigmoid(out).squeeze(-1)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(vocab_size, embed_dim=64, hidden_dim=128).to(device)
# BCELoss takes probabilities, matching the sigmoid applied inside LSTMClassifier.forward.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
def train_model(model, train_loader, val_loader, epochs=5):
    """Train `model` for `epochs` passes over `train_loader`, printing metrics per epoch.

    Uses the module-level `device`, `optimizer` and `criterion`. After each
    epoch the model is scored on `val_loader` via `evaluate`, and the
    sample-weighted train loss, train accuracy, validation loss and validation
    accuracy are printed. Returns None.
    """
    for epoch in range(epochs):
        # evaluate() leaves the model in eval mode, so re-enable training each epoch.
        model.train()
        total_loss = 0
        hits = 0
        total_samples = 0

        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            probs = model(inputs)
            batch_loss = criterion(probs, targets)
            batch_loss.backward()
            optimizer.step()

            # Weight the batch loss by batch size so the epoch average is per-sample.
            n_batch = len(targets)
            total_loss += batch_loss.item() * n_batch
            matches = (probs > 0.5).int() == targets.int()
            hits += matches.sum().item()
            total_samples += n_batch

        train_acc = hits / total_samples
        val_acc, val_loss = evaluate(model, val_loader)
        print(f"Epoch {epoch+1}/{epochs} | "
              f"Train Loss: {total_loss/total_samples:.4f}, Train Acc: {train_acc:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

def evaluate(model, loader):
    """Score `model` on `loader` without tracking gradients.

    Uses the module-level `device` and `criterion`. Puts the model in eval mode
    and does not restore the previous mode.

    Returns:
        (accuracy, mean_loss), both averaged over the number of samples seen.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    seen = 0

    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            probs = model(inputs)

            n_batch = len(targets)
            # Weight by batch size so a smaller final batch does not skew the mean.
            loss_sum += criterion(probs, targets).item() * n_batch
            matches = (probs > 0.5).int() == targets.int()
            hits += matches.sum().item()
            seen += n_batch

    accuracy = hits / seen
    mean_loss = loss_sum / seen
    return accuracy, mean_loss

In [None]:
# Train for 5 epochs; train_model prints train/validation loss and accuracy after each one.
train_model(model, train_loader, val_loader, epochs=5)

Epoch 1/5 | Train Loss: 0.1512, Train Acc: 0.9463, Val Loss: 0.1889, Val Acc: 0.9323
Epoch 2/5 | Train Loss: 0.1456, Train Acc: 0.9495, Val Loss: 0.1977, Val Acc: 0.9292
Epoch 3/5 | Train Loss: 0.1442, Train Acc: 0.9501, Val Loss: 0.1910, Val Acc: 0.9322
Epoch 4/5 | Train Loss: 0.1371, Train Acc: 0.9531, Val Loss: 0.1965, Val Acc: 0.9277
Epoch 5/5 | Train Loss: 0.1324, Train Acc: 0.9544, Val Loss: 0.2019, Val Acc: 0.9287


In [None]:
def predict_gender(new_df, model, max_len, char2idx):
    """Add a 0/1 ``Predicted_Gender`` column to `new_df` based on its ``Name`` column.

    `new_df` is modified in place (its ``Name`` column is normalised to
    lower-case, stripped strings) and the same object is returned.

    Args:
        new_df: DataFrame with a ``Name`` column.
        model: classifier returning one probability per encoded name.
        max_len: length every name is padded/truncated to.
        char2idx: kept for interface compatibility; this function does not read
            it, because `encode_name` looks characters up in the module-level mapping.

    Returns:
        `new_df` with ``Predicted_Gender`` set to 1 where the predicted
        probability exceeds 0.5, else 0.
    """
    new_df['Name'] = new_df['Name'].astype(str).str.lower().str.strip()

    # Nothing to score: an empty frame would encode to an array with no
    # sequence axis, which the model cannot process.
    if len(new_df) == 0:
        new_df['Predicted_Gender'] = np.array([], dtype=int)
        return new_df

    X_new = np.array([encode_name(n, max_len) for n in new_df['Name']])
    X_new = torch.tensor(X_new, dtype=torch.long).to(device)

    # Run inference in eval mode, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # reshape(-1) keeps predictions 1-D even if the model returns a
            # 0-d result for a single-row batch.
            preds = model(X_new).cpu().numpy().reshape(-1)
    finally:
        model.train(was_training)

    new_df['Predicted_Gender'] = (preds > 0.5).astype(int)
    return new_df

In [None]:
# Class balance before imputation, with missing values counted.
print(df['gender'].value_counts(dropna=False))

mask_missing = df['gender'].isna()
missing_names = df.loc[mask_missing, 'first_name']

# Only call the model when there is something to fill. This keeps the cell safe
# to re-run once the gaps are already filled, when the batch would be empty.
if mask_missing.any():
    predicted_df = predict_gender(
        pd.DataFrame({"Name": missing_names}),
        model, max_len, char2idx
    )

    # predicted_df is built from missing_names and so carries the same index;
    # the assignment below aligns on it.
    df.loc[mask_missing, 'gender'] = predicted_df['Predicted_Gender']

print(df['gender'].value_counts())



In [None]:
# Map the numeric labels to readable strings (0 -> "male", 1 -> "female").
# One pass is enough: afterwards no 0/1 values remain to replace.
df['gender'] = df['gender'].replace({0: "male", 1: "female"})

In [None]:
# Display the table after the missing `gender` values have been filled and relabelled.
df