**Problem Statement #1:**
Build a sequential model to classify names into gender.

Input to the model will be a name, i.e. a sequence of characters.

Use one hot representation of the  characters.

Remove non-ascii characters, if there are any



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

# Loading the dataset
data = pd.read_csv('name_gender.csv')

# Removing non-ASCII characters
data['name'] = data['name'].apply(lambda x: ''.join([c for c in x if ord(c) < 128]))

# Mapping gender to numeric values
data['gender'] = data['gender'].map({'M': 0, 'F': 1})

# Creating a set of all unique characters
chars = set(''.join(data['name'].values))
char_to_index = {ch: idx + 1 for idx, ch in enumerate(chars)}  # index starts from 1

# One-hot encoding the names
def encode_name(name):
    return [char_to_index.get(c, 0) for c in name]  # default to 0 for non-ASCII chars

data['encoded_name'] = data['name'].apply(encode_name)

# Padding the names to the same length (max length = 15 for example)
max_name_length = 15
X = pad_sequences(data['encoded_name'], maxlen=max_name_length, padding='post')

# One-hot encoding the labels (gender)
y = to_categorical(data['gender'], num_classes=2)

# Function to build different RNN models
def build_rnn_model(cell_type='simple'):
    model = Sequential()
    model.add(Embedding(input_dim=len(char_to_index)+1, output_dim=64, input_length=max_name_length))

    if cell_type == 'simple':
        model.add(SimpleRNN(128, activation='relu'))
    elif cell_type == 'lstm':
        model.add(LSTM(128, activation='relu'))
    elif cell_type == 'gru':
        model.add(GRU(128, activation='relu'))

    model.add(Dense(2, activation='softmax'))
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Function to evaluate model performance
def evaluate_model(model, X, y, dataset_size):
    X_train, X_test, y_train, y_test = train_test_split(X[:dataset_size], y[:dataset_size], test_size=0.2, random_state=42)
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=2)

    # Evaluating overall accuracy
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)

    # Calculating class-wise accuracy
    y_pred = model.predict(X_test)
    y_pred_class = np.argmax(y_pred, axis=1)
    y_test_class = np.argmax(y_test, axis=1)

    male_acc = np.sum((y_pred_class == 0) & (y_test_class == 0)) / np.sum(y_test_class == 0) * 100
    female_acc = np.sum((y_pred_class == 1) & (y_test_class == 1)) / np.sum(y_test_class == 1) * 100

    return test_acc, male_acc, female_acc

# Train models with different dataset sizes and report accuracies
dataset_sizes = [int(0.25 * len(data)), int(0.5 * len(data)), int(0.75 * len(data)), len(data)]
results = []

for dataset_size in dataset_sizes:
    for cell_type in ['simple', 'lstm', 'gru']:
        model = build_rnn_model(cell_type)
        test_acc, male_acc, female_acc = evaluate_model(model, X, y, dataset_size)
        results.append({
            'dataset_size': dataset_size,
            'cell_type': cell_type,
            'test_acc': test_acc,
            'male_acc': male_acc,
            'female_acc': female_acc
        })

results_df = pd.DataFrame(results)
print(results_df)




Epoch 1/10
297/297 - 9s - 29ms/step - accuracy: 0.8028 - loss: 0.4280 - val_accuracy: 0.7879 - val_loss: 0.4613
Epoch 2/10
297/297 - 1s - 4ms/step - accuracy: 0.8654 - loss: 0.3265 - val_accuracy: 0.8718 - val_loss: 0.3112
Epoch 3/10
297/297 - 1s - 4ms/step - accuracy: 0.8811 - loss: 0.2960 - val_accuracy: 0.8830 - val_loss: 0.2912
Epoch 4/10
297/297 - 1s - 4ms/step - accuracy: 0.8915 - loss: 0.2705 - val_accuracy: 0.8773 - val_loss: 0.2987
Epoch 5/10
297/297 - 1s - 4ms/step - accuracy: 0.8992 - loss: 0.2537 - val_accuracy: 0.8832 - val_loss: 0.2772
Epoch 6/10
297/297 - 2s - 6ms/step - accuracy: 0.9072 - loss: 0.2364 - val_accuracy: 0.8906 - val_loss: 0.2743
Epoch 7/10
297/297 - 3s - 9ms/step - accuracy: 0.9130 - loss: 0.2194 - val_accuracy: 0.8952 - val_loss: 0.2662
Epoch 8/10
297/297 - 2s - 6ms/step - accuracy: 0.9210 - loss: 0.2036 - val_accuracy: 0.8973 - val_loss: 0.2561
Epoch 9/10
297/297 - 1s - 5ms/step - accuracy: 0.9284 - loss: 0.1888 - val_accuracy: 0.8973 - val_loss: 0.2749


In [None]:
# Assuming `results_df` is your DataFrame with the results
results_df['dataset_size_percentage'] = results_df['dataset_size'].apply(lambda x: f'{(x / len(data)) * 100:.0f}%')

# Dropping the original 'dataset_size' column if it's not needed
results_df = results_df.drop(columns=['dataset_size'])

# Reorder columns for better presentation
results_df = results_df[['dataset_size_percentage', 'cell_type', 'test_acc', 'male_acc', 'female_acc']]

# Displaying the updated DataFrame
print(results_df)


   dataset_size_percentage cell_type  test_acc   male_acc  female_acc
0                      25%    simple  0.899832  87.485516   91.407799
1                      25%      lstm  0.885732  82.850521   91.837409
2                      25%       gru  0.880471  87.543453   88.334435
3                      50%    simple  0.893402  86.617100   91.127767
4                      50%      lstm  0.884458  84.386617   91.110336
5                      50%       gru  0.890140  87.201275   90.203939
6                      75%    simple  0.892592  85.010383   91.771799
7                      75%      lstm  0.893644  83.462337   92.854750
8                      75%       gru  0.891188  88.257504   89.628224
9                     100%    simple  0.892823  88.515366   89.722567
10                    100%      lstm  0.897138  85.009378   92.414079
11                    100%       gru  0.893507  85.297937   91.677019


**Problem Statement #2:**
Train a language model using these names.

In [25]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, GRU
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Loading the dataset
data = pd.read_csv('name_gender.csv')

# Removing non-ASCII characters
data['name'] = data['name'].apply(lambda x: ''.join([c for c in x if ord(c) < 128]))

# Mapping gender to numeric values
data['gender'] = data['gender'].map({'M': 0, 'F': 1})

# Creating a set of all unique characters
chars = set(''.join(data['name'].values))
char_to_index = {ch: idx + 1 for idx, ch in enumerate(chars)}  # index starts from 1
index_to_char = {idx: ch for ch, idx in char_to_index.items()}

# One-hot encoding the names
def encode_name(name):
    return [char_to_index.get(c, 0) for c in name]  # default to 0 for non-ASCII chars

data['encoded_name'] = data['name'].apply(encode_name)

# Padding the names to the same length
max_name_length = 15
X = pad_sequences(data['encoded_name'], maxlen=max_name_length, padding='post')

# One-hot encoding the labels (gender)
y = to_categorical(data['gender'], num_classes=2)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the LSTM model
def build_lstm_model():
    model = Sequential()
    model.add(Embedding(input_dim=len(char_to_index) + 1, output_dim=64, input_length=max_name_length))
    model.add(LSTM(128, activation='relu'))
    model.add(Dense(len(char_to_index) + 1, activation='softmax'))  # Predict next character
    return model

# Train the language model
lm_model = build_lstm_model()
lm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare input and output for the language model
X_lm = []
y_lm = []

for name in data['encoded_name']:
    for i in range(1, len(name)):
        X_lm.append(name[:i])
        y_lm.append(name[i])

X_lm = pad_sequences(X_lm, maxlen=max_name_length, padding='post')
y_lm = np.array(y_lm)

lm_model.fit(X_lm, y_lm, epochs=10, batch_size=64, verbose=2)

# Function to generate names
def generate_name(gender, length=6):
    name = []
    char_idx = np.random.choice(list(char_to_index.values()))
    for _ in range(length):
        name.append(char_idx)
        padded_input = pad_sequences([name], maxlen=max_name_length, padding='post')
        next_char_prob = lm_model.predict(padded_input, verbose=0)[0]
        char_idx = np.argmax(next_char_prob)
        if char_idx == 0:  # End token
            break
    return ''.join(index_to_char.get(i, '') for i in name)

# Generating 100 male and 100 female names
male_names = [generate_name(0) for _ in range(100)]
female_names = [generate_name(1) for _ in range(100)]

# Combining generated names and true labels
generated_names = male_names + female_names
true_labels = [0] * 100 + [1] * 100

# Encode and pad the generated names
encoded_generated_names = [encode_name(name) for name in generated_names]
padded_generated_names = pad_sequences(encoded_generated_names, maxlen=max_name_length, padding='post')

# Loading the best LSTM classification model from Problem 1
classification_model = Sequential()
classification_model.add(Embedding(input_dim=len(char_to_index) + 1, output_dim=64, input_length=max_name_length))
classification_model.add(LSTM(128, activation='relu'))
classification_model.add(Dense(2, activation='softmax'))
classification_model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# Retrain the LSTM classification model on the full dataset
classification_model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=2)

# Predict and calculate accuracy
predictions = np.argmax(classification_model.predict(padded_generated_names), axis=1)
accuracy = accuracy_score(true_labels, predictions)

print(f"Classification Accuracy on Generated Names: {accuracy * 100:.2f}%")




Epoch 1/10
8217/8217 - 31s - 4ms/step - accuracy: 0.3017 - loss: 2.2079
Epoch 2/10
8217/8217 - 27s - 3ms/step - accuracy: 0.3470 - loss: 2.0367
Epoch 3/10
8217/8217 - 41s - 5ms/step - accuracy: 0.3614 - loss: 1.9851
Epoch 4/10
8217/8217 - 41s - 5ms/step - accuracy: 0.3700 - loss: 1.9539
Epoch 5/10
8217/8217 - 41s - 5ms/step - accuracy: 0.3751 - loss: 1.9326
Epoch 6/10
8217/8217 - 27s - 3ms/step - accuracy: 0.3801 - loss: 1.9151
Epoch 7/10
8217/8217 - 41s - 5ms/step - accuracy: 0.3835 - loss: 1.9028
Epoch 8/10
8217/8217 - 41s - 5ms/step - accuracy: 0.3863 - loss: 1.8917
Epoch 9/10
8217/8217 - 41s - 5ms/step - accuracy: 0.3886 - loss: 1.8830
Epoch 10/10
8217/8217 - 41s - 5ms/step - accuracy: 0.3908 - loss: 1.8750
Epoch 1/10
1188/1188 - 7s - 6ms/step - accuracy: 0.8139 - loss: 0.4087
Epoch 2/10
1188/1188 - 4s - 3ms/step - accuracy: 0.8498 - loss: 0.3454
Epoch 3/10
1188/1188 - 5s - 4ms/step - accuracy: 0.8625 - loss: 0.3211
Epoch 4/10
1188/1188 - 4s - 3ms/step - accuracy: 0.8726 - loss: 0.

In [26]:
# Function to generate a single name
def generate_name(length=6):
    name = []
    char_idx = np.random.choice(list(char_to_index.values()))  # Random starting character
    for _ in range(length):
        name.append(char_idx)
        padded_input = pad_sequences([name], maxlen=max_name_length, padding='post')
        next_char_prob = lm_model.predict(padded_input, verbose=0)[0]
        char_idx = np.argmax(next_char_prob)
        if char_idx == 0:  # End token
            break
    return ''.join(index_to_char.get(i, '') for i in name)

# Generate 100 male names and 100 female names
male_names = [generate_name() for _ in range(100)]
female_names = [generate_name() for _ in range(100)]

# Printing male names
print("Generated Male Names:")
print("\n".join(male_names))

# Printing female names
print("\nGenerated Female Names:")
print("\n".join(female_names))


Generated Male Names:
Gerrin
Xavion
xishaw
nayaha
Karina
Uriann
Patric
quandr
imonte
Quante
enneri
janael
kennah
Patric
ariann
prinal
orinna
fordin
marian
Angeli
quandr
Uriann
zariah
taniah
kennah
janael
Valind
yriahn
Janish
Harman
Harman
Quante
Harman
Harman
imonte
Farina
Brittn
yriahn
Patric
hanaya
shanta
Xavion
lorien
xishaw
rendal
ariann
Raylen
Raylen
Xavion
Natali
Valind
marian
Janish
Yanish
Elizab
wannah
wannah
Raylen
Marian
Valind
nayaha
Deland
vannah
shanta
prinal
Deland
charle
orinna
fordin
Janish
Patric
Elizab
fordin
Natali
Uriann
orinna
Natali
geniel
Tamika
Charle
Xavion
geniel
Deland
hanaya
zariah
enneri
yriahn
enneri
Gerrin
Walett
Latavi
Charle
Shante
Natali
charle
ariann
enneri
Charle
Charle
Oriann

Generated Female Names:
Farina
Angeli
ariann
zariah
marian
Walett
shanta
charle
charle
Yanish
Charle
lorien
Janish
zariah
Quante
Valind
taniah
Gerrin
Angeli
Gerrin
Tamika
taniah
Marian
Latavi
Uriann
Latavi
Charle
Oriann
briell
geniel
Oriann
rendal
taniah
Valind
janael
Xavion
z

**Problem Statement #2a:**
Train a language model using names starting with A, M, and Z.



In [30]:
import random

# Function to generate a name with temperature sampling
def generate_name_with_temperature(model, seed_char, max_length=15, temperature=1.0):
    generated_name = [char_to_index[seed_char]]
    for _ in range(max_length - 1):
        padded_input = pad_sequences([generated_name], maxlen=max_name_length, padding='post')
        predicted_probs = model.predict(padded_input, verbose=0).flatten()

        # Apply temperature sampling
        scaled_probs = np.log(predicted_probs + 1e-10) / temperature
        exp_probs = np.exp(scaled_probs)
        normalized_probs = exp_probs / np.sum(exp_probs)

        # Sample the next character index
        predicted_char_index = np.random.choice(len(normalized_probs), p=normalized_probs)

        if predicted_char_index == 0:  # Stop if padding index is predicted
            break
        generated_name.append(predicted_char_index)
    return ''.join(index_to_char[idx] for idx in generated_name)

# Generating 50 diverse names
generated_names = [
    generate_name_with_temperature(lm_model, seed_char, temperature=0.8)
    for seed_char in ['A', 'M', 'Z'] for _ in range(50)
]
print("Generated Names:", generated_names)


Generated Names: ['Aubryanahmaedph', 'Aneskaliahaharp', 'Ayedenahneelich', 'Azlichaeleneysh', 'Avenellesephopt', 'Alexandriamarie', 'Angellieterisox', 'Aubriannahbeebs', 'Angelenasederbe', 'Alaeyahshahnahm', 'Alextanderahmar', 'Antonettavephpw', 'Arrianneetteriz', 'Auttinetterephe', 'Anjelliesanahhd', 'Arnenatterieshh', 'Arnellisamareew', 'Aydricaynerethg', 'Aubreenneelesha', 'Aldoreineetteep', 'Adailahirahmeno', 'Avriellahmebebz', 'Annessandeiosap', 'Anglindariettop', 'Ashamrahmelekia', 'Alausanderrashg', 'Antonimatteephd', 'Alezeinajesephh', 'Arbaleighaelech', 'Aleekhtamaelich', 'Annieleneetethv', 'Alicondrawemedm', 'Araibahmaricesa', 'Arnolissesenosh', 'Alexandriahieph', 'Ariongalianeter', 'Akardalloneewha', 'Antonelleneberi', 'Archanjieelahmi', 'Anniaahmmariopz', 'Ashteneysenthhe', 'Adainahahbhaelb', 'Athonianetteris', 'Ambriellenahbae', 'Adonienisesephw', 'Alexandyrabeelz', 'Antwinetteaseph', 'Antianousesophu', 'Alanyahmareeeeh', 'Amushantionawse', 'Mikelahielenamg', 'Melandraephe

In [38]:
# Function to calculate the perplexity of a generated name
def calculate_perplexity(model, name, max_length=15):
    log_prob_sum = 0
    for i in range(1, len(name)):
        # Pass a list of indices, not just a single index
        padded_input = pad_sequences([ [char_to_index[c] for c in name[:i]] ], maxlen=max_name_length, padding='post')

        predicted_probs = model.predict(padded_input, verbose=0).flatten()
        true_char_index = char_to_index[name[i]]

        # Calculating log probability for the true character in the sequence
        log_prob_sum += np.log(predicted_probs[true_char_index] + 1e-10)

    # Calculating perplexity
    perplexity = np.exp(-log_prob_sum / (len(name) - 1))
    return perplexity

# Calculating perplexity for each generated name
perplexities = [calculate_perplexity(lm_model, name) for name in generated_names]

# Output the generated names with their corresponding perplexities
for name, perplexity in zip(generated_names, perplexities):
    print(f"Generated Name: {name}, Perplexity: {perplexity:.2f}")


Generated Name: Aanzaahahrmarpa, Perplexity: 8.35
Generated Name: Ayannahahbhaelj, Perplexity: 4.17
Generated Name: Audreynahnaeleq, Perplexity: 4.97
Generated Name: Anashianahhieli, Perplexity: 4.62
Generated Name: Alicahniabariec, Perplexity: 5.71
Generated Name: Almarielliashob, Perplexity: 5.62
Generated Name: Alexinahnadrbae, Perplexity: 5.73
Generated Name: Annalinasephepe, Perplexity: 3.59
Generated Name: Adairahmanelenz, Perplexity: 4.55
Generated Name: Adhikondeephepp, Perplexity: 6.06
Generated Name: AbiredlahnelhAa, Perplexity: 8.80
Generated Name: Arnellisaneberb, Perplexity: 3.73
Generated Name: Addisyneseneptx, Perplexity: 4.68
Generated Name: Assreententekok, Perplexity: 8.61
Generated Name: Antronettepopip, Perplexity: 4.40
Generated Name: Airahmirabellau, Perplexity: 4.91
Generated Name: Aydenonasephepp, Perplexity: 4.09
Generated Name: Andrianettephaw, Perplexity: 3.05
Generated Name: Alyssanderiashh, Perplexity: 2.47
Generated Name: Angeleneeesensa, Perplexity: 4.41
