# Problem Statement 1

In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, SimpleRNN, LSTM, GRU
from keras.utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
import string, random

In [2]:
data = pd.read_csv("name_gender.csv")
data['name'] = data['name'].apply(lambda x: ''.join(filter(lambda y: y in string.printable, x)))
chars = sorted(list(set(''.join(data['name'].values))))

char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))


In [5]:
def train_model(cell_type, data_size):
    sampled_data = data.sample(frac=data_size, random_state=42)
    max_length = max([len(name) for name in sampled_data['name']])
    input_data_X = np.zeros((len(sampled_data), max_length, len(chars)), dtype=np.bool)
    output_data_Y = np.zeros((len(sampled_data), 2), dtype=np.bool)

    for i, name in enumerate(sampled_data['name']):
        for j, char in enumerate(name):
            input_data_X[i, j, char_to_int[char]] = 1
        output_data_Y[i, 0 if sampled_data.iloc[i]['gender'] == 'M' else 1] = 1

    X_train, X_test, y_train, y_test = train_test_split(input_data_X, output_data_Y, test_size=0.2, random_state=42)

    model = Sequential()
    if cell_type == 'SimpleRNN':
        model.add(SimpleRNN(128, input_shape=(max_length, len(chars))))
    elif cell_type == 'LSTM':
        model.add(LSTM(128, input_shape=(max_length, len(chars))))
    elif cell_type == 'GRU':
        model.add(GRU(128, input_shape=(max_length, len(chars))))
    else:
        print("Unexpected cell type")
        return

    model.add(Dropout(0.2))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=20, batch_size=128, validation_data=(X_test, y_test), verbose=0)
    evaluation_scores = model.evaluate(X_test, y_test, verbose=0)

    print("Cell type:", cell_type)
    print("Data size:", data_size)
    print("Accuracy: %.2f%%" % (evaluation_scores[1]*100))

    predictions = model.predict(X_test)
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(y_test, axis=1)
    male_indices = np.where(true_classes == 0)[0]
    female_indices = np.where(true_classes == 1)[0]
    male_accuracy = np.mean(predicted_classes[male_indices] == true_classes[male_indices])
    female_accuracy = np.mean(predicted_classes[female_indices] == true_classes[female_indices])
    print("Male accuracy: %.2f%%" % (male_accuracy*100))
    print("Female accuracy: %.2f%%" % (female_accuracy*100))
    print()

# Iterating through different cell types and data sizes
for cell_type in ['SimpleRNN', 'LSTM', 'GRU']:
    for data_size in [0.25, 0.5, 0.75, 1.0]:
        train_model(cell_type, data_size)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  input_data_X = np.zeros((len(sampled_data), max_length, len(chars)), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  output_data_Y = np.zeros((len(sampled_data), 2), dtype=np.bool)


Cell type: SimpleRNN
Data size: 0.25
Accuracy: 84.45%
Male accuracy: 78.38%
Female accuracy: 87.87%

Cell type: SimpleRNN
Data size: 0.5
Accuracy: 87.03%
Male accuracy: 84.78%
Female accuracy: 88.35%

Cell type: SimpleRNN
Data size: 0.75
Accuracy: 87.24%
Male accuracy: 85.74%
Female accuracy: 88.10%

Cell type: SimpleRNN
Data size: 1.0
Accuracy: 88.69%
Male accuracy: 87.00%
Female accuracy: 89.68%

Cell type: LSTM
Data size: 0.25
Accuracy: 86.38%
Male accuracy: 81.30%
Female accuracy: 89.25%

Cell type: LSTM
Data size: 0.5
Accuracy: 88.80%
Male accuracy: 86.79%
Female accuracy: 89.99%

Cell type: LSTM
Data size: 0.75
Accuracy: 89.57%
Male accuracy: 86.26%
Female accuracy: 91.47%

Cell type: LSTM
Data size: 1.0
Accuracy: 90.54%
Male accuracy: 90.05%
Female accuracy: 90.83%

Cell type: GRU
Data size: 0.25
Accuracy: 85.73%
Male accuracy: 78.26%
Female accuracy: 89.94%

Cell type: GRU
Data size: 0.5
Accuracy: 88.87%
Male accuracy: 85.09%
Female accuracy: 91.09%

Cell type: GRU
Data size: 0

# Problem Statement 2

In [6]:
import pandas as pd
import random, csv


In [7]:
# load the CSV file into a pandas DataFrame
df = pd.read_csv('name_gender.csv')


In [8]:
# filter out the rows where gender is unknown or probability is less than 1
df = df[(df['gender'] != 'U') & (df['probability'] == 1)]


In [9]:
# create a dictionary to store the frequency of each character transition
def create_transition_dict(names):
    transition_dict = {}
    for name in names:
        name = name.lower()
        for i in range(len(name)-1):
            current_char = name[i]
            next_char = name[i+1]
            if current_char not in transition_dict:
                transition_dict[current_char] = {}
            if next_char not in transition_dict[current_char]:
                transition_dict[current_char][next_char] = 0
            transition_dict[current_char][next_char] += 1
    return transition_dict


In [10]:
# generate a name based on the Markov Chain model
def generate_name(transition_dict, gender):
    vowels = 'aeiou'
    consonants = 'bcdfghjklmnpqrstvwxyz'
    if gender == 'M':
        first_letter = random.choice(['a', 'e', 'i', 'o', 'u'] + list(consonants))
    else:
        first_letter = random.choice(['a', 'e', 'i', 'o', 'u'] + list(vowels))
    name = first_letter
    current_letter = first_letter
    while len(name) < 10:
        if current_letter not in transition_dict:
            break
        next_letter = random.choices(list(transition_dict[current_letter].keys()),
                                      list(transition_dict[current_letter].values()))[0]
        name += next_letter
        current_letter = next_letter
    return name.capitalize()

In [11]:
# create a list of male and female names using the Markov Chain model
male_names = []
female_names = []
transition_dict = create_transition_dict(df['name'].values)
for index, row in df.iterrows():
    if row['gender'] == 'M':
        male_names.append(generate_name(transition_dict, 'M'))
    else:
        female_names.append(generate_name(transition_dict, 'F'))


In [12]:
with open('/content/generated_names.csv', mode='w', newline='') as file:
    writer = csv.writer(file)

    # Write the header row
    writer.writerow(['gender', 'name'])

    # Write the male names
    for name in male_names:
        writer.writerow(['M', name])

    # Write the female names
    for name in female_names:
        writer.writerow(['F', name])



In [16]:
# Print the generated names
print("Generated Male Names:")
for name in male_names[:100]:
    print(name)
print("\nGenerated Female Names:")
for name in female_names[:100]:
    print(name)


Generated Male Names:
Yaitahajay
Keaykilaco
Waneahacyn
Unelylliri
Nirahexeja
Naronelika
Sllaqumbre
Wevayeynim
Laroyntoll
Unquaheyli
Rianinntty
Teeviabeli
Nalyervith
Uexoshaian
Delifranie
Tyavinaree
Wnnkickann
Oneadonngh
Starmeinar
Qusaqunaen
Shahnnghaq
Bhvelinahr
Kiaeneetia
Eeohladona
Peshiarlee
Junanddiel
Yaravunsho
Tetrietaro
Genntynank
Hueziamiau
Wdermarely
Fomuliaici
Benialetye
Qualanielo
Oanarndenw
Nnoriaijar
Llennntiro
Uelasisier
Naveaysere
Jainadatax
Tuamandiya
Leetanenca
Ohzaubrdin
Telannenda
Alenjeemee
Hemetieyta
Jahalikhil
Walanazaya
Guettamand
Ghaheayara
Xiasawrlai
Kedanakopi
Ronelaizia
Xtanluishe
Zjishosika
Lenjerlaiy
Xlyakratai
Geingenywe
Zahnamarda
Kmadureise
Hazarinyrj
Ashikahiei
Hnorahahah
Jauldidatt
Xareaymari
Jananarele
Tishahaikl
Eeningupah
Traleelist
Jondanyrab
Inichamcil
Yasarealen
Jenisallel
Kynimingil
Useckirenn
Viahyarixt
Danimzarde
Hoejekiyah
Ueerzanorr
Salynstyon
Gelmayeyrt
Voneniyoll
Hekekonttt
Diaredyabr
Shalleinth
Kauzhoriab
Onnnizamag
Fonemaynie
Phemofenyd

In [18]:
data = pd.read_csv("/content/generated_names.csv")
data['name'] = data['name'].apply(lambda x: ''.join(filter(lambda y: y in string.printable, x)))
chars = sorted(list(set(''.join(data['name'].values))))

char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

male_count = 0
for name in male_names:
    if train_model("GRU", 1.0) == "M":
        male_count += 1
        male_accuracy = male_count / len(male_names)
        exit

female_count = 0
for name in female_names:
    if train_model("GRU", 1.0) == "F":
        female_count += 1
        female_accuracy = female_count / len(female_names)
        exit

print("Accuracy on generated male names: ", male_accuracy)
print("Accuracy on generated female names: ", female_accuracy)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  input_data_X = np.zeros((len(sampled_data), max_length, len(chars)), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  output_data_Y = np.zeros((len(sampled_data), 2), dtype=np.bool)


Cell type: GRU
Data size: 1.0
Accuracy: 93.52%
Male accuracy: 81.13%
Female accuracy: 100.00%

Cell type: GRU
Data size: 1.0
Accuracy: 93.52%
Male accuracy: 81.13%
Female accuracy: 100.00%



KeyboardInterrupt: ignored

#Probelem Statement 2a

In [10]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.utils import to_categorical
import nltk

In [2]:
# Download the names dataset from the NLTK library
nltk.download('names')


[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


True

In [3]:
# Load the names dataset and filter names starting with 'a', 'm', or 'z'
names = nltk.corpus.names.words('/content/name_gender.csv')
names = [name.lower() for name in names if name[0].lower() in ['a', 'm', 'z']]


In [4]:
# Create a set of unique characters in the names and create character-to-integer mappings
chars = sorted(list(set(' '.join(names))))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))


In [5]:
# Set the sequence length for training data
seq_length = 10
dataX = []
dataY = []

# Create input sequences and corresponding output for training
for name in names:
    for i in range(len(name)-seq_length):
        seq_in = name[i:i+seq_length]
        seq_out = name[i+seq_length]
        dataX.append([char_to_int[char] for char in seq_in])
        dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)

# Reshape the input sequences for the LSTM model
X = np.reshape(dataX, (n_patterns, seq_length, 1))
X = X / float(len(chars))
y = to_categorical(dataY)

# Create an LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model
model.fit(X, y, epochs=20, batch_size=128)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x78c758d7e260>

In [6]:
# Generate and print 50 names using the trained model
for i in range(50):
    start = np.random.randint(0, len(dataX)-1)
    pattern = dataX[start]
    name = [int_to_char[value] for value in pattern]

    for j in range(20):
        x = np.reshape(pattern, (1, len(pattern), 1))
        x = x / float(len(chars))
        prediction = model.predict(x, verbose=0)
        index = np.argmax(prediction)
        result = int_to_char[index]
        name.append(result)
        pattern.append(index)
        pattern = pattern[1:len(pattern)]
        if result == '.':
            break

    print(''.join(name).capitalize())


Nnalise,f,1.
Bbigayl,f,1.
M,0.80334666666666666666666666
Ell,f,0.7766666666666666666666
Marcelene,f,1.
Anna,f,0.999666666666666666666
.93150684966666666666666666666
Ne,f,0.99966666666666666666666
.83333333355666666666666666666
Rycaroline,f,1.
984413453666666666666666666666
Cdaniel,m,1.
Addison,f,1.
368290668866666666666666666666
615384615366666666666666666666
053876786166666666666666666666
F,0.90156566666666666666666666
Maylon,m,0.
Allie,f,0.99666666666666666666
Maryella,f,1.
N,m,0.965566666666666666666666
M,0.99978666666666666666666666
,0.945375066666666666666666666
821782178266666666666666666666
,m,0.8577466666666666666666666
Hlan,f,0.996666666666666666666
054421768766666666666666666666
Ckennah,f,1.
Martavion,m,1.
999593727166666666666666666666
539682539666666666666666666666
Arquevious,m,1.
Aryfrances,m,1.
,m,0.5998666666666666666666666
596379921666666666666666666666
Dreyanna,f,1.
064516129056666666666666666666
Agapita,f,1.
A,f,0.999666666666666666666666
Aleceia,f,1.
F,0.998549666666

In [7]:
# Split the data into training and testing sets
split_index = int(len(dataX) * 0.9)
trainX, testX = dataX[:split_index], dataX[split_index:]
trainY, testY = dataY[:split_index], dataY[split_index:]


In [12]:
from keras.preprocessing.sequence import pad_sequences

# Assuming seq_length is the desired fixed length
seq_length = 10

# Pad or truncate sequences to the fixed length
testX = pad_sequences(testX, maxlen=seq_length, padding='post', truncating='post')

# Reshape the testing data and convert output to categorical
testX = np.reshape(testX, (len(testX), seq_length, 1))
testX = testX / float(len(chars))
testY = to_categorical(testY)
# Reshape the testing data and convert output to categorical
testX = np.reshape(testX, (len(testX), seq_length, 1))
testX = testX / float(len(chars))
testY = to_categorical(testY)




In [13]:
perplexities = []

# Generate 50 names and calculate perplexity for each
for i in range(50):
    start = np.random.randint(0, len(testX)-1)
    pattern = testX[start]
    name = [int_to_char[value] for value in pattern.flatten()]

    perplexity = 1.0
    for j in range(20):
        x = np.reshape(pattern, (1, len(pattern), 1))
        x = x / float(len(chars))
        prediction = model.predict(x, verbose=0)
        index = np.argmax(prediction)
        result = int_to_char[index]
        name.append(result)
        pattern = np.append(pattern, index)
        pattern = pattern[1:len(pattern)]
        if result == '.':
            break
        perplexity *= prediction[0][index]

    perplexity = pow(perplexity, -1/len(name))
    perplexities.append(perplexity)
    print(''.join(name).capitalize(), 'Perplexity:', perplexity)

# Calculate and print the average perplexity
avg_perplexity = sum(perplexities) / len(perplexities)
print('Average Perplexity:', avg_perplexity)

          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666666666 Perplexity: 3.395727594459409
          45555556666666