In [2]:
import pandas as pd
import numpy as np
from faker import Faker
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import precision_recall_curve, roc_curve, auc, f1_score
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Embedding, Dense, Flatten

# Fake Name Generator for training data

In [3]:
# generate 100k records of human names
english_locales = [
    'en',
    'en_US',  # American English
    'en_GB',  # British English
    'en_AU',  # Australian English
    'en_CA',  # Canadian English
    'en_IN',  # Indian English
    'en_PH',  # Philippine English
    'en_IE',  # Irish English
    'en_TH'   # Thai English
]

num_names_per_ethnicity = int(100000 / len(english_locales)) 

all_names = []

for locale in english_locales:
    fake = Faker(locale)
    names = [fake.name() for _ in range(num_names_per_ethnicity)]
    all_names.extend(names)

In [4]:
data = {'Name': all_names, 'Label': 1}
df = pd.DataFrame(data)

# Fake functional account name generator for training data

In [5]:
data_functional = pd.read_excel('functional_account_name.xlsx')

In [6]:
data_functional = data_functional[data_functional['Label'] == 'N']
data_functional[['A','B']] = data_functional['fullname'].str.split(', ', expand=True)

In [7]:
data_functional_a = data_functional['A'].tolist()
data_functional_b = data_functional['B'].tolist()

In [8]:
#generate 100k records of functional account names that are like the ones in our system
fake = Faker()

# Function to generate inhuman names
def generate_inhuman_names(num_names):
    inhuman_names = [fake.random_element(elements=(data_functional_a + data_functional_b)) for _ in range(num_names)]
    return inhuman_names

num_inhuman_names = 100000  

inhuman_first_names = generate_inhuman_names(num_inhuman_names)
inhuman_last_names = generate_inhuman_names(num_inhuman_names)

fullname = []

for i in range(num_inhuman_names):
    fullname.append(inhuman_first_names[i] + ' ' + inhuman_last_names[i])

data_functional = {'Name': fullname, 'Label': 0}
df_functional = pd.DataFrame(data_functional)

In [9]:
train_df = pd.concat([df, df_functional], ignore_index=True)

In [10]:
X = train_df['Name']
y = train_df['Label']

# Model training

In [11]:
#pre-processing - tokenizing, and padding
max_words = 100000 # Set an appropriate value
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

In [12]:
X_padded = pad_sequences(X_sequences)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, stratify = y)

In [16]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=2560, input_length=X_padded.shape[1]))
#model.add(LSTM(64))
#model.add(Dense(1, activation='sigmoid'))

model.add(Flatten())  # Flatten the 3D embedding output
model.add(Dense(256, activation='relu'))  # Add a dense layer
model.add(Dense(1, activation='sigmoid'))

In [34]:
X_padded.shape[1]

6

In [18]:
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

In [19]:
model.fit(X_train, y_train, epochs=5, batch_size=2560, validation_split=0.1)
loss, accuracy = model.evaluate(X_test, y_test)
print(accuracy)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
1.0


# Model Testing

In [21]:
df = pd.read_excel('data (9).xlsx')
df = df[['fullname', 'Label']]
new_records_sequences = tokenizer.texts_to_sequences(df['fullname'])
new_records_padded = pad_sequences(new_records_sequences, maxlen=X_padded.shape[1])
predictions = model.predict(new_records_padded)
df['prediction'] = predictions
df['Label'] = df['Label'].map({'Y': 1, 'N': 0})



In [22]:
precision, recall, thresholds = precision_recall_curve(df['Label'], df['prediction'])

In [23]:
f1_scores = 2 * (precision * recall) / (precision + recall)

# Find the threshold that maximizes the F1-score
best_threshold = thresholds[np.argmax(f1_scores)]

best_threshold

0.99833125

In [26]:
from sklearn.metrics import roc_curve, roc_auc_score

fpr, tpr, thresholds = roc_curve(df['Label'], df['prediction'])
auc = roc_auc_score(df['Label'], df['prediction'])
optimal_threshold = thresholds[np.argmin(np.sqrt((1-fpr)**2 + tpr**2))]

optimal_threshold

1.9786178e-06

In [47]:
new_records = pd.read_excel('data (9).xlsx')

new_records_sequences = tokenizer.texts_to_sequences(new_records['fullname'])
new_records_padded = pad_sequences(new_records_sequences, maxlen=X_padded.shape[1])


predictions = model.predict(new_records_padded)


threshold = best_threshold
predicted_labels_f1 = (predictions > best_threshold).astype(int)
predicted_labels_roc = (predictions > optimal_threshold).astype(int)
predicted_labels_normal = (predictions > 0.5).astype(int)


new_records['predicted_label_f1'] = predicted_labels_f1
new_records['predicted_label_roc'] = predicted_labels_roc
new_records['predicted_label_normal'] = predicted_labels_normal


new_records['true_y'] = new_records['Label'].map({'Y': 1, 'N': 0})



In [48]:
predictions

array([[9.99797046e-01],
       [9.99999583e-01],
       [9.99701083e-01],
       [9.99989748e-01],
       [9.99959648e-01],
       [9.99909282e-01],
       [9.99980986e-01],
       [9.99991059e-01],
       [9.99901474e-01],
       [9.98331249e-01],
       [9.99980986e-01],
       [9.98331249e-01],
       [9.99996364e-01],
       [9.99993145e-01],
       [9.98331249e-01],
       [9.99987423e-01],
       [9.99764323e-01],
       [9.99998569e-01],
       [9.99989986e-01],
       [9.99995589e-01],
       [9.99999046e-01],
       [9.98331249e-01],
       [9.99999583e-01],
       [9.98331249e-01],
       [9.99969602e-01],
       [9.99999583e-01],
       [9.98331249e-01],
       [9.99980986e-01],
       [9.98331249e-01],
       [9.99998927e-01],
       [9.99999523e-01],
       [9.99996364e-01],
       [9.99998927e-01],
       [9.98331249e-01],
       [9.99755561e-01],
       [6.31943658e-06],
       [5.74902879e-06],
       [9.02770898e-06],
       [1.32882315e-05],
       [9.98331249e-01],


In [49]:
from sklearn.metrics import accuracy_score
accuracy_f1 = accuracy_score(new_records['true_y'], new_records['predicted_label_f1'])
accuracy_roc = accuracy_score(new_records['true_y'], new_records['predicted_label_roc'])
accuracy_normal = accuracy_score(new_records['true_y'], new_records['predicted_label_normal'])

In [50]:
accuracy_f1, accuracy_roc, accuracy_normal

(0.7440944881889764, 0.7795275590551181, 0.9960629921259843)

# Model Saving

In [32]:
from tensorflow import keras
model.save('my_model_with_relu.h5')

  saving_api.save_model(


In [33]:
import pickle

# Assuming 'tokenizer' is your Tokenizer object
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)