In [1]:
!pip install scikit-learn #
!pip install nltk #natural lang tool kit



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder #A label encoder is used to convert categorical data, or text data, into numerical data which can be understood by a machine learning model
from keras.preprocessing.text import Tokenizer #splitting sentences, sentence segmentation, i.e extracting words
from keras.preprocessing.sequence import pad_sequences #to match lenght of arrays (types pre and post padding)
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense #An embedding layer is a way to represent words or text as dense vectors of real numbers. This is useful because it allows you to capture the semantic meaning of words.
from sklearn.metrics import accuracy_score,classification_report

In [3]:
df=pd.read_csv('/content/drive/MyDrive/Deep Learning Projects/indiannamesgenders.csv')
print(df.head())
print(df.shape)
#In the context of Pandas DataFrames, df.shape returns a tuple representing the dimensions of the DataFrame. The first element of the tuple corresponds to the number of rows, and the second element corresponds to the number of columns.


              name gender    race
0          barjraj      m  indian
1     ramdin verma      m  indian
2  sharat chandran      m  indian
3  birender mandal      m  indian
4             amit      m  indian
(123529, 3)


In [5]:
#exploratory data processing to deal with missing cols
df.info() #informs about the null values present

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123529 entries, 0 to 123528
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   name    123474 non-null  object
 1   gender  123529 non-null  object
 2   race    123529 non-null  object
dtypes: object(3)
memory usage: 2.8+ MB


In [6]:
df=df.dropna() #drop null value cols
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 123474 entries, 0 to 123528
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   name    123474 non-null  object
 1   gender  123474 non-null  object
 2   race    123474 non-null  object
dtypes: object(3)
memory usage: 3.8+ MB


In [10]:
x=df['name']
y=df['gender']
#encode gender labels using labelencoder
label_encoder=LabelEncoder()
y=label_encoder.fit_transform(y)
#split the data into training and testing part
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=4)

In [11]:
#tokenize the name
max_words=1000 #taking 1000 tokens at a time
tokenizer=Tokenizer(num_words=max_words) #encoding data
tokenizer.fit_on_texts(x_train)

#convert words to sequences
x_train_seq=tokenizer.texts_to_sequences(x_train)
x_test_seq=tokenizer.texts_to_sequences(x_test)

#pad sequences to ensure uniform lenght
max_len=max(len(seq) for seq in x_train_seq) #selecting max lenght

#all the names shd have same lenght as max
x_train_pad=pad_sequences(x_train_seq,max_len)
x_test_pad=pad_sequences(x_test_seq,max_len)

In [13]:
  x_train_seq

[[766],
 [],
 [],
 [580],
 [],
 [886],
 [650],
 [315],
 [2, 989],
 [23],
 [127],
 [128],
 [651],
 [27],
 [275],
 [],
 [6],
 [61],
 [3],
 [734],
 [],
 [],
 [],
 [652],
 [31],
 [947],
 [895],
 [715],
 [32],
 [238],
 [547],
 [445],
 [],
 [79],
 [479],
 [403],
 [],
 [],
 [810],
 [],
 [276],
 [59, 915],
 [581],
 [],
 [],
 [796, 96],
 [],
 [188],
 [210],
 [],
 [653],
 [151],
 [],
 [582],
 [583],
 [521],
 [686],
 [],
 [96],
 [],
 [],
 [277],
 [404],
 [211],
 [584],
 [585],
 [735],
 [716],
 [],
 [129],
 [446],
 [316],
 [],
 [],
 [621],
 [2],
 [480],
 [],
 [],
 [],
 [],
 [],
 [405],
 [887],
 [],
 [278],
 [],
 [818],
 [279],
 [359],
 [14],
 [32],
 [916],
 [189],
 [],
 [53],
 [781],
 [],
 [2],
 [522],
 [957],
 [239],
 [],
 [79],
 [],
 [586],
 [4, 622],
 [997],
 [587],
 [],
 [753, 1],
 [],
 [23],
 [754],
 [169],
 [717],
 [948],
 [447],
 [],
 [317],
 [623],
 [],
 [588],
 [],
 [118],
 [624],
 [318],
 [14, 97],
 [],
 [98],
 [481],
 [190],
 [482],
 [360],
 [],
 [12],
 [],
 [34],
 [],
 [625],
 [888, 1]

In [18]:
'''embedding_dim=50
model=Sequential()
model.add(Embedding(input_dim=max_words,output_dim=embedding_dim,input_length=max_len))
model.add(LSTM(units=50))
model.add(Dense(units=1,activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(x_train,y_train,epochs=10,batch_size=32)'''
embedding_dim=50
model=Sequential()
model.add(Embedding(input_dim=max_words,output_dim=embedding_dim,input_length=max_len))
model.add(LSTM(units=50))
model.add(Dense(units=1,activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
# Pass the padded sequences (numerical data) to the model
model.fit(x_train_pad,y_train,epochs=10,batch_size=32) # Use x_train_pad instead of x_train

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x793e19ed0a30>

In [20]:
y_pred_proba=model.predict(x_test_pad)
y_pred=(y_pred_proba>0.5).astype(int)
accuracy=accuracy_score(y_test,y_pred)
print(accuracy)

0.8659125880733202


In [22]:
y_pred_proba

array([[0.52365154],
       [0.9999573 ],
       [0.057654  ],
       ...,
       [0.01483461],
       [0.9981205 ],
       [0.00779944]], dtype=float32)

In [21]:
y_pred

array([[1],
       [1],
       [0],
       ...,
       [0],
       [1],
       [0]])

In [45]:
#input a name for gender prediction
input_name=input('Enter a name for gender prediction:')

#tokenize and pad the input name
input_name_sequences=tokenizer.texts_to_sequences([input_name])
input_name_padded=pad_sequences(input_name_sequences,maxlen=max_len)
predicted_proba=model.predict(input_name_padded)
predicted_label=(predicted_proba>0.5).astype(int)[0][0]
predicted_gender=label_encoder.inverse_transform([predicted_label])[0]
if(predicted_gender=='f'):
  print('Predicted Gender: Female')
else:
  print('Predicted Gender: Male')


Enter a name for gender prediction:srivathsa
Predicted Gender: Male
