# Gender Classification

### Training Code

##### Import libraries

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
import joblib

##### Prepare Dataset
Data is located on Datasets/Names/combined.csv

In [11]:
# Load dataset
df = pd.read_csv("C:/Bangkit/ML/Datasets/Names/combine.csv", encoding='cp1252')

# Print all info about it
print(f"Number of columns: {df.columns}")
print(f"Number of rows: {df.size}")
print(f"Columns data types : \n{df.dtypes}")
print(f"number of nulls : \n{df.isnull().isnull().sum()}")
print(f"Consisting of {df[df.gender == 'f'].size} female and {df[df.gender == 'm'].size} male names")
df.head()

# Drop all null values
df_names = df.dropna().copy()

# Replace gender (f/m) into integer (1/0)
df_names['gender'] = df_names['gender'].replace({'f': 0, 'm': 1}).astype(int)

# Check if gender is only either 1/0
df_names['gender'].unique()
df_names.head()

Number of columns: Index(['name', 'gender'], dtype='object')
Number of rows: 14654
Columns data types : 
name      object
gender    object
dtype: object
number of nulls : 
name      0
gender    0
dtype: int64
Consisting of 6868 female and 7786 male names


  df_names['gender'] = df_names['gender'].replace({'f': 0, 'm': 1}).astype(int)


Unnamed: 0,name,gender
0,hafizhan shidqi,1
1,gandhi wibowo,1
2,aldio mahendra purwandrarto,1
3,benny putra,1
4,vicky vernando dasta,1


##### Generating The Vectorizer

In [12]:
Xfeatures = df_names['name']
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures.values.astype('U'))
X.shape
gender_vectorizer = open("GenderClassificationVectorizer.pkl", "wb")
joblib.dump(cv, gender_vectorizer)
gender_vectorizer.close()
cv.get_feature_names_out()
y=df_names["gender"]

##### Train and save the model 

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
model_file = "GenderClassificationModel.pkl"
joblib.dump(clf, model_file)
gender_vectorizer.close()
print(f"Model trained and saved as {model_file}")

Model trained and saved as GenderClassificationModel.pkl


### Running Code

##### Load model and vector

In [14]:
import joblib
model_file = "GenderClassificationModel.pkl"
with open(model_file, "rb") as model_file:
    loaded_model = joblib.load(model_file)
with open("GenderClassificationVectorizer.pkl", "rb") as vectorizer_file:
    loaded_vectorizer = joblib.load(vectorizer_file)

##### Run Inference

In [19]:
sample_names = ["pratiwi"]
vect = loaded_vectorizer.transform(sample_names).toarray()
prediction = loaded_model.predict(vect)[0]
print("Male" if prediction == 1 else "Female")

Female
