In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw name/gender dataset from CSV.

df = pd.read_csv("Data_files/Indian_firstname_Gender_Data.csv")

In [3]:
# Summarise the column dtypes and non-null counts of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53982 entries, 0 to 53981
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    53982 non-null  object
 1   Gender  53982 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 843.6+ KB


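#### Gender encoding (as stated): 1 = Male, 0 = Female
#### NOTE: the sample rows displayed below (e.g. Aaban → 0, Zurika → 1) suggest the mapping may actually be reversed; verify against the dataset's documentation before interpreting predictions.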

In [4]:
# We need to extract some features from these names to get better model output.

def extract_name_features(df, name_col ='name'):
    """Derive simple character-level features from a column of names.

    The input frame is NOT modified; all features are computed on a local
    Series and returned in a new DataFrame that shares the input's index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the names.
    name_col : str, default 'name'
        Column of ``df`` holding the raw name strings.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``name`` (stripped, lower-cased), ``name_length``,
        ``first_letter``, ``last_letter``, ``vowel_count``,
        ``consonant_count``, ``suffix_2``, ``prefix_2``, ``is_last_letter_a``.

    Raises
    ------
    KeyError
        If ``name_col`` is not a column of ``df``.
    """
    # Normalise once; every feature below is derived from this Series.
    names = df[name_col].str.strip().str.lower()

    # .str accessors (rather than apply(len) / a Python lambda) are
    # vectorised and propagate missing values instead of raising.
    name_length = names.str.len()
    # Only the ASCII vowels a, e, i, o, u are counted.
    vowel_count = names.str.count("[aeiou]")
    last_letter = names.str[-1]

    return pd.DataFrame({
        "name": names,
        "name_length": name_length,
        "first_letter": names.str[0],
        "last_letter": last_letter,
        "vowel_count": vowel_count,
        # Every non-vowel character (not just letters) counts as a consonant.
        "consonant_count": name_length - vowel_count,
        "suffix_2": names.str[-2:],
        "prefix_2": names.str[:2],
        "is_last_letter_a": (last_letter == "a").astype(int),
    })

In [5]:
# Preview the raw name/gender table.
df

Unnamed: 0,Name,Gender
0,Aaban,0
1,Aabharan,0
2,Aabhas,0
3,Aabhat,0
4,Aabheer,0
...,...,...
53977,Zumathy,1
53978,Zurika,1
53979,Zuruthi,1
53980,Zuruthika,1


In [6]:
# Build the engineered feature table from the "Name" column of the raw data.
features = extract_name_features(df, name_col="Name")

In [7]:
# Preview the engineered features.
features

Unnamed: 0,name,name_length,first_letter,last_letter,vowel_count,consonant_count,suffix_2,prefix_2,is_last_letter_a
0,aaban,5,a,n,3,2,an,aa,0
1,aabharan,8,a,n,4,4,an,aa,0
2,aabhas,6,a,s,3,3,as,aa,0
3,aabhat,6,a,t,3,3,at,aa,0
4,aabheer,7,a,r,4,3,er,aa,0
...,...,...,...,...,...,...,...,...,...
53977,zumathy,7,z,y,2,5,hy,zu,0
53978,zurika,6,z,a,3,3,ka,zu,1
53979,zuruthi,7,z,i,3,4,hi,zu,0
53980,zuruthika,9,z,a,4,5,ka,zu,1


In [8]:
# Count missing values per feature column.
features.isnull().sum()

name                0
name_length         0
first_letter        0
last_letter         0
vowel_count         0
consonant_count     0
suffix_2            0
prefix_2            0
is_last_letter_a    0
dtype: int64

In [9]:
# Convert the string-valued features to numbers with one-hot encoding.
# NOTE(review): the encoder is fitted on every row, before the train/test
# split further down — consider fitting on the training rows only.

cat_features = ["first_letter", "last_letter", "suffix_2", "prefix_2"]
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
categorical_part = features[cat_features]
cat_encoded = encoder.fit_transform(categorical_part)

# Everything that is neither categorical nor the raw name is already numeric;
# place those columns first, followed by the one-hot block.
non_numeric_cols = cat_features + ["name"]
X_numeric = features.drop(columns=non_numeric_cols).values
X_ready = np.hstack((X_numeric, cat_encoded))

In [10]:
# Wrap the combined feature matrix in a DataFrame to preview it.
pd.DataFrame(X_ready)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,636,637,638,639,640,641,642,643,644,645
0,5.0,3.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8.0,4.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6.0,3.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6.0,3.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7.0,4.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53977,7.0,2.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53978,6.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53979,7.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53980,9.0,4.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
# Standardise every column of the feature matrix (numeric and one-hot alike).
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down — consider fitting on the training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_ready)

In [12]:
# Wrap the scaled matrix in a DataFrame (columns get default integer labels).
df_converted = pd.DataFrame(X_scaled)

In [13]:
# Preview the scaled feature table.
df_converted

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,636,637,638,639,640,641,642,643,644,645
0,-1.412613,-0.293794,-1.844284,-0.658045,2.708450,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.004304,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,-0.021090,-0.006087
1,-0.019238,0.686948,-0.499004,-0.658045,2.708450,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.004304,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,-0.021090,-0.006087
2,-0.948155,-0.293794,-1.171644,-0.658045,2.708450,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.004304,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,-0.021090,-0.006087
3,-0.948155,-0.293794,-1.171644,-0.658045,2.708450,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.004304,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,-0.021090,-0.006087
4,-0.483697,0.686948,-1.171644,-0.658045,2.708450,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.004304,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,-0.021090,-0.006087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53977,-0.483697,-1.274536,0.173636,-0.658045,-0.369215,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.004304,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,47.415715,-0.006087
53978,-0.948155,-0.293794,-1.171644,1.519652,-0.369215,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.004304,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,47.415715,-0.006087
53979,-0.483697,-0.293794,-0.499004,-0.658045,-0.369215,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.004304,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,47.415715,-0.006087
53980,0.445220,0.686948,0.173636,1.519652,-0.369215,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.004304,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,47.415715,-0.006087


In [14]:
# Attach the Gender label to the scaled features to form the full dataset.
# NOTE: this rebinds `df`, so the raw "Name" column is no longer reachable
# through `df` after this cell runs.

gender_labels = df["Gender"]
df = pd.concat(objs=[df_converted, gender_labels], axis="columns")

In [15]:
# Preview the combined feature + Gender table.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,637,638,639,640,641,642,643,644,645,Gender
0,-1.412613,-0.293794,-1.844284,-0.658045,2.708450,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,-0.021090,-0.006087,0
1,-0.019238,0.686948,-0.499004,-0.658045,2.708450,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,-0.021090,-0.006087,0
2,-0.948155,-0.293794,-1.171644,-0.658045,2.708450,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,-0.021090,-0.006087,0
3,-0.948155,-0.293794,-1.171644,-0.658045,2.708450,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,-0.021090,-0.006087,0
4,-0.483697,0.686948,-1.171644,-0.658045,2.708450,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,-0.021090,-0.006087,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53977,-0.483697,-1.274536,0.173636,-0.658045,-0.369215,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,47.415715,-0.006087,1
53978,-0.948155,-0.293794,-1.171644,1.519652,-0.369215,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,47.415715,-0.006087,1
53979,-0.483697,-0.293794,-0.499004,-0.658045,-0.369215,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,47.415715,-0.006087,1
53980,0.445220,0.686948,0.173636,1.519652,-0.369215,-0.180212,-0.139745,-0.241077,-0.113369,-0.050441,...,-0.069702,-0.004304,-0.02952,-0.017219,-0.012175,-0.018264,-0.013612,47.415715,-0.006087,1


In [16]:
# Stratified 80/20 train/test split so both sets keep the Gender class balance.

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state =42)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(df, df["Gender"]))

# split() yields positional indices, so select rows with .iloc; .loc would
# only be correct while the frame happens to have a default 0..n-1 index.
df_train_set = df.iloc[train_index]
df_test_set = df.iloc[test_index]

In [17]:
# Training inputs: every column except the last; training target: the last column.
X, Y = df_train_set.iloc[:, :-1], df_train_set.iloc[:, -1]

In [18]:
# Hold-out inputs and target, split the same way as the training set:
# all columns but the last are features, the last column is the label.

X_test, Y_test = df_test_set.iloc[:, :-1], df_test_set.iloc[:, -1]

In [19]:
# Train a random forest classifier on the training set.
# A fixed random_state makes the bootstrap sampling and feature selection
# repeatable, so the fitted model and its accuracy are reproducible across runs.

model = RandomForestClassifier(random_state=42)
model.fit(X,Y)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
# Inputs to predict on, and the true labels (as an array) to compare against.

predict, actual = X_test, Y_test.values

In [21]:
# Predict labels for the hold-out inputs with the fitted model and print them.

predictor = model.predict(predict)
print(predictor)

[0 1 0 ... 0 0 0]


In [22]:
# Accuracy of the predictions on the hold-out set, as a percentage.

# Pair each true label with its prediction and count the exact matches.
count = sum(1 for truth, guess in zip(actual, predictor) if truth == guess)
print((count*100)/len(actual))

91.3401870890062
