<a href="https://colab.research.google.com/github/SagarBajaj14/Learning_projects/blob/main/Gender_prediction_usingRandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [89]:
''' Dataset: https://www.ssa.gov/oact/babynames/names.zip
There can be various parameters to determine gender from a name for eg. characteristics such as first and last letter of a name and so on. 
When we convert words into numbers whith encoding we will get output vector containing 1000s of features.
In this example we can achieve an accuracy of 80% using only three features to determine gender from a name.
'''
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.ensemble import RandomForestClassifier 
from sklearn import svm

In [90]:
#Checking frequency of all letters in a name 
def name_count(name):
  arr = np.zeros(52)
  for i,j in enumerate(name):
    arr[ord('j')-ord('a')] += 1 
  return arr

data = np.genfromtxt('yob2021.txt',delimiter=',',dtype = None,names=[('name'),('gender'),('count')],encoding=None,converters={0: lambda s:s.lower()})
data = np.array([row for row in data if row[2]>=25])
name_map = np.vectorize(name_count, otypes=[np.ndarray])
X_name = data['name']
Xl = name_map(X_name)
X = np.array(Xl.tolist())
Y = data['gender']

for i in range(5):
  X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size = 0.33)
  res = RandomForestClassifier(n_estimators = 150, min_samples_split = 20)
  res.fit(X_train,Y_train)
  print(np.mean(res.predict(X_test) == Y_test))

0.5593820405535886
0.5551979401351786
0.5574509172835532
0.5465078854200193
0.5539105246218217


In [91]:
#Checking frequency of all letters in a name and ordering them.
def name_count(name):
  arr = np.zeros(52)
  for i,j in enumerate(name):
    arr[ord('j')-ord('a')] += 1 
    arr[ord('j')-ord('a') + 26] += i+1 
  return arr

data = np.genfromtxt('yob2021.txt',delimiter=',',dtype = None,names=[('name'),('gender'),('count')],encoding=None,converters={0: lambda s:s.lower()})
data = np.array([row for row in data if row[2]>=25])
name_map = np.vectorize(name_count, otypes=[np.ndarray])
X_name = data['name']
Xl = name_map(X_name)
X = np.array(Xl.tolist())
Y = data['gender']

for i in range(5):
  X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size = 0.33)
  res = RandomForestClassifier(n_estimators = 150, min_samples_split = 20)
  res.fit(X_train,Y_train)
  print(np.mean(res.predict(X_test) == Y_test))

0.5561635017701964
0.5600257483102672
0.5523012552301255
0.5597038944319279
0.5551979401351786


In [92]:
def name_count(name):
 arr = np.zeros(52+26*26) # 26 dimensions for each letter
 for i, j in enumerate(name):
  arr[ord('j')-ord('a')] += 1
  arr[ord('j')-ord('a')+26] += i+1

# Considering bi-grams as feature(eg.'aa','ab' and so on)
 for x in range(len(name)-1):
  ind = (ord(name[x])-ord('a'))*26 + (ord(name[x+1])-ord('a'))
  arr[ind] += 1
 return arr

data = np.genfromtxt('yob2021.txt',delimiter=',',dtype = None,names=[('name'),('gender'),('count')],encoding=None,converters={0: lambda s:s.lower()})
data = np.array([row for row in data if row[2]>=25])
name_map = np.vectorize(name_count, otypes=[np.ndarray])
X_name = data['name']
Xl = name_map(X_name)
X = np.array(Xl.tolist())
Y = data['gender']

for i in range(5):
  X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size = 0.33)
  res = RandomForestClassifier(n_estimators = 150, min_samples_split = 20)
  res.fit(X_train,Y_train)
  print(np.mean(res.predict(X_test) == Y_test))

0.7360798197618281
0.7347924042484711
0.733183134856775
0.7389765046668813
0.7405857740585774


In [93]:
def name_count(name):
 arr = np.zeros(52+26*26+3)
 for i, j in enumerate(name):
  arr[ord('j')-ord('a')] += 1
  arr[ord('j')-ord('a')+26] += i+1

 for x in range(len(name)-1):
  ind = (ord(name[x])-ord('a'))*26 + (ord(name[x+1])-ord('a')) + 52
  arr[ind] += 1
# inlcuding last letter, second last letter and length of name as features.
 arr[-3] = ord(name[-1])-ord('a')
 
 arr[-2] = ord(name[-2])-ord('a')

 arr[-1] = len(name)
 return arr

data = np.genfromtxt('yob2021.txt',delimiter=',',dtype = None,names=[('name'),('gender'),('count')],encoding=None,converters={0: lambda s:s.lower()})
data = np.array([row for row in data if row[2]>=20])
name_map = np.vectorize(name_count, otypes=[np.ndarray])
X_name = data['name']
Xl = name_map(X_name)
X = np.array(Xl.tolist())
Y = data['gender']

for i in range(5):
  X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size = 0.33)
  res = RandomForestClassifier(n_estimators = 150, min_samples_split = 20)
  res.fit(X_train,Y_train)
  print(np.mean(res.predict(X_test) == Y_test))

0.8054945054945055
0.8038461538461539
0.7994505494505495
0.8002747252747253
0.7898351648351648


In [94]:
# Checking importance of last letter feature 
def name_count(name):
  arr = np.zeros(1)
  arr[0] = ord(name[-1])-ord('a')+1
  return arr

data = np.genfromtxt('yob2021.txt',delimiter=',',dtype = None,names=[('name'),('gender'),('count')],encoding=None,converters={0: lambda s:s.lower()})
data = np.array([row for row in data if row[2]>=20])
name_map = np.vectorize(name_count, otypes=[np.ndarray])
X_name = data['name']
Xl = name_map(X_name)
X = np.array(Xl.tolist())
Y = data['gender']

for i in range(5):
  X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size = 0.33)
  res = RandomForestClassifier(n_estimators = 150, min_samples_split = 20)
  res.fit(X_train,Y_train)
  print(np.mean(res.predict(X_test) == Y_test))

0.759065934065934
0.7521978021978022
0.7673076923076924
0.7618131868131868
0.7521978021978022


In [95]:
#Combing top three features(last and second last letter, order of a)
#Many words are derived from latin so occurence of a's and o's is important in determing gender.
def name_count(name):
  arr = np.zeros(3)
  arr[0] = ord(name[-1])-ord('a')+1
  arr[1] = ord(name[-2])-ord('a')+1

  for i, j in enumerate(name):
    if j == 'a':
      arr[2] += i+1
  return arr

data = np.genfromtxt('yob2021.txt',delimiter=',',dtype = None,names=[('name'),('gender'),('count')],encoding=None,converters={0: lambda s:s.lower()})
data = np.array([row for row in data if row[2]>=20])
name_map = np.vectorize(name_count, otypes=[np.ndarray])
X_name = data['name']
Xl = name_map(X_name)
X = np.array(Xl.tolist())
Y = data['gender']

for i in range(5):
  X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size = 0.33)
  res = RandomForestClassifier(n_estimators = 150, min_samples_split = 20)
  res.fit(X_train,Y_train)
  print(np.mean(res.predict(X_test) == Y_test))


0.7942307692307692
0.7923076923076923
0.7837912087912088
0.7906593406593406
0.782967032967033


In [96]:
def name_count(name):
  arr = np.zeros(3)
  arr[0] = ord(name[-1])-ord('a')+1
  arr[1] = ord(name[-2])-ord('a')+1

  for i, j in enumerate(name):
    if j == 'a':
      arr[2] += i+1
  return arr

data = np.genfromtxt('yob2021.txt',delimiter=',',dtype = None,names=[('name'),('gender'),('count')],encoding=None,converters={0: lambda s:s.lower()})
data = np.array([row for row in data if row[2]>=20])
name_map = np.vectorize(name_count, otypes=[np.ndarray])
X_name = data['name']
Xl = name_map(X_name)
X = np.array(Xl.tolist())
Y = data['gender']

X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size = 0.33)
res = RandomForestClassifier(n_estimators = 150, min_samples_split = 20)
res.fit(X_train,Y_train)

index = np.random.choice(np.arange(len(Xl)), 10,replace = False)
X_sam = X_name[index]
Y_sam = Y[index]
pred = res.predict(X[index])

print('Name', 'Actual', 'Predicted')
for a,b, r in zip(X_sam,Y_sam, pred):
  print(a, b , r)

Name Actual Predicted
kayci F F
naftuli M F
jaycion M M
wylder M M
douglas M M
vedh M M
swayze M M
cheyenne F F
donnie M F
nadia F F
