In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB

!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and build a PyDrive client from the resulting
# application-default credentials.
# NOTE(review): authenticate_user() presumably has to run before
# get_application_default() so the credentials exist -- keep this order.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Fetch the Drive file with this hard-coded id and save it locally as
# 'gender_ethnicity.csv', which a later cell reads with pd.read_csv.
downloaded = drive.CreateFile({'id':'1BVHGwJdlLB9Gg4wMZhWzQkjXaUNRL84r'})
downloaded.GetContentFile('gender_ethnicity.csv')


In [None]:
# Load the first-name list. read_csv takes the file's first line as the
# header, which is why the single column ends up labelled with dashes.
fname = 'https://raw.githubusercontent.com/philipperemy/name-dataset/master/names_dataset/first_names.all.txt'
# dropna() returns a new frame instead of modifying in place, so its result
# must be kept; a bare `name_data.dropna()` leaves the NaN rows in place.
name_data = pd.read_csv(fname).dropna()
name_data.head()


Unnamed: 0,--------------------
0,a'isha
1,a'ishah
2,a-jay
3,aa'isha
4,aa'ishah


In [None]:
# Relabel the dash-named column as NAME so it can serve as the merge key.
fname_data = name_data.rename({"--------------------": "NAME"}, axis="columns")
fname_data.head()

Unnamed: 0,NAME
0,a'isha
1,a'ishah
2,a-jay
3,aa'isha
4,aa'ishah


In [None]:
# Load the gender and ethnicity dataset from the local CSV.
# Show every column when frames are displayed.
pd.set_option('display.max_columns', None)
# dropna() returns a new frame, so the result has to be assigned. Dropping
# here matters because the later astype(str) step turns NaN into the literal
# string 'nan', which no subsequent dropna() would catch.
gender_data = pd.read_csv('gender_ethnicity.csv').dropna()
gender_data.head()

Unnamed: 0,BRITH_YEAR,GENDER,ETHNICTY,NAME,COUNT,RANK
0,2011,FEMALE,HISPANIC,GERALDINE,13,75
1,2011,FEMALE,HISPANIC,GIA,21,67
2,2011,FEMALE,HISPANIC,GIANNA,49,42
3,2011,FEMALE,HISPANIC,GISELLE,38,51
4,2011,FEMALE,HISPANIC,GRACE,36,53


In [None]:
# Cast every column to str and lower-case it; presumably so that NAME lines
# up with the lower-case first-name list when the two are merged.
gender_data2 = gender_data.astype(str).apply(lambda col: col.str.lower())
gender_data2.head()

Unnamed: 0,BRITH_YEAR,GENDER,ETHNICTY,NAME,COUNT,RANK
0,2011,female,hispanic,geraldine,13,75
1,2011,female,hispanic,gia,21,67
2,2011,female,hispanic,gianna,49,42
3,2011,female,hispanic,giselle,38,51
4,2011,female,hispanic,grace,36,53


In [None]:
# Inner-join the first-name list with the gender/ethnicity rows on NAME.
mergedData1 = pd.merge(fname_data, gender_data2, on='NAME')
mergedData1.head()

Unnamed: 0,NAME,BRITH_YEAR,GENDER,ETHNICTY,COUNT,RANK
0,aahil,2014,male,asian and pacific islander,14,54
1,aaliyah,2011,female,black non hispanic,69,5
2,aaliyah,2011,female,hispanic,63,30
3,aaliyah,2011,female,black non hispanic,69,5
4,aaliyah,2011,female,hispanic,63,30


In [None]:
# Load the dataset that carries first name, last name and race columns.
url2 = 'https://media.githubusercontent.com/media/rjurney/baby_names/master/wiki_name_race.csv'
name_data2 = pd.read_csv(filepath_or_buffer=url2)
name_data2.head()

Unnamed: 0,name_last,name_suffix,name_first,name_middle,race
0,heynis,,aafje,,"GreaterEuropean,WestEuropean,Germanic"
1,noordewier-reddingius,,aaltje,,"GreaterEuropean,WestEuropean,Germanic"
2,de quant,,abbie,,"GreaterEuropean,WestEuropean,Germanic"
3,,,abbo of provence,,"GreaterEuropean,WestEuropean,Germanic"
4,ahanfouf,,abdelaziz,,"GreaterEuropean,WestEuropean,Germanic"


In [None]:
# Rename name_first to NAME so this frame shares a key with mergedData1.
lname_data1 = name_data2.rename({"name_first": "NAME"}, axis="columns")
lname_data1.head()

Unnamed: 0,name_last,name_suffix,NAME,name_middle,race
0,heynis,,aafje,,"GreaterEuropean,WestEuropean,Germanic"
1,noordewier-reddingius,,aaltje,,"GreaterEuropean,WestEuropean,Germanic"
2,de quant,,abbie,,"GreaterEuropean,WestEuropean,Germanic"
3,,,abbo of provence,,"GreaterEuropean,WestEuropean,Germanic"
4,ahanfouf,,abdelaziz,,"GreaterEuropean,WestEuropean,Germanic"


In [None]:
# Inner-join on NAME to attach the last-name columns to each first-name row.
mergedData2 = pd.merge(mergedData1, lname_data1, on='NAME')
mergedData2.head()

Unnamed: 0,NAME,BRITH_YEAR,GENDER,ETHNICTY,COUNT,RANK,name_last,name_suffix,name_middle,race
0,aaron,2011,male,asian and pacific islander,51,19,abraham,,,"GreaterEuropean,WestEuropean,Hispanic"
1,aaron,2011,male,asian and pacific islander,51,19,schwartzman,,,"GreaterEuropean,WestEuropean,Hispanic"
2,aaron,2011,male,asian and pacific islander,51,19,,,,"GreaterEuropean,Jewish"
3,aaron,2011,male,asian and pacific islander,51,19,aaronsohn,,,"GreaterEuropean,Jewish"
4,aaron,2011,male,asian and pacific islander,51,19,alexandre,,,"GreaterEuropean,Jewish"


In [None]:
# Number of rows produced by the two merges.
count_row = len(mergedData2)
print(count_row)

929750


In [None]:
# Keep only the name, gender and ethnicity columns under their final labels.
alldata = (
    mergedData2
    .rename(columns={
        "NAME": "First_Name",
        "name_last": "Last_Name",
        "GENDER": "Inferred_Gender",
        "ETHNICTY": "Inferred_Ethnicity",
    })
    .drop(columns=['BRITH_YEAR', 'COUNT', 'RANK', 'name_suffix', 'name_middle', 'race'])
)
alldata.head(100)

Unnamed: 0,First_Name,Inferred_Gender,Inferred_Ethnicity,Last_Name
0,aaron,male,asian and pacific islander,abraham
1,aaron,male,asian and pacific islander,schwartzman
2,aaron,male,asian and pacific islander,
3,aaron,male,asian and pacific islander,aaronsohn
4,aaron,male,asian and pacific islander,alexandre
...,...,...,...,...
95,aaron,male,black non hispanic,hart
96,aaron,male,black non hispanic,ibn sargado
97,aaron,male,black non hispanic,isaac
98,aaron,male,black non hispanic,kernis


In [None]:
# Build "<first> <last>"; rows with a missing last name come out as NaN.
alldata["Full_Name"] = alldata["First_Name"].add(" ").add(alldata["Last_Name"])
alldata.head()

Unnamed: 0,First_Name,Inferred_Gender,Inferred_Ethnicity,Last_Name,Full_Name
0,aaron,male,asian and pacific islander,abraham,aaron abraham
1,aaron,male,asian and pacific islander,schwartzman,aaron schwartzman
2,aaron,male,asian and pacific islander,,
3,aaron,male,asian and pacific islander,aaronsohn,aaron aaronsohn
4,aaron,male,asian and pacific islander,alexandre,aaron alexandre


In [None]:
# Discard every row that has a missing value in any column.
data = alldata.dropna(axis=0, how='any')

In [None]:
# Number of rows left after dropping nulls.
count_row = len(data)
print(count_row)

922779


In [None]:
# Encode gender as integers: female -> 0, male -> 1.
# An in-place replace through attribute access on a frame derived from
# another one raises SettingWithCopyWarning, and under pandas copy-on-write
# it does not update `data` at all. Building a new frame avoids both, and the
# cell stays safe to re-run because values already 0/1 are left untouched.
data = data.assign(
    Inferred_Gender=data['Inferred_Gender'].replace({'female': 0, 'male': 1})
)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


Unnamed: 0,First_Name,Inferred_Gender,Inferred_Ethnicity,Last_Name,Full_Name
0,aaron,1,asian and pacific islander,abraham,aaron abraham
1,aaron,1,asian and pacific islander,schwartzman,aaron schwartzman
3,aaron,1,asian and pacific islander,aaronsohn,aaron aaronsohn
4,aaron,1,asian and pacific islander,alexandre,aaron alexandre
5,aaron,1,asian and pacific islander,adan,aaron adan


In [None]:
# Feature column: the full name string of each row.
X = data.loc[:, 'Full_Name']


In [None]:
# Vectorize the full names with TF-IDF for the gender model.
vector1 = TfidfVectorizer()
# Fit on the source column rather than on X itself: `X = f(X)` would feed
# the already-transformed sparse matrix back in if this cell were re-run.
X = vector1.fit_transform(data['Full_Name'])

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Only the first 20 terms are
# shown so the cell does not dump the whole vocabulary.
vector1.get_feature_names_out()[:20]

['1765',
 '1837',
 'aabye',
 'aaes',
 'aamer',
 'aaron',
 'aaronovitch',
 'aaronsohn',
 'aasen',
 'abacco',
 'abacha',
 'abaco',
 'abada',
 'abadie',
 'aballay',
 'abalo',
 'abancour',
 'abarca',
 'abaroa',
 'abart',
 'abatantuono',
 'abate',
 'abba',
 'abbadie',
 'abbado',
 'abbamonte',
 'abbandando',
 'abbandante',
 'abbar',
 'abbas',
 'abbasi',
 'abbate',
 'abbatemarco',
 'abbatessa',
 'abbati',
 'abbaticchio',
 'abbatini',
 'abbensetts',
 'abberley',
 'abbey',
 'abbiati',
 'abbington',
 'abbondanzieri',
 'abbot',
 'abbott',
 'abbotts',
 'abboud',
 'abbruscato',
 'abby',
 'abd',
 'abdala',
 'abdallah',
 'abdellaoui',
 'abdi',
 'abdillat',
 'abdou',
 'abdoulahi',
 'abdoulaye',
 'abdul',
 'abdulaziz',
 'abdulkhaleq',
 'abdulla',
 'abdullah',
 'abdullahi',
 'abdy',
 'abecassis',
 'abeel',
 'abeijón',
 'abeille',
 'abel',
 'abelard',
 'abeles',
 'abelin',
 'abell',
 'abella',
 'abelli',
 'abellio',
 'abelly',
 'abendana',
 'abenhaim',
 'abercrombie',
 'abercromby',
 'abergavenny',
 'abe

In [None]:
from sklearn.model_selection import train_test_split

# Target column for the gender model.
y = data.loc[:, 'Inferred_Gender']

In [None]:
# Hold out a third of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Gender classifier: multinomial Naive Bayes over the TF-IDF features.
clf = MultinomialNB().fit(X_train, y_train)
# Mean accuracy on the held-out rows.
clf.score(X_test, y_test)

0.9939576642431646

In [None]:
#predict gender using input name
def predict_gender(name):
    """Return 'Female' or 'Male' for a single name string.

    The name is vectorized with the fitted ``vector1`` and classified with
    the fitted ``clf``; both are notebook-level objects that must already
    exist. A predicted label of 0 maps to 'Female', anything else to 'Male'.

    Parameters
    ----------
    name : str
        The name to classify, e.g. "mary priya".

    Returns
    -------
    str
        'Female' or 'Male'.
    """
    # Pass the transformed row straight to predict(); the original converted
    # it to a dense array first, which this rewrite drops as unnecessary.
    features = vector1.transform([name])
    # predict() returns one label per input row; take the only one explicitly
    # instead of relying on the truth value of a one-element array.
    label = clf.predict(features)[0]
    return 'Female' if label == 0 else 'Male'

In [None]:
# Distinct ethnicity labels present in the cleaned data.
pd.unique(data['Inferred_Ethnicity'])

array(['asian and pacific islander', 'black non hispanic', 'hispanic',
       'white non hispanic', 'asian and paci', 'black non hisp',
       'white non hisp'], dtype=object)

In [None]:
# Ethnicity labels as they appear in the data, truncated spellings included.
class_names = [
    'asian and pacific islander',
    'black non hispanic',
    'hispanic',
    'white non hispanic',
    'asian and paci',
    'black non hisp',
    'white non hisp',
]

In [None]:
# Number of ethnicity labels listed in class_names.
len(class_names)

7

In [None]:
# Features
Xfeatures = data['Full_Name']
# Labels. The data holds truncated spellings of three categories
# ('asian and paci', 'black non hisp', 'white non hisp'). Left as-is, the
# classifier treats each as a separate class from its full spelling, so they
# are folded back into the full label. replace() with a dict matches whole
# values only, so the full spellings are unaffected.
ylabels = data['Inferred_Ethnicity'].replace({
    'asian and paci': 'asian and pacific islander',
    'black non hisp': 'black non hispanic',
    'white non hisp': 'white non hispanic',
})

In [None]:
# Fit a separate TF-IDF vectorizer for the ethnicity model on Xfeatures.
# This rebinds X, replacing the matrix built earlier for the gender model.
vector = TfidfVectorizer()
X = vector.fit_transform(Xfeatures)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Only the first 20 terms are
# shown so the cell does not dump the whole vocabulary.
vector.get_feature_names_out()[:20]

['1765',
 '1837',
 'aabye',
 'aaes',
 'aamer',
 'aaron',
 'aaronovitch',
 'aaronsohn',
 'aasen',
 'abacco',
 'abacha',
 'abaco',
 'abada',
 'abadie',
 'aballay',
 'abalo',
 'abancour',
 'abarca',
 'abaroa',
 'abart',
 'abatantuono',
 'abate',
 'abba',
 'abbadie',
 'abbado',
 'abbamonte',
 'abbandando',
 'abbandante',
 'abbar',
 'abbas',
 'abbasi',
 'abbate',
 'abbatemarco',
 'abbatessa',
 'abbati',
 'abbaticchio',
 'abbatini',
 'abbensetts',
 'abberley',
 'abbey',
 'abbiati',
 'abbington',
 'abbondanzieri',
 'abbot',
 'abbott',
 'abbotts',
 'abboud',
 'abbruscato',
 'abby',
 'abd',
 'abdala',
 'abdallah',
 'abdellaoui',
 'abdi',
 'abdillat',
 'abdou',
 'abdoulahi',
 'abdoulaye',
 'abdul',
 'abdulaziz',
 'abdulkhaleq',
 'abdulla',
 'abdullah',
 'abdullahi',
 'abdy',
 'abecassis',
 'abeel',
 'abeijón',
 'abeille',
 'abel',
 'abelard',
 'abeles',
 'abelin',
 'abell',
 'abella',
 'abelli',
 'abellio',
 'abelly',
 'abendana',
 'abenhaim',
 'abercrombie',
 'abercromby',
 'abergavenny',
 'abe

In [None]:
# Split for the ethnicity model. This rebinds y_train and y_test, which
# until now held the gender labels.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    ylabels,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Ethnicity classifier: multinomial Naive Bayes over the TF-IDF features.
naiv = MultinomialNB().fit(x_train, y_train)
# Display the fitted estimator, as the original fit() call did.
naiv

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Prompt for the first name to classify; blocks until the user responds.
f_name = input('Enter First Name :')

Enter First Name :mary


In [None]:
# Prompt for the last name to classify; blocks until the user responds.
l_name = input('Enter Last Name :')

Enter Last Name :priya


In [None]:
# Full name in the same "<first> <last>" shape used for training.
name = ' '.join([f_name, l_name])

In [None]:
# Vectorize the name the user entered. The original wrapped the string
# literal 'name' instead of the `name` variable, so the ethnicity model was
# always queried with the word "name" rather than the actual input.
name_eth = [name]
predict_eth = vector.transform(name_eth).toarray()

In [None]:
# Predict with the ethnicity classifier; the report cell reads ethnic[0].
ethnic = naiv.predict(predict_eth)

In [None]:
# Gender label ('Female' or 'Male') for the entered full name.
inf_gender = predict_gender(name)

In [None]:
# Report the inputs and both predictions, one field per line.
print(
    'First Name :', f_name,
    '\nLast Name :', l_name,
    '\nInferred Gender :', inf_gender,
    '\nInferred Ethnicity :', ethnic[0],
)

First Name : mary 
Last Name : priya 
Inferred Gender : Female 
Inferred Ethnicity : hispanic
