In [38]:
import os
import time

import numpy as np
import pandas as pd

from nameparser import HumanName
from genderize import Genderize

from data.namsor_api import NamSorApi

In [6]:
# Load the per-author table (author, number_publication, year) and display it.
df_author = pd.read_csv(filepath_or_buffer='../processed/author.csv')
df_author

Unnamed: 0,author,number_publication,year
0,H. Vincent Poor,1771,1978
1,Mohamed-Slim Alouini,1407,1997
2,Philip S. Yu,1284,1981
3,Wei Wang,1284,1987
4,Wei Zhang,1283,1990
...,...,...,...
2308638,Ünsal Özkuvanci,1,2012
2308639,Üyesi Mesut Atasever,1,2019
2308640,Þröstur Pétursson,1,2015
2308641,Þórir Harðarson,1,2008


In [2]:
# Reload the cached first-name gender table (name, gender, probability, count)
# from the CSV in the working directory and display it.
gender_list = pd.read_csv(filepath_or_buffer='gender_list.csv')
gender_list

Unnamed: 0,name,gender,probability,count
0,,,0.00,0
1,,,0.00,0
2,'Maseka,,0.00,0
3,A,male,0.68,19908
4,A'aeshah,,0.00,0
...,...,...,...,...
161709,Ünzüle,,0.00,0
161710,Ürün,,0.00,0
161711,Üyesi,,0.00,0
161712,Þórhildur,,0.00,0


In [3]:
# Keep only the names whose prediction has probability above 0.8 and is
# backed by more than one sample (count > 1).
gender_list_filtered = gender_list.loc[
    gender_list['probability'].gt(0.8) & gender_list['count'].gt(1)
]
gender_list_filtered

Unnamed: 0,name,gender,probability,count
5,A'ang,male,1.00,3
8,A'na,female,1.00,2
9,A-Li,male,0.88,8
11,A-Man,male,0.93,15
14,A-Ra,female,0.96,26
...,...,...,...,...
161694,Øyvind,male,1.00,192
161698,Ülari,male,1.00,2
161702,Ülo,male,1.00,2
161703,Ümit,male,1.00,29


In [4]:
# Load the author/mentor pairs (author, number_publication, year, mentor)
# and display them.
student_mentor = pd.read_csv(filepath_or_buffer='../processed/author_mentor.csv')
student_mentor

Unnamed: 0,author,number_publication,year,mentor
0,H. Vincent Poor,1771,1978,John B. Thomas
1,Mohamed-Slim Alouini,1407,1997,Marvin K. Simon
2,Wei Wang,1284,1987,Thomas C. Henderson
3,Lajos Hanzo,1236,1993,Raymond Steele
4,Wen Gao 0001,1190,1995,Jiarong Hong
...,...,...,...,...
1236795,Ümit Özlale,1,2018,Gonca Gürsun
1236796,Ünsal Akaslan,1,2016,Selami Serhatlioglu
1236797,Ünsal Özkuvanci,1,2012,Sadik Kara
1236798,Üyesi Mesut Atasever,1,2019,Witold Chmielarz


In [8]:
# Distinct parsed first names among the authors (students) ...
unique_student_first_name = (
    student_mentor['author']
    .apply(lambda full_name: HumanName(full_name).first)
    .unique()
)

# ... and among their mentors.
unique_mentor_first_name = (
    student_mentor['mentor']
    .apply(lambda full_name: HumanName(full_name).first)
    .unique()
)

# Sorted union of both sets; the cell displays how many distinct names there are.
unique_first_name = np.union1d(unique_student_first_name, unique_mentor_first_name)
len(unique_first_name)

161714

In [9]:
# Query Genderize for every distinct first name and cache the answer on disk
# as gender_list.csv.
#
# The API key is read from the GENDERIZE_API_KEY environment variable instead
# of being embedded in the notebook. The key that was previously hardcoded in
# this cell should be treated as leaked and revoked.
genderize_api_key = os.environ.get('GENDERIZE_API_KEY')
if not genderize_api_key:
    raise RuntimeError(
        'Set the GENDERIZE_API_KEY environment variable before running this cell.'
    )

gender = Genderize(api_key=genderize_api_key)

# Keep the raw response under its own name: `gender_list` is used as a
# DataFrame by other cells (filtering, merging on 'name'), and rebinding it to
# the raw response breaks those cells on a top-to-bottom run.
gender_records = gender.get(unique_first_name)
df_gender = pd.DataFrame(gender_records)

# index=False is the documented way to omit the row index from the CSV.
df_gender.to_csv('gender_list.csv', index=False)

# Expose the fresh result under the name the other cells expect, as a DataFrame.
gender_list = df_gender

In [17]:
# Every distinct person name appearing as an author or as a mentor,
# re-wrapped as a Series with a fresh 0..n-1 index.
author_reduced = pd.Series(
    pd.concat([student_mentor['author'], student_mentor['mentor']], axis=0).unique()
)

# Parsed first name for each of those people, aligned with author_reduced.
first_name = author_reduced.apply(lambda full_name: HumanName(full_name).first)

Unnamed: 0,0,1
0,H. Vincent Poor,H.
1,Mohamed-Slim Alouini,Mohamed-Slim
2,Wei Wang,Wei
3,Lajos Hanzo,Lajos
4,Wen Gao 0001,Wen
...,...,...
1348411,Mehmet Ilker Berkman,Mehmet
1348412,Takashi Kamihigashi,Takashi
1348413,Fernando Manuel Ramos,Fernando
1348414,Borut Gersak,Borut


In [18]:
# Two-column lookup table: full name ('author') next to its parsed first
# name ('name'), which is the key used to join with the gender table.
name = pd.concat([author_reduced, first_name], axis=1, keys=['author', 'name'])
name

Unnamed: 0,author,name
0,H. Vincent Poor,H.
1,Mohamed-Slim Alouini,Mohamed-Slim
2,Wei Wang,Wei
3,Lajos Hanzo,Lajos
4,Wen Gao 0001,Wen
...,...,...
1348411,Mehmet Ilker Berkman,Mehmet
1348412,Takashi Kamihigashi,Takashi
1348413,Fernando Manuel Ramos,Fernando
1348414,Borut Gersak,Borut


In [20]:
# Attach the per-first-name gender prediction to every person.
# Default (inner) join on 'name': people whose first name has no row in
# gender_list do not appear in the result.
join_name = pd.merge(name, gender_list, on='name')
join_name

Unnamed: 0,author,name,gender,probability,count
0,H. Vincent Poor,H.,,0.0,0
1,H. V. Jagadish,H.,,0.0,0
2,H. Jaap van den Herik,H.,,0.0,0
3,H. Jonathan Chao,H.,,0.0,0
4,H. T. Kung,H.,,0.0,0
...,...,...,...,...,...
1347730,Yuan-Bang Cheng,Yuan-Bang,,0.0,0
1347731,Hsiuying Wang,Hsiuying,,0.0,0
1347732,Gibong Jeong,Gibong,male,1.0,5
1347733,Changri Luo,Changri,male,1.0,11


In [48]:
# Refine low-confidence rows of join_name with NamSor, which is queried with
# the full author name rather than the first name only.
#
# The work is done in resumable batches: only rows at or after
# NAMSOR_START_INDEX are considered, and at most NAMSOR_MAX_ATTEMPTS lookups
# are attempted per run. Adjust NAMSOR_START_INDEX to continue from where the
# previous run stopped.
NAMSOR_START_INDEX = 550676      # first row index handled by this run
NAMSOR_MAX_ATTEMPTS = 3499       # lookups attempted per run (successes and failures)
NAMSOR_MIN_PROBABILITY = 0.8     # rows below this confidence are re-queried
NAMSOR_FAILURE_PAUSE_S = 5       # pause after a failed lookup before moving on

namesor = NamSorApi()

# Select the candidate rows once, up front, instead of materialising a Series
# for every row of the frame just to test two conditions.
needs_lookup = (
    (join_name['probability'] < NAMSOR_MIN_PROBABILITY)
    & (join_name.index >= NAMSOR_START_INDEX)
)
pending_indices = join_name.index[needs_lookup][:NAMSOR_MAX_ATTEMPTS]

for index in pending_indices:
    author = join_name.at[index, 'author']
    try:
        namsor_result = namesor.get_gender(author)
        # Read both fields before writing either, so an incomplete response
        # cannot leave the row with a new gender but the old probability.
        likely_gender = namsor_result['likelyGender']
        calibrated_probability = namsor_result['probabilityCalibrated']
    except Exception as exc:
        # Best effort: report which row failed and why, pause, then continue
        # with the next row. The failed row keeps its previous values.
        print(f'NamSor lookup failed for index {index} ({author!r}): {exc!r}')
        time.sleep(NAMSOR_FAILURE_PAUSE_S)
        continue
    join_name.loc[index, 'gender'] = likely_gender
    join_name.loc[index, 'probability'] = calibrated_probability

join_name

Unnamed: 0,author,name,gender,probability,count
0,H. Vincent Poor,H.,male,0.972930,0
1,H. V. Jagadish,H.,male,0.936064,0
2,H. Jaap van den Herik,H.,male,0.504165,0
3,H. Jonathan Chao,H.,male,0.976723,0
4,H. T. Kung,H.,male,0.946764,0
...,...,...,...,...,...
1347730,Yuan-Bang Cheng,Yuan-Bang,,0.000000,0
1347731,Hsiuying Wang,Hsiuying,,0.000000,0
1347732,Gibong Jeong,Gibong,male,1.000000,5
1347733,Changri Luo,Changri,male,1.000000,11


In [50]:
# Persist the merged table, including its row index, after the NamSor updates.
join_name.to_csv(path_or_buf='author_gender.csv')


In [52]:
# Inspect the near-coin-flip predictions: probability strictly between 0.5 and 0.6.
temp = join_name.loc[
    join_name['probability'].gt(0.5) & join_name['probability'].lt(0.6)
]
temp

Unnamed: 0,author,name,gender,probability,count
2,H. Jaap van den Herik,H.,male,0.504165,0
11,H. Vicky Zhao,H.,male,0.516929,0
18,H. F. Machiel Van der Loos,H.,male,0.500001,0
20,H. Hannah Inbarani,H.,female,0.566456,0
36,H. M. N. Dilum Bandara,H.,male,0.500039,0
...,...,...,...,...,...
1346960,Starley B. Shade,Starley,male,0.570000,7
1347185,Maido Remm,Maido,male,0.560000,68
1347482,Manicka Dhanasekar,Manicka,male,0.520000,23
1347579,Minge Xie,Minge,male,0.520000,21


In [54]:
# Keep only the confident predictions (probability strictly above 0.8).
temp2 = join_name.loc[join_name['probability'].gt(0.8)]
temp2

Unnamed: 0,author,name,gender,probability,count
0,H. Vincent Poor,H.,male,0.972930,0
1,H. V. Jagadish,H.,male,0.936064,0
3,H. Jonathan Chao,H.,male,0.976723,0
4,H. T. Kung,H.,male,0.946764,0
5,H. Jin Kim,H.,male,0.946304,0
...,...,...,...,...,...
1347728,Liaoni Wu,Liaoni,female,1.000000,1
1347729,Hsinying Liang,Hsinying,female,1.000000,2
1347732,Gibong Jeong,Gibong,male,1.000000,5
1347733,Changri Luo,Changri,male,1.000000,11


In [56]:
# Drop the first-name key and the sample count, leaving author, gender and
# probability for export.
temp3 = temp2.drop(['name', 'count'], axis=1)
temp3

Unnamed: 0,author,gender,probability
0,H. Vincent Poor,male,0.972930
1,H. V. Jagadish,male,0.936064
3,H. Jonathan Chao,male,0.976723
4,H. T. Kung,male,0.946764
5,H. Jin Kim,male,0.946304
...,...,...,...
1347728,Liaoni Wu,female,1.000000
1347729,Hsinying Liang,female,1.000000
1347732,Gibong Jeong,male,1.000000
1347733,Changri Luo,male,1.000000


In [58]:
# Export the confident author/gender table without the row index.
# index=False is the documented boolean for omitting the index; the previous
# index=None only worked because None is falsy.
temp3.to_csv('author_gender_filtered.csv', index=False)

