In [1]:
#please make sure you have the package: pip install nameparser
#current numpy version is 1.21
import os
from datetime import datetime
import nltk
from nameparser.parser import HumanName
import re
from nltk.corpus import stopwords
import json
import glob
nltk.download('words')
nltk.download('stopwords')

class NLP:
    """Extract article titles and person names from the ``*.txt`` files in the working directory.

    Both extraction methods walk the same files in sorted order, so entry ``i``
    of ``article_titles.json`` and entry ``i`` of
    ``researcher_names_articles.json`` refer to the same article.

    NOTE(review): ``word_tokenize``, ``pos_tag`` and ``ne_chunk`` need NLTK data
    beyond the 'words' and 'stopwords' packages downloaded at the top of this
    file (presumably punkt, averaged_perceptron_tagger and maxent_ne_chunker) --
    confirm they are installed.
    """

    def __init__(self,write=True):
        """Store the ``write`` flag as ``self.w``; no method of this class reads it."""
        self.w=write

    def get_article_titles(self):
        """Write the title part of every ``*.txt`` file name to ``article_titles.json``.

        The title is the file name up to (not including) its first underscore;
        a name without an underscore is kept whole. The output is a single JSON
        list of strings, in sorted file-name order.
        """
        # glob returns files in arbitrary order; sort so the list lines up
        # with the per-article output of get_researcher_names().
        titles = [fname.split('_')[0] for fname in sorted(glob.glob("*.txt"))]
        with open ('article_titles.json', 'w') as output_title:
            output_title.write(json.dumps(titles))

    def get_researcher_names(self):
        """Write the PERSON entities found in each article to ``researcher_names_articles.json``.

        The output is a JSON list with one inner list per ``*.txt`` file (sorted
        by file name). Each inner list holds the multi-token PERSON chunks found
        by NLTK, every name being its tokens joined by spaces plus a trailing
        space. Prints the total elapsed time when done.
        """
        # Start the clock before any work so the report covers every article
        # and is defined even when the directory holds no .txt files.
        t000 = datetime.now()

        article_paths = sorted(glob.glob('./*.txt'))
        # Load the stop-word list once (as a set for O(1) membership tests);
        # skipped entirely when there is nothing to process.
        stop_words = set(stopwords.words('english')) if article_paths else set()

        person_articles = []
        for article_path in article_paths:
            # The context manager closes the file handle after reading.
            with open(article_path, 'r', encoding='utf-8') as article:
                text = article.read()

            # The main text is assumed to be extracted already, so cleaning is
            # limited to stripping basic punctuation before tokenizing.
            text = re.sub(r'[,\.!?]', '', text)
            tokens = nltk.tokenize.word_tokenize(text)

            # Stop-word removal is not required for name extraction; it is kept
            # so the same tokens can feed a later topic analysis.
            tokens_without_stopwords = [word for word in tokens if word not in stop_words]

            # Part-of-speech tags, then named-entity chunks.
            pos = nltk.pos_tag(tokens_without_stopwords)
            ne = nltk.ne_chunk(pos, binary = False)

            person_list = []
            for subtree in ne.subtrees(filter=lambda t: t.label() == 'PERSON'):
                parts = [leaf[0] for leaf in subtree.leaves()]
                if len(parts) > 1: #avoid grabbing lone surnames
                    # Trailing space kept for compatibility with existing output files.
                    person_list.append(' '.join(parts) + ' ')
            person_articles.append(person_list)

        with open ('researcher_names_articles.json', 'w') as output_name:
            output_name.write(json.dumps(person_articles))

        t001=datetime.now()
        dt00=t001-t000
        print('Name Extraction is completed after '+str(dt00.total_seconds())+' seconds')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\78234\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\78234\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Build the extractor, then run both export steps in order: titles first,
# then the per-article researcher names.
nlp = NLP(write=True)
for export_step in (nlp.get_article_titles, nlp.get_researcher_names):
    export_step()
#nlp.dictionary_titles_names()

Name Extraction is completed after 0.094017 seconds


In [76]:
# Load the two JSON files produced by the NLP class. Each file holds a single
# JSON document, so it is parsed as a whole rather than line by line. The
# one-element wrapper list keeps the historical shape (`article_titles[0]`,
# `researcher_names[0]`) that the rest of this notebook indexes into.
with open('article_titles.json') as title_file:
    article_titles = [json.load(title_file)]

with open('researcher_names_articles.json') as name_file:
    researcher_names = [json.load(name_file)]

# Resume from previously saved labels when available. The save cell writes
# 'researcher_genders.json'; the older 'researcher_gender_articles.json' name
# is still accepted so existing label files keep working.
genders = []
for gender_path in ('researcher_genders.json', 'researcher_gender_articles.json'):
    if os.path.exists(gender_path):
        with open(gender_path) as gender_file:
            genders = json.load(gender_file)
        break

# Map each article title to the names extracted from that article. Entries are
# paired by position; zip stops at the shorter list. Articles sharing a title
# collapse into one key (last one wins), as before.
researcher_dict = dict(zip(article_titles[0], researcher_names[0]))

                
def get_next_researcher():
    """Return the extracted names for the next article awaiting a gender label.

    ``genders`` receives one entry per label submitted from the labelling UI,
    so its length is the position of the next entry in ``researcher_dict``.

    Returns:
        The list of name strings stored at position ``len(genders)`` of
        ``researcher_dict``, or a short completion message (``str``) once
        every entry already has a label. Returning a message instead of
        raising ``IndexError`` keeps the front-end output callback working
        after the last label is entered.
    """
    pending = list(researcher_dict.values())
    if len(genders) >= len(pending):
        return 'All researchers have been labelled.'
    return pending[len(genders)]


# Cell output: the name list at position len(genders), i.e. the next entry to label.
get_next_researcher()

['Stephen Fried Johns Hopkins ',
 'Charles University Czech Republic ',
 'Human Frontier Science Program ',
 'Anneliese M Faustino Johns Hopkins ',
 'Mikhail Makarov Alma ',
 'Sanchez Rocha Ivan Cherepashuk Robin Krystufek Klara Hlouchova Charles University ',
 'Volha Dzmitruk Tatsiana Charnavets Michal Lebl Czech Academy Sciences ',
 'Kosuke Fujishima Tokyo Institute Technology ']

In [68]:
%%javascript
        
/**
 * Append `gender` to the kernel-side `genders` list, then request the next
 * researcher for display.
 *
 * Note: the %%html cell's <script> defines a function with the same name;
 * keep the two in sync.
 */
function set_gender(gender){
    var append_code = "genders.append(" + gender + ")";
    IPython.notebook.kernel.execute(append_code);
    load_next_researcher();
}

/**
 * iopub output callback: show the kernel's text/plain result in the
 * `div#researcher` element.
 *
 * Output messages without a `content.data["text/plain"]` entry (for example
 * stream or error messages) are ignored instead of throwing. The value is
 * inserted with .text() so it is displayed literally and never parsed as HTML.
 *
 * Note: the %%html cell's <script> defines a function with the same name;
 * keep the two in sync.
 */
function handle_output(out){
    var data = out && out.content && out.content.data;
    if (!data || data["text/plain"] === undefined) {
        return;
    }
    $("div#researcher").text(data["text/plain"]);
}
        
/**
 * Run `get_next_researcher()` in the kernel (non-silently) and route its
 * iopub output to handle_output.
 *
 * Note: the %%html cell's <script> defines a function with the same name;
 * keep the two in sync.
 */
function load_next_researcher(){
    IPython.notebook.kernel.execute(
        "get_next_researcher()",
        { iopub: { output: handle_output } },
        { silent: false }
    );
}

<IPython.core.display.Javascript object>

In [69]:
%%html
<div name="researcherbox">
    Instructions: Click in the text box, then type 1 if the researcher is female or 0 otherwise.<br>
    Researcher Name: <div id="researcher"></div><br>
    <input type="text" id="capture"><br>
</div>
        
<script>

/**
 * Append `gender` to the kernel-side `genders` list, then request the next
 * researcher for display.
 *
 * Note: the %%javascript cell defines a function with the same name;
 * keep the two in sync.
 */
function set_gender(gender){
    var append_code = "genders.append(" + gender + ")";
    IPython.notebook.kernel.execute(append_code);
    load_next_researcher();
}

/**
 * iopub output callback: show the kernel's text/plain result in the
 * `div#researcher` element.
 *
 * Output messages without a `content.data["text/plain"]` entry (for example
 * stream or error messages) are ignored instead of throwing. The value is
 * inserted with .text() so it is displayed literally and never parsed as HTML.
 *
 * Note: the %%javascript cell defines a function with the same name;
 * keep the two in sync.
 */
function handle_output(out){
    var data = out && out.content && out.content.data;
    if (!data || data["text/plain"] === undefined) {
        return;
    }
    $("div#researcher").text(data["text/plain"]);
}
        
/**
 * Run `get_next_researcher()` in the kernel (non-silently) and route its
 * iopub output to handle_output.
 *
 * Note: the %%javascript cell defines a function with the same name;
 * keep the two in sync.
 */
function load_next_researcher(){
    IPython.notebook.kernel.execute(
        "get_next_researcher()",
        { iopub: { output: handle_output } },
        { silent: false }
    );
}

// Label the researcher currently shown from a single key press in the capture
// box: "0" (char code 48) records 0 and "1" (char code 49) records 1. Any
// other key is left to the browser's default handling.
$("input#capture").keypress(function(e) {
    var gender;
    if (e.which == 48) {
        gender = 0;
    } else if (e.which == 49) {
        gender = 1;
    } else {
        return;
    }
    // keypress fires before the character is inserted into the input, so
    // clearing the value alone would still leave the typed digit behind;
    // cancel the default insertion to keep the box empty.
    e.preventDefault();
    set_gender(gender);
    $("input#capture").val("");
});

// Show the first researcher to label as soon as this script runs.
load_next_researcher();
</script>

In [73]:
# Persist the gender labels collected so far as one JSON list.
with open('researcher_genders.json', 'w') as gender_file:
    gender_file.write(json.dumps(genders))