In [12]:
#env python3
# -*- coding: utf-8 -*-
#current numpy version is 1.21

"""
This sample module extracts researcher names from news article content using the spaCy package.
For demonstration purposes, the news articles were taken from the "Science Daily" website (https://www.sciencedaily.com/).

Steps: 1. Get the news titles for future use, such as pairing them with the interviewed researchers.
       2. Clean the data and extract researcher names from the news content.
       3. Label researcher genders for future use, via JavaScript and HTML magic cells.

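Outputs: Three JSON files, written to the current working directory
         (the articles themselves are read from the child folder "News"):
         1. article_titles_Spacy.json - the news titles.
         2. researcher_names_articles_Spacy.json - the researcher names found in each article.
         3. researcher_genders_Spacy.json - the researcher gender labels.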

@author: Xinyu Ji GitHub: https://github.com/XinyuJi123
Common disclaimers apply. Subject to change at all time.

Last review: 13/03/2023
"""

import spacy
import os
from datetime import datetime
import re
import json
import glob
from nltk.corpus import stopwords
from spacy.tokens import Span
import en_core_web_sm
#from spacy.lang.en.stop_words import STOP_WORDS

class NLP:
    """Collect article titles and PERSON entities from the text files of a news folder.

    Parameters
    ----------
    write : bool, default True
        When truthy, each method saves its result as a JSON file in the
        current working directory.
    news_dir : str, default 'News'
        Folder that contains the ``*.txt`` article files.
    """

    def __init__(self, write=True, news_dir='News'):
        self.w = write
        self.news_dir = news_dir

    def _article_paths(self):
        """Return the paths of all ``*.txt`` files in ``news_dir``, sorted.

        Both public methods iterate over this same list so that the i-th
        title and the i-th list of names always refer to the same file.
        """
        return sorted(glob.glob(os.path.join(self.news_dir, '*.txt')))

    def get_article_titles(self):
        """Save the title of every article to 'article_titles_Spacy.json'.

        The title is the part of the file name before the first underscore.
        Nothing is written when ``write`` is falsy.

        Raises
        ------
        FileNotFoundError
            If ``news_dir`` does not exist.
        """
        # glob silently returns [] for a missing folder; fail loudly instead.
        if not os.path.isdir(self.news_dir):
            raise FileNotFoundError('News folder not found: ' + str(self.news_dir))

        fnames = [os.path.basename(path).split('_')[0]
                  for path in self._article_paths()]

        if self.w:
            with open('article_titles_Spacy.json', 'w') as output_title:
                output_title.write(json.dumps(fnames))

    def get_researcher_names(self):
        """Extract the PERSON entities of every article with spaCy.

        One list of entity texts is produced per article, in the order given
        by ``_article_paths``. When ``write`` is truthy the list of lists is
        saved to 'researcher_names_articles_Spacy.json'. The elapsed time is
        printed at the end.
        """
        person_articles = []

        # Initialise the spaCy pipeline once, outside the loop.
        nlp = en_core_web_sm.load()

        t000 = datetime.now()
        for path in self._article_paths():
            # The context manager closes each article file after reading.
            with open(path, 'r', encoding='utf-8') as article:
                text = article.read()

            # Remove punctuation (commas, periods, exclamation and question marks).
            text = re.sub(r'[,\.!?]', '', text)

            # Stop-word removal is intentionally skipped: it is not needed
            # for extracting researcher names.
            doc = nlp(text)
            person = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
            person_articles.append(person)

        if self.w:
            with open('researcher_names_articles_Spacy.json', 'w') as output_name:
                output_name.write(json.dumps(person_articles))

        t001 = datetime.now()
        dt00 = t001 - t000
        print('Name Extraction is completed after '+str(dt00.total_seconds())+' seconds')

In [13]:
# Build the extractor with file output enabled, then run both steps:
# first the article titles, then the researcher names.
Spacy = NLP(write=True)
Spacy.get_article_titles()
Spacy.get_researcher_names()

Name Extraction is completed after 0.921307 seconds


In [15]:
# The following code builds an interface for labelling the researchers mentioned in the news articles with their genders.
# Other labels, such as race, nationality, and other background factors, can be added in a similar way.
# (WIP v1.0, using JavaScript and HTML magic cells)


# Load the input JSON files.
# Each input file holds a single JSON array. The parsed array is wrapped in a
# one-element list so that `article_titles[0]` and `researcher_names[0]` keep
# the same shape as before.
with open('article_titles_Spacy.json') as output_title:
    article_titles = [json.load(output_title)]

with open('researcher_names_articles_Spacy.json') as output_name:
    researcher_names = [json.load(output_name)]

# Resume from previously saved labels, if any. The first name is the file the
# notebook's final cell writes; the second is the name this cell used to look
# for and is kept as a fallback.
genders = []
for gender_file in ('researcher_genders_Spacy.json',
                    'researcher_gender_articles_Spacy.json'):
    if os.path.exists(gender_file):
        with open(gender_file) as output_gender:
            genders = json.load(output_gender)
        break

# Map each article title to the names found in that article. zip() stops at
# the shorter list, so a length mismatch no longer raises IndexError.
researcher_dict = dict(zip(article_titles[0], researcher_names[0]))

#Make sure that we can start labelling from exactly where we left last time
def get_next_researcher():
    """Return the next `researcher_dict` value that has no gender label yet.

    The position is the current length of `genders`, i.e. one label is
    expected per `researcher_dict` entry, in insertion order.

    Returns
    -------
    The value stored in `researcher_dict` at that position, or the string
    'All researchers have been labelled.' once every entry has a label
    (previously this case raised IndexError).
    """
    pending = list(researcher_dict.values())
    position = len(genders)
    if position >= len(pending):
        return 'All researchers have been labelled.'
    return pending[position]


['Stephen Fried',
 'Anneliese M. Faustino',
 'Mikhail Makarov',
 'Alma C. Sanchez Rocha',
 'Ivan Cherepashuk',
 'Robin Krystufek',
 'Klara Hlouchova',
 'Volha Dzmitruk',
 'Tatsiana Charnavets',
 'Michal Lebl',
 'Kosuke Fujishima']

In [16]:
%%javascript
        
function set_gender(gender){
    // Append the label to the Python list `genders` in the kernel,
    // then request the next researcher.
    var append_code = "genders.append(" + gender + ")";
    IPython.notebook.kernel.execute(append_code);
    load_next_researcher();
}

function handle_output(out){
    // Show the kernel's reply in the "researcher" element.
    // Not every output message carries a `data` bundle, so guard before
    // reading "text/plain" instead of throwing on such messages.
    var content = out.content || {};
    var shown;
    if (content.data && content.data["text/plain"] !== undefined) {
        shown = content.data["text/plain"];
    } else if (content.ename !== undefined) {
        // Error reply: display the exception name and message.
        shown = content.ename + ": " + content.evalue;
    } else {
        return;
    }
    // .text() inserts the value as plain text; the names originate from
    // article content and must not be interpreted as markup.
    $("div#researcher").text(shown);
}
        
function load_next_researcher(){
    // Run get_next_researcher() in the kernel and route its output
    // to handle_output.
    IPython.notebook.kernel.execute(
        "get_next_researcher()",
        { 'iopub' : { 'output' : handle_output } },
        { silent : false }
    );
}

<IPython.core.display.Javascript object>

In [17]:
%%html
<!-- Labelling widget: the script in this cell fills #researcher and listens for key presses on #capture. -->
<div name="researcherbox">
    Instructions: Click in textbox. Enter a 1 if the researcher is female, enter 0 otherwise. <br>
Researcher Name: <div id="researcher"></div><br>
<input type="text" id="capture"><br>
</div>
        
<script>

function set_gender(gender){
    // Append the label to the Python list `genders` in the kernel,
    // then request the next researcher.
    var append_code = "genders.append(" + gender + ")";
    IPython.notebook.kernel.execute(append_code);
    load_next_researcher();
}

function handle_output(out){
    // Show the kernel's reply in the "researcher" element.
    // Not every output message carries a `data` bundle, so guard before
    // reading "text/plain" instead of throwing on such messages.
    var content = out.content || {};
    var shown;
    if (content.data && content.data["text/plain"] !== undefined) {
        shown = content.data["text/plain"];
    } else if (content.ename !== undefined) {
        // Error reply: display the exception name and message.
        shown = content.ename + ": " + content.evalue;
    } else {
        return;
    }
    // .text() inserts the value as plain text; the names originate from
    // article content and must not be interpreted as markup.
    $("div#researcher").text(shown);
}
        
function load_next_researcher(){
    // Run get_next_researcher() in the kernel and route its output
    // to handle_output.
    IPython.notebook.kernel.execute(
        "get_next_researcher()",
        { 'iopub' : { 'output' : handle_output } },
        { silent : false }
    );
}

// Record a label when 0 or 1 is typed into the capture box.
$("input#capture").keypress(function(e) {
    var gender;
    if (e.which == 48) {          // character code of "0"
        gender = 0;
    } else if (e.which == 49) {   // character code of "1"
        gender = 1;
    } else {
        return;                   // any other key is left untouched
    }
    // keypress fires before the character is inserted, so clearing the box
    // alone would still leave the typed digit behind; cancel the insertion.
    e.preventDefault();
    set_gender(gender);
    $("input#capture").val("");
});

// Display the first researcher that still needs a label.
load_next_researcher();
</script>

In [18]:
# Save the gender labels collected so far as a JSON array.
with open('researcher_genders_Spacy.json', 'w') as output_gender:
    output_gender.write(json.dumps(genders))