# Human Data Scientist Task

In [1]:
import re

# Module to get the details of a research paper using doi.
from habanero import Crossref

# Module to extract text from pdf
from pdfminer.high_level import extract_text

# Azure related modules
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

from datetime import datetime


In [2]:
# Exract text here.

pdf_file_path = './Azure Papers/VitD-Song.pdf'
text = extract_text(pdf_file_path, maxpages = 3)
print(text)

Original Article
https://doi.org/10.9758/cpn.2020.18.2.203
Clinical Psychopharmacology and Neuroscience 2020;18(2):203-213

pISSN 1738-1088 / eISSN 2093-4327
Copyrightⓒ 2020, Korean College of Neuropsychopharmacology

Vitamin D Supplementation is Beneficial for Children with Autism 
Spectrum Disorder: A Meta-analysis

Liyao  Song1,*,  Xiaomei  Luo1,*,  Qing  Jiang1,  Zhi  Chen2,  Lifang  Zhou1,  Dan  Wang1,  Ai  Chen1,3
1Department  of  Pediatrics,  2Nursing  Department,  The  Affiliated  Hospital  of  Southwest  Medical  University,  Luzhou,  3Department  of  Pediatric, 
Sichuan  Provincial  Hospital  for  Women  and  Children,  Chengdu,  Sichuan  Province,  China

Objective: We conducted a meta-analysis of  randomized controlled  trials  to explore  whether  vitamin  D  supplementation 
is  beneficial  for  symptom  improvement  in  children  with  autism  spectrum  disorder.
Methods:  We  systematically  searched  the  PubMed  database,  EMBASE,  Cochrane  Library,  Web  of  Science

In [3]:
# Regex to match doi from text
regexDOI = re.compile('[https://?doi|DOI][\s\.\:]{0,2}(10\.\d{4}[\w\:\.\-\/a-z]+)[A-Z\s]')
doi = regexDOI.findall(text)
doi

['10.9758/cpn.2020.18.2.203']

In [4]:
# Get paper details using DOI.
cr = Crossref()

# Example uses
#cr.works(ids = '10.1176/appi.ajp.2011.10050764')
try:
    paper_id = doi[0]
    if paper_id[-1] == '.':
        paper_id = paper_id[:-1]

    paper_information = cr.works(ids = paper_id)
except:
    print("doi not found in paper")

In [5]:
# Extracting individual information of the paper
try:
    authors = paper_information['message']['author']
    title = paper_information['message']['title']
    published_on = paper_information['message']['published']
    paper_url = paper_information['message']['URL']
    journal_name = paper_information['message']['container-title']

    # Converting date as date-time object
    date = published_on['date-parts'][0]
    if len(date) == 3:
        yy = date[0]
        mm = date[1]
        dd = date[2]
    else:
        yy = date[0]
        mm = date[1]
        dd = 1
        
    date = datetime(yy, mm,dd)

    # Formatting author names
    all_authors = []
    for author in authors:
        full_name = author['given'] + ' ' + author['family']
        all_authors.append(full_name)
except:
    print("doi not found in paper")

In [6]:
# Printing all the details of the paper
try:
    print("Below is the details of the paper:")
    print("Article name: ", title[0], end = "\n\n")

    print("Authors: ", end = "")

    for author in all_authors:
        print(author, end = "")
        print(", ", end = "")
        
    print("\n\nYear: ", date.year)
    print("\nJournal Name: ", journal_name[0])
except:
    print("doi not found in paper")


Below is the details of the paper:
Article name:  Vitamin D Supplementation is Beneficial for Children with Autism Spectrum Disorder: A Meta-analysis

Authors: Liyao Song, Xiaomei Luo, Qing Jiang, Zhi Chen, Lifang Zhou, Dan Wang, Ai Chen, 

Year:  2020

Journal Name:  Clinical Psychopharmacology and Neuroscience


In [7]:
## Preparing data to make a dataframe
try:
    articleName = []
    articleName.append(title[0])
    year_ = []
    year_.append(str(date.year))
    journal_name_ = []
    journal_name_.append(journal_name[0])

    ent_dict = {"ArticleName": articleName,"Authors": all_authors, "Year": year_, "JournalName": journal_name_ }
    ent_dict
except:
    ent_dict = {}
    print("doi not found in paper")

### Azure 

In [8]:
"""
Function to fetch named entity from Azure
Input - File path of Azure Credentials, List of texts
Output - List of entities for each documents
"""
def entity_recognition_from_azure(credentials_file_path: str, text: list) -> list:
    
    # Reading Azure credentials from a text file
    with open(credentials_file_path) as f:
        api_key, endpoint = f.readline().split(",")
        
    # Azure credentials
    credential = AzureKeyCredential(api_key)

    text_analytics_client = TextAnalyticsClient(endpoint, credential)
    
    
    poller = text_analytics_client.begin_analyze_healthcare_entities(text)
    result = poller.result()

    docs = [doc for doc in result if not doc.is_error]
    
    return docs


"""
Function to print named entity on console
Input - List of entities for each documents fetched from Azure
"""
def print_entities(docs: list):
    print("Results of Healthcare Entities Analysis:")
    for idx, doc in enumerate(docs):
        for idx1, entity in enumerate(doc.entities):
            print("Entity {}: {}".format(idx1,entity.text))
            print("...Normalized Text: {}".format(entity.normalized_text))
            print("...Category: {}".format(entity.category))
            print("...Subcategory: {}".format(entity.subcategory))
            print("...Offset: {}".format(entity.offset))
            print("...Confidence score: {}".format(entity.confidence_score))
            if entity.data_sources is not None:
                print("...Data Sources:")
                for data_source in entity.data_sources:
                    print("......Entity ID: {}".format(data_source.entity_id))
                    print("......Name: {}".format(data_source.name))
            print("\n")
            

"""
Function to print named entity on console
Input - List of entities for each documents fetched from Azure
"""
def print_relations(docs: list):
    print("Results of Healthcare Entities Analysis (Relations):")
    for idx, doc in enumerate((docs)):
        for idx1, relation in enumerate(doc.entity_relations):
            print("Relation number {}".format(idx1))                  
            print("Relation of type: {} has the following roles".format(relation.relation_type))
            for role in relation.roles:
                print("...Role '{}' with entity '{}'".format(role.name, role.entity.text))
            print("\n")
    
    

In [9]:
text = [text]
docss = entity_recognition_from_azure("credentials.txt", text)

In [10]:
print_entities(docss)

Results of Healthcare Entities Analysis:
Entity 0: ;18
...Normalized Text: None
...Category: Time
...Subcategory: None
...Offset: 108
...Confidence score: 0.76


Entity 1: Vitamin D Supplementation
...Normalized Text: Vitamin D supplementation
...Category: MedicationName
...Subcategory: None
...Offset: 218
...Confidence score: 0.6
...Data Sources:
......Entity ID: C4524013
......Name: UMLS
......Entity ID: 10079910
......Name: MDR


Entity 2: Children
...Normalized Text: Child
...Category: Age
...Subcategory: None
...Offset: 262
...Confidence score: 0.98
...Data Sources:
......Entity ID: C0008059
......Name: UMLS
......Entity ID: 0000003204
......Name: AOD
......Entity ID: 0060058
......Name: CCPSS
......Entity ID: 0000002765
......Name: CHV
......Entity ID: 5002-0012
......Name: CSP
......Entity ID: U000730
......Name: DXP
......Entity ID: CHD
......Name: HL7V3.0
......Entity ID: U005340
......Name: LCH
......Entity ID: sh85023418
......Name: LCH_NW
......Entity ID: LA9949-4
......Nam

In [11]:
print_relations(docss)

Results of Healthcare Entities Analysis (Relations):
Relation number 0
Relation of type: CourseOfCondition has the following roles
...Role 'Condition' with entity 'symptom'
...Role 'Course' with entity 'improvement'


Relation number 1
Relation of type: CourseOfCondition has the following roles
...Role 'Course' with entity 'improves'
...Role 'Condition' with entity 'symptoms'


Relation number 2
Relation of type: CourseOfCondition has the following roles
...Role 'Course' with entity 'improves'
...Role 'Condition' with entity 'autism  spectrum  disorder'


Relation number 3
Relation of type: ValueOfExamination has the following roles
...Role 'Value' with entity 'reduced'
...Role 'Examination' with entity 'Social  Responsiveness  Scale'


Relation number 4
Relation of type: ValueOfExamination has the following roles
...Role 'Value' with entity 'reduced'
...Role 'Examination' with entity 'Child Autism  Rating Scale'


Relation number 5
Relation of type: Abbreviation has the following role

In [12]:
age_entity = []
for idx, doc in enumerate(docss):
    for idx1, entity in enumerate(doc.entities):
        if entity.category == "Age":
            text_ = entity.text
            offset = entity.offset
            
            age_entity.append([text_, offset])


In [13]:
age_entity

[['Children', 262],
 ['children', 847],
 ['children', 1554],
 ['children', 2031],
 ['Children', 2101],
 ['children', 3295],
 ['childhood', 4871],
 ['fetal', 4903],
 ['children', 6758],
 ['children', 7038],
 ['children', 7591],
 ['children', 7806],
 ['18 years', 7822],
 ['Children', 8713],
 ['children', 11387]]

In [14]:
actual_snippet = []
for ent,offset in age_entity:
    actual_snippet.append([ent, text[0][offset - 15:offset + 25]])


In [15]:
actual_snippet

[['Children', 'Beneficial for Children with Autism \nSpe'],
 ['children', 'provement  in  children  with  autism  s'],
 ['children', 'els  among 203 children included \nfrom  '],
 ['children', 'eneficial for  children with  autism \nsp'],
 ['Children', '  Vitamin  D;  Children;  Autism  spectr'],
 ['children', 'dividuals were children [3-5]. The \ninci'],
 ['childhood', 'm, not only in childhood but al-\nso duri'],
 ['fetal', '\nso during the fetal period [18,19]. Vit'],
 ['children', 'improvement in children with ASD af-\nter'],
 ['children', 'provement \nin  children  with  autism.  '],
 ['children', '(1) studies on children with ASD \ndiagno'],
 ['children', 'dies including children aged ≤ 18 years;'],
 ['18 years', 'hildren aged ≤ 18 years; (3) studies in '],
 ['Children', 'Disorder Among Children 205\n\nFig. 2. Sea'],
 ['children', ', the included children with au-\ntism ha']]

In [16]:

for idx, doc in enumerate(docss):
    for idx1, entity in enumerate(doc.entities):
        cat = entity.category
        text_str = entity.text
        if cat not in ent_dict.keys():
            ent_dict[cat] = [text_str]
        else:
            previous_text = ent_dict[cat]
            previous_text.append(text_str)
            ent_dict[cat] = previous_text
 

In [17]:
for key in ent_dict.keys():
    temp = ent_dict[key]
    temp = set(temp)
    ent_dict[key] = temp

In [18]:
ent_dict

{'ArticleName': {'Vitamin D Supplementation is Beneficial for Children with Autism Spectrum Disorder: A Meta-analysis'},
 'Authors': {'Ai Chen',
  'Dan Wang',
  'Lifang Zhou',
  'Liyao Song',
  'Qing Jiang',
  'Xiaomei Luo',
  'Zhi Chen'},
 'Year': {'2020'},
 'JournalName': {'Clinical Psychopharmacology and Neuroscience'},
 'Time': {'30,36', ';18', 'September', 'ago', 'late 1990s'},
 'MedicationName': {'1,25-dihydroxy vitamin D3',
  'D',
  'Vitamin  D',
  'Vitamin D',
  'Vitamin D Supplementation',
  'min',
  'vita-\nmin  D',
  'vitamin',
  'vitamin  D',
  'vitamin D'},
 'Age': {'18 years', 'Children', 'childhood', 'children', 'fetal'},
 'Diagnosis': {'ASD',
  'ASDs',
  'Alzheimer’s  disease',
  'Asperger’s syn-\ndrome',
  'Autism \nSpectrum Disorder',
  'Autism  spectrum  disorder',
  'Autism Spectrum Disorder',
  'Parkinson’s disease',
  'Pervasive Developmental Disorder',
  'af',
  'autism',
  'autism \nspectrum  disorder',
  'autism  spectrum  disorder',
  'brain  disease',
  'depr

In [19]:
import pandas as pd
df = pd.DataFrame.from_dict(ent_dict, orient='index')

In [20]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
ArticleName,Vitamin D Supplementation is Beneficial for Ch...,,,,,,,,,,...,,,,,,,,,,
Authors,Zhi Chen,Xiaomei Luo,Liyao Song,Ai Chen,Dan Wang,Qing Jiang,Lifang Zhou,,,,...,,,,,,,,,,
Year,2020,,,,,,,,,,...,,,,,,,,,,
JournalName,Clinical Psychopharmacology and Neuroscience,,,,,,,,,,...,,,,,,,,,,
Time,3036,September,late 1990s,ago,;18,,,,,,...,,,,,,,,,,
MedicationName,vitamin D,vita-\nmin D,min,Vitamin D Supplementation,Vitamin D,D,vitamin,"1,25-dihydroxy vitamin D3",vitamin D,Vitamin D,...,,,,,,,,,,
Age,childhood,fetal,Children,children,18 years,,,,,,...,,,,,,,,,,
Diagnosis,brain disease,Autism spectrum disorder,language devel-\nopment retardation,epilepsy,Autism Spectrum Disorder,Pervasive Developmental Disorder,Alzheimer’s disease,ASD,autism,af,...,ASDs,Autism \nSpectrum Disorder,Parkinson’s disease,,,,,,,
CareEnvironment,Department of Paediatrics,Pediatrics,Pediatric,,,,,,,,...,,,,,,,,,,
Gender,males,females,trans,Women,,,,,,,...,,,,,,,,,,


In [21]:
### Saving the result as a CSV file. The name of the file corresponds to the name of the pdf file.
output_file_name = pdf_file_path.split("/")[2].split(".")[0]
output_file_path = "./outputs/" + output_file_name + ".csv"
df.to_csv(output_file_path)

In [22]:
# Some testing texts

documents = ["""Autism is a neurodevelopmental disorder characterized by impairment in three core symptom domains:
                socialization, communication, and repetitive/stereotyped behaviours. Other associated symptom
                domains are also affected including impulsivity/aggression, self-injury, anxiety, and mood lability.
                Divalproex has been shown to have efficacy in treating epilepsy, bipolar disorder, mood lability, and
                impulsive aggression. The present study evaluated the use of divalproex in the treatment of repetitive,
                compulsive-like symptoms of autism spectrum disorder (ASD). Thirteen individuals with ASD participated
                in an 8-wk, double-blind, placebo-controlled trial of divalproex sodium vs. placebo. There was a
                significant group difference on improvement in repetitive behaviours as measured by the Children’s
                Yale–Brown Obsessive Compulsive Scale (C-YBOCS) (p=0.037) and a large effect size (d=1.616). This
                study provides preliminary support for the use of divalproex in treating repetitive behaviours in ASD.
                Further research is needed to evaluate the specificity and mechanism of action of these findings."""]


document1 = ["""
                Patients were recruited and screened for the presence
                of ASDs at the Seaver and New York Autism Center
                of Excellence. Twenty-five subjects were screened.
                Thirteen subjects were randomized, had at least one
                post-treatment outcome measure, and were included
                in the intent-to-treat (ITT) group. One subject dropped
                out in week 5 due to lack of efficacy (on medication)
                and 12 subjects completed the trial.
                The average age of subjects was 9.5 yr (12 were
                child/adolescents with an age range 5–17 yr, and one
                was an adult, age 40 yr). Eight of the subjects were
                Caucasian, two were African American, two were
                Asian and one was Hispanic. Baseline assessments
                included comprehensive psychiatric, diagnostic,
                psychological, and medical evaluations. Diagnoses
                were established using DSM-IV criteria, Autism
                Diagnostic Interview – Revised (ADI-R), and Autism
                Diagnostic Observation Schedule (ADOS). Ten of
                the participants were diagnosed with autistic disorder,
                two were diagnosed with Asperger’s disorder and
                one was diagnosed with pervasive developmental
                disotherwise specified (PDD-NOS).
                Psychological testing included measures of cognitive
                functioning and adaptive behaviour. The sample
                represents the moderate to low functioning end of the
                autism spectrum. IQ scores for the majority of participants
                were in the mild to moderate mental retardation
                range, with a mean IQ score of 60 (range=30–104).
                Scores on the Vineland Adaptive Behaviour Scale fell
                in the moderately to severely impaired range of functioning
                (mean=44, standard deviation=22).
                Inclusion criteria for the study included subjects
                meeting DSM-IV and ADI-R criteria for an ASD
                and scoring as moderately ill on Clinical Global
                Impression Scale for Autistic Disorder (CGI-AD)
                rating. Patients were not selected on the basis of levels
                of repetitive or aggressive behaviours on study
                measures. Exclusion criteria included medical illnesses
                (with the exception of stable seizure disorder), past
                history of psychotic disorders, and recent or current
                use of divalproex, terfenadine (Seldane), or astemizole
                (Hismanal). Subjects using any psychoactive medication
                were allowed to participate in the trial only if
                the dose remained stable for at least 3 months prior to
                and during the trial. Only one participant was on a
                stable dose of risperidone prior to the study and continued
                throughout the 8 wk of the study. No other
                participant was on concomitant medications. This
                study was approved by the Mount Sinai Institutional
                Review Board. All participants provided informed
                consent prior to participation.
"""]


documents2 = ["""
Abstract
Background: Suboptimal physical activity levels and tolerance, poor motor skills and poor physical health are
demonstrated in children with Autism Spectrum Disorder (ASD). We speculate that social interaction and
communication deficits in children with ASD are two major factors that hinder these children from actively
participating in group physical activities. While previous studies have demonstrated that exercise intervention improves
motor skills and behavioral outcomes in children with ASD, these programs tend to focus only on a single sport, which
may not cater to the interests of different children with ASD. In this protocol, a game-based exercise training program
designed by a multi-disciplinary team (pediatrics, physical education and psychology) will be implemented by frontline
healthcare providers trained following the train-the-trainer (TTT) model and subjected to validation.
Method: Using a randomized controlled trial design, the effectiveness of the game-based exercise program will be
examined for 112 young children with ASD. These children were randomly assigned to two groups, which will be
tested and trained in either one of the two arms of the waitlist conditions (control and intervention). The assessment
of physical and psychological traits will be conducted at baseline (pre-test), at 16-weeks (post-treatment) and at 32-
weeks (follow-up) of the program.
Discussion: Most of the interventions designed for ASD children target either their psychological traits or physical
conditions, without bridging the two states. With the recognition of bidirectional relations between mental and
physical health, the present game-based exercise program which includes multiple level of difficulties was developed
to equip ASD children with the necessary skills for engaging in sustainable team sports or even professional sport
training. The program, if effective, will provide an entertaining and engaging training for whole-person development
among children with ASD.
Trial registration: This study is registered with the Chinese Clinical Trial Registry (ChiCTR-IOR-17011898). Registered 6th
July 2017.Abstract
Background: Suboptimal physical activity levels and tolerance, poor motor skills and poor physical health are
demonstrated in children with Autism Spectrum Disorder (ASD). We speculate that social interaction and
communication deficits in children with ASD are two major factors that hinder these children from actively
participating in group physical activities. While previous studies have demonstrated that exercise intervention improves
motor skills and behavioral outcomes in children with ASD, these programs tend to focus only on a single sport, which
may not cater to the interests of different children with ASD. In this protocol, a game-based exercise training program
designed by a multi-disciplinary team (pediatrics, physical education and psychology) will be implemented by frontline
healthcare providers trained following the train-the-trainer (TTT) model and subjected to validation.
Method: Using a randomized controlled trial design, the effectiveness of the game-based exercise program will be
examined for 112 young children with ASD. These children were randomly assigned to two groups, which will be
tested and trained in either one of the two arms of the waitlist conditions (control and intervention). The assessment
of physical and psychological traits will be conducted at baseline (pre-test), at 16-weeks (post-treatment) and at 32-
weeks (follow-up) of the program.
Discussion: Most of the interventions designed for ASD children target either their psychological traits or physical
conditions, without bridging the two states. With the recognition of bidirectional relations between mental and
physical health, the present game-based exercise program which includes multiple level of difficulties was developed
to equip ASD children with the necessary skills for engaging in sustainable team sports or even professional sport
training. The program, if effective, will provide an entertaining and engaging training for whole-person development
among children with ASD.
Trial registration: This study is registered with the Chinese Clinical Trial Registry (ChiCTR-IOR-17011898). Registered 6th
July 2017.

"""    
]

