# Human Data Scientist Task

In [1]:
import re

# Module to get the details of a research paper using doi.
from habanero import Crossref

# Module to extract text from pdf
from pdfminer.high_level import extract_text

# Azure related modules
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

from datetime import datetime


In [2]:
# Exract text here.

pdf_file_path = './Azure Papers/GlutenCaseinFree-Elder.pdf'
text = extract_text(pdf_file_path, maxpages = 3)
print(text[:500])

Journal of Autism and Developmental Disorders, Vol. 36, No. 3, April 2006 ((cid:1) 2006)
DOI 10.1007/s10803-006-0079-0
Published Online: March 23, 2006

The Gluten-Free, Casein-Free Diet In Autism: Results
of A Preliminary Double Blind Clinical Trial

Jennifer Harrison Elder,1,3 Meena Shankar,2 Jonathan Shuster,2 Douglas Theriaque,2
Sylvia Burns,1 and Lindsay Sherrill1

This study tested the eﬃcacy of a gluten-free and casein-free (GFCF) diet in treating autism
using a randomized, double blind r


In [3]:
# Regex to match doi from text
regexDOI = re.compile('[https://?doi|DOI][\s\.\:]{0,2}(10\.\d{4}[\w\:\.\-\/a-z]+)[A-Z\s]')
doi = regexDOI.findall(text)
print(f"doi extracted from the paper: {doi[0]}")

doi extracted from the paper: 10.1007/s10803-006-0079-0


In [4]:
# Get paper details using DOI.
cr = Crossref()

# Example uses
#cr.works(ids = '10.1176/appi.ajp.2011.10050764')
try:
    paper_id = doi[0]
    if paper_id[-1] == '.':
        paper_id = paper_id[:-1]

    paper_information = cr.works(ids = paper_id)
except:
    print("doi not found in paper")

In [5]:
# Extracting individual information of the paper
try:
    authors = paper_information['message']['author']
    title = paper_information['message']['title']
    published_on = paper_information['message']['published']
    paper_url = paper_information['message']['URL']
    journal_name = paper_information['message']['container-title']

    # Converting date as date-time object
    date = published_on['date-parts'][0]
    if len(date) == 3:
        yy = date[0]
        mm = date[1]
        dd = date[2]
    else:
        yy = date[0]
        mm = date[1]
        dd = 1
        
    date = datetime(yy, mm,dd)

    # Formatting author names
    all_authors = []
    for author in authors:
        full_name = author['given'] + ' ' + author['family']
        all_authors.append(full_name)
except:
    print("doi not found in paper")

In [6]:
# Printing all the details of the paper
try:
    print("Below is the details of the paper:")
    print("Article name: ", title[0], end = "\n\n")

    print("Authors: ", end = "")

    for author in all_authors:
        print(author, end = "")
        print(", ", end = "")
        
    print("\n\nYear: ", date.year)
    print("\nJournal Name: ", journal_name[0])
except:
    print("doi not found in paper")


Below is the details of the paper:
Article name:  The Gluten-Free, Casein-Free Diet In Autism: Results of A Preliminary Double Blind Clinical Trial

Authors: Jennifer Harrison Elder, Meena Shankar, Jonathan Shuster, Douglas Theriaque, Sylvia Burns, Lindsay Sherrill, 

Year:  2006

Journal Name:  Journal of Autism and Developmental Disorders


In [7]:
## Preparing data to make a dataframe
try:
    articleName = []
    articleName.append(title[0])
    year_ = []
    year_.append(str(date.year))
    journal_name_ = []
    journal_name_.append(journal_name[0])

    ent_dict = {"ArticleName": articleName,"Authors": all_authors, "Year": year_, "JournalName": journal_name_ }
    ent_dict
except:
    ent_dict = {}
    print("doi not found in paper")

### Azure 

In [8]:
"""
Function to fetch named entity from Azure
Input - File path of Azure Credentials, List of texts
Output - List of entities for each documents
"""
def entity_recognition_from_azure(credentials_file_path: str, text: list) -> list:
    
    # Reading Azure credentials from a text file
    with open(credentials_file_path) as f:
        api_key, endpoint = f.readline().split(",")
        
    # Azure credentials
    credential = AzureKeyCredential(api_key)

    text_analytics_client = TextAnalyticsClient(endpoint, credential)
    
    
    poller = text_analytics_client.begin_analyze_healthcare_entities(text)
    result = poller.result()

    docs = [doc for doc in result if not doc.is_error]
    
    return docs


"""
Function to print named entity on console
Input - List of entities for each documents fetched from Azure
"""
def print_entities(docs: list):
    print("Results of Healthcare Entities Analysis:")
    for idx, doc in enumerate(docs):
        for idx1, entity in enumerate(doc.entities):
            print("Entity {}: {}".format(idx1,entity.text))
            print("...Normalized Text: {}".format(entity.normalized_text))
            print("...Category: {}".format(entity.category))
            print("...Subcategory: {}".format(entity.subcategory))
            print("...Offset: {}".format(entity.offset))
            print("...Confidence score: {}".format(entity.confidence_score))
            if entity.data_sources is not None:
                print("...Data Sources:")
                for data_source in entity.data_sources:
                    print("......Entity ID: {}".format(data_source.entity_id))
                    print("......Name: {}".format(data_source.name))
            print("\n")
            

"""
Function to print named entity on console
Input - List of entities for each documents fetched from Azure
"""
def print_relations(docs: list):
    print("Results of Healthcare Entities Analysis (Relations):")
    for idx, doc in enumerate((docs)):
        for idx1, relation in enumerate(doc.entity_relations):
            print("Relation number {}".format(idx1))                  
            print("Relation of type: {} has the following roles".format(relation.relation_type))
            for role in relation.roles:
                print("...Role '{}' with entity '{}'".format(role.name, role.entity.text))
            print("\n")
    
    

In [9]:
text = [text]
docss = entity_recognition_from_azure("credentials.txt", text)

In [10]:
# print_entities(docss)

In [11]:
# print_relations(docss)

In [12]:
age_entity = []
for idx, doc in enumerate(docss):
    for idx1, entity in enumerate(doc.entities):
        if entity.category == "Age":
            text_ = entity.text
            offset = entity.offset
            
            age_entity.append([text_, offset])


In [13]:
actual_snippet = []
for ent,offset in age_entity:
    actual_snippet.append([ent, text[0][offset - 15:offset + 25]])


In [14]:

for idx, doc in enumerate(docss):
    for idx1, entity in enumerate(doc.entities):
        cat = entity.category
        text_str = entity.text
        if cat not in ent_dict.keys():
            ent_dict[cat] = [text_str]
        else:
            previous_text = ent_dict[cat]
            previous_text.append(text_str)
            ent_dict[cat] = previous_text
 

In [15]:
for key in ent_dict.keys():
    temp = ent_dict[key]
    temp = set(temp)
    ent_dict[key] = temp

In [16]:
import pandas as pd
df = pd.DataFrame.from_dict(ent_dict, orient='index')

In [17]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
ArticleName,"The Gluten-Free, Casein-Free Diet In Autism: R...",,,,,,,,,,...,,,,,,,,,,
Authors,Lindsay Sherrill,Sylvia Burns,Jennifer Harrison Elder,Meena Shankar,Douglas Theriaque,Jonathan Shuster,,,,,...,,,,,,,,,,
Year,2006,,,,,,,,,,...,,,,,,,,,,
JournalName,Journal of Autism and Developmental Disorders,,,,,,,,,,...,,,,,,,,,,
Diagnosis,infections,autism spectrum disorder,inﬂammatory,autistic symptoms,immune reactions,disorder,Developmental Disorders,ASD,autism,hyperactive,...,,,,,,,,,,
Date,"3, April 2006","March 23, 2006",,,,,,,,,...,,,,,,,,,,
TreatmentName,restrict additives,diets,casein,free diet,preservatives,Dietary Intervention,GFCF diet,gluten–casein free diet,restricted diets,Gluten-Free,...,Dietary treatment,sugars,,,,,,,,
AdministrativeEvent,clinical trials,controlled clinical trial,studies,Double Blind Clinical Trial,randomized single blind\nstudy,study,Dietary Clinical Trials,randomized double blind clinical\ntrial,double blind\ncontrolled clinical trials,,...,,,,,,,,,,
SymptomOrSign,improvement,autistic symptoms,impairments in\nfunctional communication abili...,symptoms,motor\nproblems,abnormal\nintestinal\npermeability,social relatedness,cut,behavior,aggressive behav-\nior,...,,,,,,,,,,
Age,2–16 years,child,4.1 years,children,7.32 years,,,,,,...,,,,,,,,,,


In [18]:
### Saving the result as a CSV file. The name of the file corresponds to the name of the pdf file.
output_file_name = pdf_file_path.split("/")[2].split(".")[0]
output_file_path = "./outputs/" + output_file_name + ".csv"
df.to_csv(output_file_path)

In [19]:
# Some testing texts

documents = ["""Autism is a neurodevelopmental disorder characterized by impairment in three core symptom domains:
                socialization, communication, and repetitive/stereotyped behaviours. Other associated symptom
                domains are also affected including impulsivity/aggression, self-injury, anxiety, and mood lability.
                Divalproex has been shown to have efficacy in treating epilepsy, bipolar disorder, mood lability, and
                impulsive aggression. The present study evaluated the use of divalproex in the treatment of repetitive,
                compulsive-like symptoms of autism spectrum disorder (ASD). Thirteen individuals with ASD participated
                in an 8-wk, double-blind, placebo-controlled trial of divalproex sodium vs. placebo. There was a
                significant group difference on improvement in repetitive behaviours as measured by the Children’s
                Yale–Brown Obsessive Compulsive Scale (C-YBOCS) (p=0.037) and a large effect size (d=1.616). This
                study provides preliminary support for the use of divalproex in treating repetitive behaviours in ASD.
                Further research is needed to evaluate the specificity and mechanism of action of these findings."""]


document1 = ["""
                Patients were recruited and screened for the presence
                of ASDs at the Seaver and New York Autism Center
                of Excellence. Twenty-five subjects were screened.
                Thirteen subjects were randomized, had at least one
                post-treatment outcome measure, and were included
                in the intent-to-treat (ITT) group. One subject dropped
                out in week 5 due to lack of efficacy (on medication)
                and 12 subjects completed the trial.
                The average age of subjects was 9.5 yr (12 were
                child/adolescents with an age range 5–17 yr, and one
                was an adult, age 40 yr). Eight of the subjects were
                Caucasian, two were African American, two were
                Asian and one was Hispanic. Baseline assessments
                included comprehensive psychiatric, diagnostic,
                psychological, and medical evaluations. Diagnoses
                were established using DSM-IV criteria, Autism
                Diagnostic Interview – Revised (ADI-R), and Autism
                Diagnostic Observation Schedule (ADOS). Ten of
                the participants were diagnosed with autistic disorder,
                two were diagnosed with Asperger’s disorder and
                one was diagnosed with pervasive developmental
                disotherwise specified (PDD-NOS).
                Psychological testing included measures of cognitive
                functioning and adaptive behaviour. The sample
                represents the moderate to low functioning end of the
                autism spectrum. IQ scores for the majority of participants
                were in the mild to moderate mental retardation
                range, with a mean IQ score of 60 (range=30–104).
                Scores on the Vineland Adaptive Behaviour Scale fell
                in the moderately to severely impaired range of functioning
                (mean=44, standard deviation=22).
                Inclusion criteria for the study included subjects
                meeting DSM-IV and ADI-R criteria for an ASD
                and scoring as moderately ill on Clinical Global
                Impression Scale for Autistic Disorder (CGI-AD)
                rating. Patients were not selected on the basis of levels
                of repetitive or aggressive behaviours on study
                measures. Exclusion criteria included medical illnesses
                (with the exception of stable seizure disorder), past
                history of psychotic disorders, and recent or current
                use of divalproex, terfenadine (Seldane), or astemizole
                (Hismanal). Subjects using any psychoactive medication
                were allowed to participate in the trial only if
                the dose remained stable for at least 3 months prior to
                and during the trial. Only one participant was on a
                stable dose of risperidone prior to the study and continued
                throughout the 8 wk of the study. No other
                participant was on concomitant medications. This
                study was approved by the Mount Sinai Institutional
                Review Board. All participants provided informed
                consent prior to participation.
"""]


documents2 = ["""
Abstract
Background: Suboptimal physical activity levels and tolerance, poor motor skills and poor physical health are
demonstrated in children with Autism Spectrum Disorder (ASD). We speculate that social interaction and
communication deficits in children with ASD are two major factors that hinder these children from actively
participating in group physical activities. While previous studies have demonstrated that exercise intervention improves
motor skills and behavioral outcomes in children with ASD, these programs tend to focus only on a single sport, which
may not cater to the interests of different children with ASD. In this protocol, a game-based exercise training program
designed by a multi-disciplinary team (pediatrics, physical education and psychology) will be implemented by frontline
healthcare providers trained following the train-the-trainer (TTT) model and subjected to validation.
Method: Using a randomized controlled trial design, the effectiveness of the game-based exercise program will be
examined for 112 young children with ASD. These children were randomly assigned to two groups, which will be
tested and trained in either one of the two arms of the waitlist conditions (control and intervention). The assessment
of physical and psychological traits will be conducted at baseline (pre-test), at 16-weeks (post-treatment) and at 32-
weeks (follow-up) of the program.
Discussion: Most of the interventions designed for ASD children target either their psychological traits or physical
conditions, without bridging the two states. With the recognition of bidirectional relations between mental and
physical health, the present game-based exercise program which includes multiple level of difficulties was developed
to equip ASD children with the necessary skills for engaging in sustainable team sports or even professional sport
training. The program, if effective, will provide an entertaining and engaging training for whole-person development
among children with ASD.
Trial registration: This study is registered with the Chinese Clinical Trial Registry (ChiCTR-IOR-17011898). Registered 6th
July 2017.Abstract
Background: Suboptimal physical activity levels and tolerance, poor motor skills and poor physical health are
demonstrated in children with Autism Spectrum Disorder (ASD). We speculate that social interaction and
communication deficits in children with ASD are two major factors that hinder these children from actively
participating in group physical activities. While previous studies have demonstrated that exercise intervention improves
motor skills and behavioral outcomes in children with ASD, these programs tend to focus only on a single sport, which
may not cater to the interests of different children with ASD. In this protocol, a game-based exercise training program
designed by a multi-disciplinary team (pediatrics, physical education and psychology) will be implemented by frontline
healthcare providers trained following the train-the-trainer (TTT) model and subjected to validation.
Method: Using a randomized controlled trial design, the effectiveness of the game-based exercise program will be
examined for 112 young children with ASD. These children were randomly assigned to two groups, which will be
tested and trained in either one of the two arms of the waitlist conditions (control and intervention). The assessment
of physical and psychological traits will be conducted at baseline (pre-test), at 16-weeks (post-treatment) and at 32-
weeks (follow-up) of the program.
Discussion: Most of the interventions designed for ASD children target either their psychological traits or physical
conditions, without bridging the two states. With the recognition of bidirectional relations between mental and
physical health, the present game-based exercise program which includes multiple level of difficulties was developed
to equip ASD children with the necessary skills for engaging in sustainable team sports or even professional sport
training. The program, if effective, will provide an entertaining and engaging training for whole-person development
among children with ASD.
Trial registration: This study is registered with the Chinese Clinical Trial Registry (ChiCTR-IOR-17011898). Registered 6th
July 2017.

"""    
]

