# Get ready

First, install and import the required packages and set up the NLP tools.

Then, check the file location and import the .csv files. Remove any rows with empty text fields.

Save a data frame with all the texts and another with only those texts that mention the keywords of interest. 

In [1]:
%%capture

# Install the NLP packages used below via pip: nltk, spaCy and spaCy's large English model.
# The '%%capture' at the top of this cell suppresses the output (which is normally quite long).
# You can remove or comment it out if you prefer to see the output.
!pip install nltk
!pip install spacy -q
!python -m spacy download en_core_web_lg -q


In [2]:
%%capture

import os                         # os is a module for navigating your machine (e.g., file directories).
import nltk                       # nltk stands for natural language tool kit and is useful for text-mining. 
from nltk import word_tokenize    # and some of its key functions
from nltk import sent_tokenize  
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.corpus import wordnet                    # Finally, things we need for lemmatising!
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
nltk.download('averaged_perceptron_tagger')        # Like a POS-tagger...
nltk.download('wordnet')
nltk.download('webtext')
from nltk.corpus import webtext

import pandas as pd
pd.set_option('display.max_colwidth', 200)
import numpy as np
import statistics
import datetime
date = datetime.date.today()

import codecs
import csv                        # csv is for importing and working with csv files

from collections import Counter

import statistics
import re                         # things we need for RegEx corrections
import matplotlib.pyplot as plt
import string 
import spacy 
from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 
nlp = spacy.load('en_core_web_lg')
nlp.max_length = 1500000 #or any large value, as long as you don't run out of RAM

import math 

# Characters stripped from tokens when removing punctuation (used via str.translate).
# The backslash is written as an explicit "\\" escape: a bare "\]" is an invalid escape
# sequence that only works by accident and triggers a warning on recent Python versions.
English_punctuation = "-!\"#$%&()'*-–+,./:;<=>?@[\\]^_`{|}~''“”"
# Translation table that deletes every character listed above.
table_punctuation = str.maketrans('', '', English_punctuation)

In [3]:
# Check the file location: list what is in the results folder before importing it.
print(os.listdir("..\\results")  )

# Frames loaded by the most recent import_results() call (kept for inspection).
files = []


def _read_results_csv(path):
    """Read one CSV, trying UTF-8 first and falling back to latin1.

    Decoding a UTF-8 file as latin1 never fails, but it silently turns
    non-ASCII characters into mojibake (for example the "fi" ligature
    becomes "ï¬"), so latin1 is only used when the file is not valid UTF-8.
    """
    try:
        return pd.read_csv(path, encoding='utf-8')
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='latin1')


def import_results(input):
    """Load every .csv file in a folder and concatenate them into one DataFrame.

    Parameters
    ----------
    input : str
        Path of the folder holding the CSV files. (The name shadows the
        built-in ``input``; it is kept so existing calls keep working.)

    Returns
    -------
    pandas.DataFrame
        All rows of all CSV files, in file-name order. The original row
        indices of each file are kept, so the index is not unique.

    Raises
    ------
    ValueError
        If the folder contains no .csv files.
    """
    # Sort for a reproducible row order and skip non-CSV entries
    # (e.g. checkpoint folders) that read_csv cannot parse.
    csv_names = sorted(name for name in os.listdir(input)
                       if name.lower().endswith('.csv'))
    if not csv_names:
        raise ValueError("No .csv files found in {!r}".format(input))

    # Build the list locally so that calling the function twice does not
    # accumulate (and therefore duplicate) the frames of the earlier call.
    frames = [_read_results_csv(os.path.join(input, name)) for name in csv_names]
    files[:] = frames
    return pd.concat(frames)

['ESHG2001abstractICHG.csv', 'ESHG2002Abstracts.csv', 'ESHG2003Abstracts.csv', 'ESHG2004.csv', 'ESHG2005Abstracts.csv', 'ESHG2006Abstracts.csv', 'ESHG2007Abstracts.csv', 'ESHG2008Abstracts.csv', 'ESHG2009Abstracts.csv', 'ESHG2010Abstracts.csv', 'ESHG2011Abstracts.csv', 'ESHG2012Abstracts.csv', 'ESHG2013Abstracts.csv', 'ESHG2014Abstracts.csv', 'ESHG2015Abstracts.csv', 'ESHG2016Abstracts.csv', 'ESHG2017 electronic posters.csv', 'ESHG2017 oral presentation.csv', 'ESHG2018 electronic posters.csv', 'ESHG2018 oral presentation.csv', 'ESHG2019 electronic posters.csv', 'ESHG2019 oral presentation.csv', 'ESHG2020 electronic posters.csv', 'ESHG2020 oral presentation.csv', 'ESHG2021 electronic posters.csv', 'ESHG2021 oral presentation.csv']


In [4]:
# Load every results CSV into a single frame and report how many rows were read.
all_results = import_results(input="..\\results")
all_results.shape[0]

34630

In [5]:
# Read the 2004 abstracts on their own (default encoding) and keep only the first two columns.
# NOTE(review): ESHG2004.csv also appears in the ..\results listing, so import_results has
# presumably loaded it already; confirm that adding this second copy is intended.
year_04 = pd.read_csv('..\\results\\ESHG2004.csv').iloc[:, [0, 1]]
year_04

Unnamed: 0,Year,Text
0,2004,"L01Multiple Sulfatase Defi ciency: Molecular defect and properties of the autosomal forms of epigenetic mosaicism can be caused by missing enzyme. retrotransposon activity. K. von Figura, M. Maria..."
1,2004,L04Regional differences in genetic testing and counselling in Europe - An overview
2,2004,"L02Biogenesis of mitochondria: Human diseases linked to S. Aymé protein transport, folding and degradation INSERM, Paris, France. W. Neupert"
3,2004,L05Hereditary Breast/Ovarian Cancer risk: international energy present in oxidizable substrates is transduced into energy comparison of the acceptability of Preventive strategies stored in ATP. Mi...
4,2004,L06Variation in prenatal counselling in Europe: the example of highly motile within the cell. Quite a number of genes are involved Klinefelter in these processes which are closely linked to the in...
...,...,...
2200,2004,"C7 A10 in the aetiology of cystinuria, we could not identify any Affected children may have only one episode of illness or multiple mutation in SL"
2201,2004,"C7 A10 in the two families. Nevertheless, there remains recurrences. A common mutation (985A >G) has been identiÜed the possibility that other genes are involved in cystinuria. Further among pa..."
2202,2004,"P0845Inactivation of the spasmolytic trefoil peptide (Tff2) leads In this study, two unrelated MCAD patients, compound heterozygous to increased expression of additional gastroprotective factors..."
2203,2004,"P0843MCDR1 Locus - Screening for candidate genes. functional disturbance in stomach and gut, Tff2-/- constructs do N. Udar1, M. Chalukya1, R. Silva-Garcia1, J. Yeh1, P. Wong1,2, K. Small1 not di..."


In [6]:
# Append the separately read 2004 rows to the combined frame.
# Re-running this cell appends them again, so run it once per fresh kernel.
all_results = pd.concat((all_results, year_04))
all_results.shape[0]

36835

In [7]:
# Keep only the rows that actually have a text.
has_text = all_results['Text'].notnull()
no_null_texts = all_results[has_text]
no_null_texts.shape[0]

36184

In [8]:
# Flag the texts that mention any of the keywords of interest, then keep just those.
keyword_hits = no_null_texts['Text'].str.contains('autis|Autis|ASD|Asperger|asperger')
matched_texts = no_null_texts[keyword_hits]
len(matched_texts)

929

# Sentence tokenisation

In [None]:
# Split every matched abstract into a list of sentences.
sentences = [sent_tokenize(abstract) for abstract in matched_texts['Text']]
# matched_texts is a filtered slice of no_null_texts, so writing a new column into it
# in place triggers pandas' SettingWithCopyWarning. .assign() returns a new frame with
# the 'Sentence' column instead, and .explode() then creates one row per sentence.
matched_texts = matched_texts.assign(Sentence=sentences).explode('Sentence')


In [None]:
# Keep only the sentences that themselves contain a keyword, drop exact duplicate rows,
# and check how many sentences remain.
sentence_hits = matched_texts['Sentence'].str.contains('autis|Autis|ASD|Asperger|asperger')
matched_sents = matched_texts[sentence_hits].drop_duplicates()
len(matched_sents)

## Person-first pattern

In [None]:
# Person-first token patterns: a noun, the word "with", up to three optional
# adjectival modifiers ('amod'), and finally the keyword itself.
# The three patterns differ only in the keyword regex.
pattern_2, pattern_3, pattern_4 = (
    [
        {"POS": "NOUN"},
        {'LOWER': 'with'},
        *({'DEP': 'amod', 'OP': "?"} for _ in range(3)),
        {"TEXT": {"REGEX": keyword_regex}},
    ]
    for keyword_regex in ("^[Aa]utism$", "^[Aa]sperger$", "^ASD$")
)

# Matcher holding the person-first patterns. The identity-first section rebinds
# the name `matcher`, so the person-first column must be computed before then.
matcher = Matcher(nlp.vocab)
matcher.add("matching_1", [pattern_2, pattern_3, pattern_4])

In [None]:
def find_pattern1_match(input):
    """Return the spans of `input` matched by the current global `matcher`.

    The text is parsed with the global `nlp` pipeline and handed to whichever
    Matcher the module-level name `matcher` is bound to when the function is
    CALLED. The notebook rebinds `matcher` between the person-first and the
    identity-first sections, so the result depends on which of those cells
    ran last.

    Parameters
    ----------
    input : str
        The sentence to search. (The name shadows the built-in ``input``;
        it is kept so existing calls keep working.)

    Returns
    -------
    str or list
        '' when nothing matches, otherwise a list with one span (a slice of
        the parsed document) per match, in the order the matcher reports them.
    """
    doc = nlp(input)
    matches = matcher(doc)
    if not matches:
        # Callers filter on the empty string, so keep it as the "no match" marker.
        return ''
    # Only the token offsets are needed; the match id is not used.
    return [doc[start:end] for _match_id, start, end in matches]



In [None]:
# Run the person-first matcher over every sentence. Each cell holds either '' (no match)
# or the list of matched spans. `matcher` must still be the person-first Matcher here.
matched_sents['Person-first'] = matched_sents['Sentence'].apply(find_pattern1_match)


## Identity-first pattern

In [None]:
# Identity-first token patterns: the keyword first, then up to three optional
# adjectival modifiers ('amod'), then a noun.
# The three patterns differ only in the keyword regex.
pattern_a, pattern_b, pattern_c = (
    [
        {"TEXT": {"REGEX": keyword_regex}},
        *({'DEP': 'amod', 'OP': "?"} for _ in range(3)),
        {"POS": "NOUN"},
    ]
    for keyword_regex in ("^[Aa]utistic", "^[Aa]sperger", "^ASD")
)

# Rebind `matcher` to a fresh Matcher holding the identity-first patterns.
# From here on find_pattern1_match() looks for identity-first matches.
matcher = Matcher(nlp.vocab)
matcher.add("matching_2", [pattern_a, pattern_b, pattern_c])


In [None]:
# Run the identity-first matcher over every sentence. Each cell holds either '' (no match)
# or the list of matched spans. `matcher` must be the identity-first Matcher here.
matched_sents['Identity-first'] = matched_sents['Sentence'].apply(find_pattern1_match)

In [None]:
# Number of sentence rows before the match columns are exploded.
len(matched_sents)

In [None]:
# Unpack the lists of person-first spans: one row per matched span
# (rows holding '' are left as they are).
matched_sents = matched_sents.explode(column='Person-first')
matched_sents.shape[0]

In [None]:
# Unpack the lists of identity-first spans: one row per matched span
# (rows holding '' are left as they are).
matched_sents = matched_sents.explode(column='Identity-first')
matched_sents.shape[0]

In [None]:
# Keep the rows where at least one of the two match columns is not the empty-string marker.
person_first_found = matched_sents['Person-first'] != ''
identity_first_found = matched_sents['Identity-first'] != ''
final = matched_sents[person_first_found | identity_first_found]
len(final)

In [None]:
# Display the rows that have a person-first or an identity-first match.
final

In [None]:
# Save the matches. The output folder is created first because to_csv() fails when the
# folder is missing. os.path.join yields the same '..\output\...' path on Windows and a
# valid path on other systems.
output_dir = os.path.join('..', 'output')
os.makedirs(output_dir, exist_ok=True)
final.to_csv(os.path.join(output_dir, 'text_match_results.csv'))

In [None]:
# List the contents of the output folder that the charting section reads from.
print(os.listdir("..\\..\\2023_Second_analysis\\output")  )

## Chart person-first and identity-first counts by year

In [9]:
# Reload the saved matches, drop rows that are empty in every column,
# and store Year as a nullable integer.
person_identity_first = (
    pd.read_csv('..\\..\\2023_Second_analysis\\output\\text_match_results.csv')
    .dropna(how='all')
    .astype({'Year': 'Int64'})
)

In [10]:
# Count the non-empty matches of each kind per year.
rows_by_year = person_identity_first.groupby(['Year'])
person_count = rows_by_year['Person-first'].count()
identity_count = rows_by_year['Identity-first'].count()


In [11]:
# Put the two yearly counts side by side, one column per structure.
person_identity_count = pd.concat([person_count, identity_count], axis='columns')


In [None]:
# Line chart of both yearly counts.
person_identity_count.plot.line()
plt.show()

In [13]:
# How often each distinct matched phrase occurs, for each of the two structures.
person_examples, identity_examples = (
    person_identity_first.groupby([label])[label].count()
    for label in ('Person-first', 'Identity-first')
)


In [14]:
# One table indexed by phrase, with a count column per structure
# (NaN where a phrase never occurs in that structure).
person_identity_examples = pd.concat([person_examples, identity_examples], axis='columns')


In [17]:
# Most frequent person-first phrases at the top.
person_identity_examples.sort_values('Person-first', ascending=False)

Unnamed: 0,Person-first,Identity-first
patient with autism,36.0,
patient with ASD,22.0,
child with ASD,13.0,
child with autism,12.0,
individual with autism,12.0,
...,...,...
autistic individual,,12.0
autistic patient,,23.0
autistic population,,6.0
autistic proband,,3.0


In [18]:
# Most frequent identity-first phrases at the top.
person_identity_examples.sort_values('Identity-first', ascending=False)

Unnamed: 0,Person-first,Identity-first
ASD patient,,31.0
autistic patient,,23.0
autistic child,,14.0
autistic individual,,12.0
ASD case,,11.0
...,...,...
subject with autism,2.0,
syndrome with autism,1.0,
trio with ASD,1.0,
uncle with autism,1.0,


In [19]:
# Number of distinct phrases found for each structure (non-missing entries per column).
person_identity_examples.count()

Person-first      40
Identity-first    31
dtype: int64

In [30]:
# Rows that contain a person-first match.
has_person = person_identity_first.dropna(subset=['Person-first'])
has_person.shape[0]


156

In [33]:
# Rows that contain an identity-first match.
has_identity = person_identity_first.dropna(subset=['Identity-first'])
has_identity.shape[0]


163

## Count abstracts by the structures they use

In [37]:
# Per abstract title, count the matches of each structure and show both counts side by side.
rows_by_title = person_identity_first.groupby(['Title'])
person_by_title = rows_by_title['Person-first'].count()
identity_by_title = rows_by_title['Identity-first'].count()
title = pd.concat([person_by_title, identity_by_title], axis='columns')
title

Unnamed: 0_level_0,Person-first,Identity-first
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
16p11.2 microdeletion and microduplication in two Lithuanian patients with speech delay,0,1
A 3 year old patient with autism and microdeletion in the KIAA0442 (AUTS2 ) gene,1,0
A c GH Array study in non syndromic (primary) autism disorder,1,1
A clinical study of patients with pericentromeric deletion and duplication within 16p11.2 p12.2,3,0
"A cross disorder dosage sensitivity map of the human genome Ryan Lewis Collins 1, Joseph T",1,0
...,...,...
cADHERiN 11 as a possible candidate gene for autism,1,0
common variants in cadherin 10 gene show association with autism spectrum disorders in Finnish population,1,1
complex genomic structure underlying an interrupted microdeletion in 16p11.2 p12.1 with breakpoints mapping to non homologous LcRs,1,0
multiple minor congenital defects associated with autism spectrum disorders,1,2


In [38]:
# Abstracts with the most identity-first matches at the top.
title.sort_values('Identity-first', ascending=False)

Unnamed: 0_level_0,Person-first,Identity-first
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
PON1 phenotype and gene polymorphisms in children with autism,0,5
Role of serotonin transporter promoter length polymorphism in autism: A south African population based study,0,5
Polymorphism in Serotonin Transporter Gene in Autism,1,5
"B Contribution of compound heterozygous CACNA1H mutations in autism spectrum disorder susceptibilityAbstracts from the 54thEuropean Society of Human Genetics (ESHG) Conference 266 European Journal of Human Genetics (2022) 30:88 â 608Marta Viggiano1, Cinzia Cameli1, Annio Posar2,3, Maria C",0,4
C Rare variants targeting genes that encode forcytochrome P450 enzymes in Autism Spectrum Disorder,0,4
...,...,...
"D Mitochondrial DNA in ï¬uences the susceptibility to Autism Spectrum Disorders and the severity of the clinicalphenotype Leonardo Caporali 1, Claudio Fiorini1,2, Flavia Palombo1, Flavia Baccari3, Martina Romagnoli1, Paola Visconti4, Annio Posar4, Maria Cristina Scaduto4, Elena Maestrini5, Cinzia Cameli5, Marta Viggiano5, Anna Olivieri6, Antonio Torroni6, Elena Bacchelli5, Magali Rochat7, Valerio Carelli1,2,Alessandra Maresca1 1IRCCS Istituto delle Scienze Neurologiche di Bologna, Programma di Neurogenetica, Bologna, Italy,2Department of Biomedical and Neuromotor Sciences, University of Bologna, Bologna, Italy,3IRCCS Istituto delle Scienze Neurologiche di Bologna, UOSI Epidemiologia eStatistica, Bologna, Italy, Bologna, Italy,4IRCCS Istituto delle Scienze Neurologiche di Bologna, UOSI Disturbi dello Spettro Autistico, Bologna, Italy, Bologna, Italy,5Department of Pharmacy and Biotechnology, University of Bologna, Italy, Bologna, Italy,6Depart ment of Biology and Biotechnology ""L",1,0
Genetic alterations of postsynaptic NMDA receptor related complex are associated with autism spectrum disorder,1,0
Genetic causes of pervasive developmental disorders,1,0
Genetic counselling in idiopathic autism: parental knowledge and perspectivesM,1,0


In [39]:
# Abstracts with the most person-first matches at the top.
title.sort_values('Person-first', ascending=False)

Unnamed: 0_level_0,Person-first,Identity-first
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Prevalence of PTEN mutations in Turkish children with autism spectrum disorders and macrocephaly,7,0
D Implicating genetic risk variants for circadian rhythm and sleep trait dif ï¬culties in individuals with autism spectrum disorder,5,1
M MEF2C haploinsufficiency is a recurrent finding in patients with autism spectrum disorders,5,0
c hildren autism and mitochondrial DNA mutations,4,0
Autism s pectrum Disorders and s eizure s yndrome,4,1
...,...,...
C New candidate genes in autism spectrum disorder,0,1
Increased frequency of the autism broader phenotype in mothers transmitting etiological CNVs to sons affected by Autism Spectrum Disorder (ASD),0,3
B Evidence for altered calcium signaling and altered mitochondrial function in an autism case study,0,3
"B Contribution of compound heterozygous CACNA1H mutations in autism spectrum disorder susceptibilityAbstracts from the 54thEuropean Society of Human Genetics (ESHG) Conference 266 European Journal of Human Genetics (2022) 30:88 â 608Marta Viggiano1, Cinzia Cameli1, Annio Posar2,3, Maria C",0,4


In [57]:
# Boolean mask of the abstracts that use BOTH structures at least once,
# followed by how many such abstracts there are.
columns = ['Person-first', 'Identity-first']
filter_ = title[columns].gt(0).all(axis='columns')
int(filter_.sum())


32

In [58]:
# Abstracts using both structures, ordered by their person-first count.
title.loc[filter_].sort_values('Person-first', ascending=False)

Unnamed: 0_level_0,Person-first,Identity-first
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
D Implicating genetic risk variants for circadian rhythm and sleep trait dif ï¬culties in individuals with autism spectrum disorder,5,1
Autism s pectrum Disorders and s eizure s yndrome,4,1
B MLPA analysis as a diagnostic test in patients withautism spectrum disorders,3,2
Cytogenetic analysis in autistic disorder,2,2
A NCAM2 deletion in a patient with autism,2,1
Two cases with different microaberrations of the long arm of chromosome 15 and autism,2,1
Microrrearrangements of human chromosome 15q11 q13 in families with autistic disorder,1,3
Contribution of chromosomal aberrations in mosaicism to Autism Spectrum Disorders,1,1
Custom designed CGH array in autism spectrum disorders,1,1
Frequent 22q11 aberrations in patients with non syndromic autism spectrum disorders shown by sNP array based segmental aneuploidy screening,1,2


In [59]:
# Abstracts using both structures, ordered by their identity-first count.
title.loc[filter_].sort_values('Identity-first', ascending=False)

Unnamed: 0_level_0,Person-first,Identity-first
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Polymorphism in Serotonin Transporter Gene in Autism,1,5
Whole genome sequencing reveals the mutation characteristics in Autism Spectrum Disorder,1,3
Microrrearrangements of human chromosome 15q11 q13 in families with autistic disorder,1,3
No evidence for an association with the serotonin transporter gene polymorphisms (5 HTTVNTR and 5 HTTLPR) and autism,1,3
Pharmacogenetic Study of Second Generation Antipsychotic Therapy in Autism Spectrum Disorders,1,2
Frequent 22q11 aberrations in patients with non syndromic autism spectrum disorders shown by sNP array based segmental aneuploidy screening,1,2
C Genetic determinants for social skill training outcomesin autism spectrum disorder,1,2
Abnormal growth and dysmorphic features in children with autism spectrum disorders,1,2
Cytogenetic analysis in autistic disorder,2,2
Contribution of rare and common variants of the PTCHD1 gene to Autism Spectrum Disorder and Intellectual Disability,1,2


## Word counts by part of speech


In [None]:
# Collect [text, lemma, POS, tag] for every token whose text contains one of the keywords.
# NOTE(review): `p_i_doc` is not defined in the visible part of the notebook; it is
# presumably a parsed spaCy document created elsewhere. Confirm before running.
keyword_fragments = ['autistic', 'Autistic', 'autism', 'Autism', 'ASD', 'asd', 'Asperger', 'asperger']
POS_p_i = [
    [token.text, token.lemma_, token.pos_, token.tag_]
    for token in p_i_doc
    if any(fragment in token.text for fragment in keyword_fragments)
]

In [None]:
# Save the keyword tokens as a CSV with one column per field: text, lemma, POS, tag.
# newline='' is what the csv module requires for files it writes; without it every
# row is followed by an extra blank line on Windows.
with open('..\\counts\\ESHG\\POS.csv', "w", encoding='utf8', newline='') as outfile:
    write = csv.writer(outfile)
    # Each entry is already a list of four fields, so it is written as one row as-is.
    # Wrapping it in another list would put the whole list's repr into a single cell.
    write.writerows(POS_p_i)

In [None]:
# Normalise the person-first tokens: lowercase them, strip punctuation characters,
# then drop the entries that ended up as empty strings.
# NOTE(review): `person_first` is not defined in the visible part of the notebook;
# confirm where it comes from before running.
p_f_lower = [token.lower() for token in person_first]
p_f_no_punct = [token.translate(table_punctuation) for token in p_f_lower]
p_f_no_space = [token for token in p_f_no_punct if token]

In [None]:
# Template for saving output: create the target folder if it is missing, then write the frame to CSV.
# NOTE(review): `df` is not defined in the visible part of the notebook and
# 'folder/subfolder' looks like a placeholder path. This appears to be a leftover
# snippet; confirm or replace both before running.
os.makedirs('folder/subfolder', exist_ok=True)  
df.to_csv('folder/subfolder/out.csv') 