# Get ready

First, install and import the required packages and set up the NLP tools. 

Then, check the file location and import the .csv files. Remove any with empty text fields. 

Save a data frame with all the texts and another with only those texts that mention the keywords of interest. 

In [1]:
%%capture

# installing necessary pdf conversion packages via pip
# the '%%capture' at the top of this cell suppresses the output (which is normally quite long and annoying looking). 
# You can remove or comment it out if you prefer to see the output. 
!pip install nltk
!pip install spacy -q
!python -m spacy download en_core_web_lg -q


In [2]:
%%capture

import os                         # os is a module for navigating your machine (e.g., file directories).
import nltk                       # nltk stands for natural language tool kit and is useful for text-mining. 
from nltk import word_tokenize    # and some of its key functions
from nltk import sent_tokenize  
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.corpus import wordnet                    # Finally, things we need for lemmatising!
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
nltk.download('averaged_perceptron_tagger')        # Like a POS-tagger...
nltk.download('wordnet')
nltk.download('webtext')
from nltk.corpus import webtext

import pandas as pd
pd.set_option('display.max_colwidth', 200)
import numpy as np
import statistics
import datetime
date = datetime.date.today()

import codecs
import csv                        # csv is for importing and working with csv files

from collections import Counter

import statistics
import re                         # things we need for RegEx corrections
import matplotlib.pyplot as plt
import string 
import spacy 
from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 
nlp = spacy.load('en_core_web_lg')
nlp.max_length = 1500000 #or any large value, as long as you don't run out of RAM

import math 

# Characters to delete when stripping punctuation from tokens.
# The backslash is written as "\\": the earlier "\]" was an invalid escape sequence
# (a warning on recent Python versions); the resulting string is unchanged.
English_punctuation = "-!\"#$%&()'*-–+,./:;<=>?@[\\]^_`{|}~''“”"
# Translation table for str.translate() that removes every character listed above.
table_punctuation = str.maketrans('', '', English_punctuation)

In [3]:
# Sanity check before importing: list the 'results' folder to confirm it is not empty and holds the expected files.
# NOTE(review): the path uses Windows separators and is relative to the current working directory.
print(os.listdir("..\\results")  )                                # check 'results' folder is not empty/has correct stuff

files = []   # legacy module-level list; kept so existing references do not break, but import_results no longer fills it
def import_results(input):
    """Read every CSV file in a folder and return them stacked in one DataFrame.

    Parameters
    ----------
    input : str
        Path of the folder to read. (The name shadows the built-in ``input``;
        it is kept so existing keyword calls keep working.)

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, in sorted file-name order.
        Each file keeps its own row labels, so index values repeat across files.

    Raises
    ------
    ValueError
        If the folder contains no ``.csv`` files.
    """
    # Collect the frames in a local list: appending to a module-level list made
    # every repeat call return the rows from all earlier calls as well.
    frames = []
    for name in sorted(os.listdir(input)):
        path = os.path.join(input, name)      # portable; a hard-coded '\\' separator only works on Windows
        # Skip sub-folders (e.g. .ipynb_checkpoints) and anything that is not a CSV file.
        if not (os.path.isfile(path) and name.lower().endswith('.csv')):
            continue
        frames.append(pd.read_csv(path, encoding='latin1'))
    if not frames:
        raise ValueError(f"No .csv files found in {input!r}")
    return pd.concat(frames)

['ESHG2001abstractICHG.csv', 'ESHG2002Abstracts.csv', 'ESHG2003Abstracts.csv', 'ESHG2004.csv', 'ESHG2005Abstracts.csv', 'ESHG2006Abstracts.csv', 'ESHG2007Abstracts.csv', 'ESHG2008Abstracts.csv', 'ESHG2009Abstracts.csv', 'ESHG2010Abstracts.csv', 'ESHG2011Abstracts.csv', 'ESHG2012Abstracts.csv', 'ESHG2013Abstracts.csv', 'ESHG2014Abstracts.csv', 'ESHG2015Abstracts.csv', 'ESHG2016Abstracts.csv', 'ESHG2017 electronic posters.csv', 'ESHG2017 oral presentation.csv', 'ESHG2018 electronic posters.csv', 'ESHG2018 oral presentation.csv', 'ESHG2019 electronic posters.csv', 'ESHG2019 oral presentation.csv', 'ESHG2020 electronic posters.csv', 'ESHG2020 oral presentation.csv', 'ESHG2021 electronic posters.csv', 'ESHG2021 oral presentation.csv']


In [4]:
# Load every file in the 'results' folder into one combined data frame,
# then show how many rows were read.
all_results = import_results("..\\results")
all_results.shape[0]

34630

In [5]:
# Print the first occurrence of each distinct 'Year' value.
# A NaN entry shows that one file (2004, a known problem file) did not import properly.
print(all_results.loc[:, 'Year'].drop_duplicates())

0    2001.0
0    2002.0
0    2003.0
0       NaN
0    2005.0
0    2006.0
0    2007.0
0    2008.0
0    2009.0
0    2010.0
0    2011.0
0    2012.0
0    2013.0
0    2014.0
0    2015.0
0    2016.0
0    2017.0
0    2018.0
0    2019.0
0    2020.0
0    2021.0
Name: Year, dtype: float64


In [6]:
# Count the rows whose 'Year' is NaN instead of a year.
print(all_results['Year'].isnull().sum())

2205


In [7]:
# Keep only the rows that have a 'Year' value, then check the row count
# (it should equal the total minus the NaN count above).
no_Nan_in_Year = all_results.loc[all_results['Year'].notna()]
no_Nan_in_Year.shape[0]

32425

In [8]:
# Read the 2004 file on its own (default encoding here; import_results reads with latin1),
# keep only its first two columns by position, then display the result.
# NOTE(review): presumably those two columns are 'Year' and 'Text', as the displayed output suggests — confirm.
year_04 = pd.read_csv('..\\results\\ESHG2004.csv').iloc[:, [0, 1]]
year_04

Unnamed: 0,Year,Text
0,2004,"L01Multiple Sulfatase Defi ciency: Molecular defect and properties of the autosomal forms of epigenetic mosaicism can be caused by missing enzyme. retrotransposon activity. K. von Figura, M. Maria..."
1,2004,L04Regional differences in genetic testing and counselling in Europe - An overview
2,2004,"L02Biogenesis of mitochondria: Human diseases linked to S. Aymé protein transport, folding and degradation INSERM, Paris, France. W. Neupert"
3,2004,L05Hereditary Breast/Ovarian Cancer risk: international energy present in oxidizable substrates is transduced into energy comparison of the acceptability of Preventive strategies stored in ATP. Mi...
4,2004,L06Variation in prenatal counselling in Europe: the example of highly motile within the cell. Quite a number of genes are involved Klinefelter in these processes which are closely linked to the in...
...,...,...
2200,2004,"C7 A10 in the aetiology of cystinuria, we could not identify any Affected children may have only one episode of illness or multiple mutation in SL"
2201,2004,"C7 A10 in the two families. Nevertheless, there remains recurrences. A common mutation (985A >G) has been identiÜed the possibility that other genes are involved in cystinuria. Further among pa..."
2202,2004,"P0845Inactivation of the spasmolytic trefoil peptide (Tff2) leads In this study, two unrelated MCAD patients, compound heterozygous to increased expression of additional gastroprotective factors..."
2203,2004,"P0843MCDR1 Locus - Screening for candidate genes. functional disturbance in stomach and gut, Tff2-/- constructs do N. Udar1, M. Chalukya1, R. Silva-Garcia1, J. Yeh1, P. Wong1,2, K. Small1 not di..."


In [9]:
# Stack the separately-read 2004 rows under the rows that already had a year.
# Row labels are not reset, so index values repeat across the two frames.
all_results_corrected = pd.concat([no_Nan_in_Year, year_04], axis=0)
all_results_corrected.shape[0]      # should be back to the original row count

34630

In [10]:
# Drop every row whose 'Text' field is missing, then re-check the row count.
no_null_texts = all_results_corrected.dropna(subset=['Text'])
no_null_texts.shape[0]

33979

In [11]:
# Keep only the abstracts whose text contains at least one keyword of interest
# (the pattern is a case-sensitive regular expression), then count them.
matched_texts = no_null_texts.loc[
    no_null_texts['Text'].str.contains('autis|Autis|ASD|Asperger|asperger')
]
matched_texts.shape[0]

906

In [12]:
# Drop columns that hold nothing but NaN; rows are untouched, so the printed count should not change.
no_nans_matched_texts = matched_texts.dropna(axis='columns', how="all")
print(no_nans_matched_texts.shape[0])
no_nans_matched_texts               # display the frame to see which columns remain

906


Unnamed: 0,Title,Session_Code,Authors_and_Affiliations,Text,Year,Email,Author,Affiliations
59,Genetic defects in sterol metabolism,S60.,F. MoebiusInstitute for Biochemical Pharmacology University of Innsbruck Innsbruck AustriaFabian.M,Genetic defects in sterol metabolizing enzymes have recently emerged asimportant causes of dysmorphogenetic syndromes. They affect enzymesrequired for the removal of methyl groups at C4(NSHDL) th...,2001.0,Fabian.Moebius@uibk.ac.at,,
150,Location of the first predisposing gene locus for Asperger syndrome on chromosome 1q21 22,C102.,E. Jarvela1 T. Ylisaukko oja2 T. Nieminen3 E. Kempas1 M. Auranen1 L. Peltonen1 1National Public Health Institute Helsinki Finland 2National Public Health Insitute Helsinki Finland 3Uni...,Asperger syndrome (AS) was first described in 1944 by a Viennese physi cian Hans Asperger who reported a group of boys with autistic psychopa thy whose clinical features resembled autism with som...,2001.0,irma.jarvela@hus.fi,,
334,De novo der(5) identified as an interstitial insertion of chromosome 3 material by COBRA multi colour FISH,P0220.,Engels1 M. KreiÃÂ§ Nachtsheim1 A. Ehrbrecht1 S. Zahn1 R. Schubert1 G. Schwanitz1 C. Ergang1 M. van der Burg2 D. Hansmann3 A. K. Raap2 H. J. Tanke2 J. Wiegant2 1Institute of Human Gene...,bonn.deA 5 year old boy presented with ASD II bilateral cleft palate strabism bilat eral optic nerve coloboma sensorineural hearing loss bilateral inguinalherniae micropenis seizures and a...,2001.0,hengels@meb.uni,,
378,Duplication (4)(q31.1qter) in a newborn with suspi cious clinical diagnosis of Nijmegen breakage syndrome,P0267.,Cernakova1 M. Kvasnicova2 Z. Lovasova2 N. Badova3 E. Seemano va4 K. Spackova5 1Lab. experimental medicine Olomouc Czech Republic 2Dept. Clin. Genetics Banska Bystrica Slovakia 3Dept. ...,Nijmegen breakage syndrome (NBS) is a rare autosomal recessive condi tion characterised by progressive microcephaly early retardation ofgrowth chromosomal instability hypersensitivity to ionisi...,2001.0,iveta.cernakova@hotmail.com,,
385,A case of pure partial trisomy of 5q34 qter associated with asthma allergies and hyper IgE.,P0277.,Demczuk1 B. Dion1 P. Lepage2 B. R. Hadad3 T. Hudson2 V. M. Der Kaloustian4 1Montreal Children s Hospital Research Institute Montreal PQ Canada 2Genome Center Montreal General Hospital M...,Unbalanced chromosomal anomalies can lend information on the positionof genes and on the understanding of pathologic mechanisms of commondiseases. Thus one locus for asthma and allergic disease ...,2001.0,suzanne.demczuk@muhc.mcgill.ca,,
...,...,...,...,...,...,...,...,...
629,,,,P0255Molecular cytogenetic mapping of the breakpoints of the unable to walk or to stand without help. During the Ürst 2 years of constitutional pericentric inversion inv(10)(p11.2q21.2) life rec...,2004.0,,,
723,,,,P0303Partial trisomy 22q11 and tetrasomy22q13 resulting by generic ampliÜcation of BAC miniprep DNA. The clone DNAÓs from complex rearrangement of chromosome 22 in a child with and various cont...,2004.0,,,
1644,,,,P0681No mutation in the LMNA gene in four patients with Glaucomas are a clinically and genetically heterogeneous group of Hallermann-Streiff syndrome optic neuropathies resulting in optic nerve ...,2004.0,,,
2157,,,,"P0835Presence of elevated lactate, lactate/pyruvate ratio and Analysis of the patientsÓclinical and molecular data demonstrated that acylcarnitine proÜle in patients with autism. all Üve patient...",2004.0,,,


# Sentence tokenisation

In [None]:
# Break every abstract into sentences: one list of sentence strings per row of 'Text'.
sentences = list(map(sent_tokenize, no_nans_matched_texts['Text']))
# Store the lists in a new 'Sentence' column (this writes into no_nans_matched_texts itself).
no_nans_matched_texts['Sentence'] = sentences
# One row per sentence; the other columns, including the full 'Text', are repeated on each row.
sentence_per_row = no_nans_matched_texts.explode(column='Sentence')
sentence_per_row.shape[0]


In [None]:
sentence_per_row                                                    # have a look. For the first two rows, 
                                                                    # 'Text' should be same, but 'Sentence' should not

In [None]:
matched_sentences = sentence_per_row[sentence_per_row['Sentence'].str.contains('autis|Autis|ASD|Asperger|asperger')]
                                                                            # keep only those sentences that contain the keywords
len(matched_sentences)

In [None]:
matched_sentences = matched_sentences.drop_duplicates()                         # drop any duplicates
len(matched_sentences)                                                          # check length of remaining data frame

## Person-first pattern

In [None]:
def _person_first_pattern(keyword_regex):
    """Build one person-first token pattern.

    The pattern is: a NOUN, the word 'with' (any case), up to three optional
    'amod' tokens, then a token whose text matches ``keyword_regex``.
    """
    return [{"POS": "NOUN"},
            {'LOWER': 'with'},
            {'DEP':'amod', 'OP':"?"},
            {'DEP':'amod', 'OP':"?"},
            {'DEP':'amod', 'OP':"?"},
            {"TEXT": {"REGEX": keyword_regex}}]

# Three separate patterns (one per keyword) rather than a single, more complex regex.
pattern_2 = _person_first_pattern("^[Aa]utism$")
pattern_3 = _person_first_pattern("^[Aa]sperger$")
pattern_4 = _person_first_pattern("^ASD$")

# Matcher holding the three person-first patterns under one rule name.
matcher = Matcher(nlp.vocab)
matcher.add("matching_1", [pattern_2, pattern_3, pattern_4])

In [None]:
def find_pattern_match(input, pattern_matcher=None):
    """Run a spaCy Matcher over one string and return the matched spans.

    Parameters
    ----------
    input : str
        Text to analyse; it is parsed with the module-level ``nlp`` pipeline.
    pattern_matcher : optional
        Matcher to apply. When omitted, the module-level ``matcher`` as bound
        at call time is used (the original behaviour). Passing it explicitly
        removes the dependence on which cell last defined ``matcher``.

    Returns
    -------
    list or str
        A list with one span per match, or the empty string '' when nothing
        matched (later cells filter on ``!= ''``).
    """
    active_matcher = matcher if pattern_matcher is None else pattern_matcher
    doc = nlp(input)
    matches = active_matcher(doc)
    if not matches:
        return ''
    # Each match is (match_id, start, end); only the token span is kept.
    return [doc[start:end] for _match_id, start, end in matches]

In [None]:
# Run find_pattern_match on every sentence and store the result ('' or a list of spans) in 'Person-first'.
# Order matters: find_pattern_match reads the module-level `matcher`, which at this point holds the
# person-first patterns. A later cell rebinds `matcher`, so re-running this cell after that one would
# store identity-first matches in this column instead.
matched_sentences['Person-first'] = matched_sentences.apply(lambda row: find_pattern_match(row.Sentence), axis = 1)
len(matched_sentences)                                                  # double check length remains same

## Identity-first pattern

In [None]:
def _identity_first_pattern(keyword_regex):
    """Build one identity-first token pattern.

    The pattern is: a token whose text matches ``keyword_regex``, up to three
    optional 'amod' tokens, then a NOUN.
    """
    return [{"TEXT": {"REGEX": keyword_regex}},
            {'DEP':'amod', 'OP':"?"},
            {'DEP':'amod', 'OP':"?"},
            {'DEP':'amod', 'OP':"?"},
            {"POS": "NOUN"}]

# Again three patterns, one per keyword. These regexes are anchored at the start only,
# so they also match longer tokens that begin with the keyword.
pattern_a = _identity_first_pattern("^[Aa]utistic")
pattern_b = _identity_first_pattern("^[Aa]sperger")
pattern_c = _identity_first_pattern("^ASD")

# This rebinds the module-level `matcher`: from here on it holds the identity-first patterns.
matcher = Matcher(nlp.vocab)
matcher.add("matching_2", [pattern_a, pattern_b, pattern_c])

In [None]:
# Same call as for 'Person-first', but find_pattern_match now picks up the module-level `matcher`
# that the previous cell rebound to the identity-first patterns. The result ('' or a list of spans)
# goes into 'Identity-first'. This cell is only correct when run after that rebinding cell.
matched_sentences['Identity-first'] = matched_sentences.apply(lambda row: find_pattern_match(row.Sentence), axis = 1)
len(matched_sentences)                                                  # check the length - why not?

In [None]:
matched_patterns = matched_sentences[(matched_sentences['Person-first'] != '') | (matched_sentences['Identity-first'] != '')]
                                                     # keep only rows w/ non-null 'Person-first' and/or 'Identity-first' columns
len(matched_patterns)                                # check length

In [None]:
matched_patterns = matched_patterns.explode('Person-first')    # explode 'Person-first' column to create 1 row per match
len(matched_patterns)                                          # check the length

In [None]:
matched_patterns = matched_patterns.explode('Identity-first')  # Do the same for 'Identity-first' column
len(matched_patterns)                                          # check the length

In [None]:
matched_patterns

In [None]:
# NOTE(review): `files` and `import_results` are already defined, under the same names, in an earlier
# cell of this notebook, and the function is not called again in the cells that follow.
# Consider deleting this cell.
files = []   # legacy module-level list; kept so existing references do not break, but import_results no longer fills it
def import_results(input):
    """Read every CSV file in a folder and return them stacked in one DataFrame.

    Parameters
    ----------
    input : str
        Path of the folder to read. (The name shadows the built-in ``input``;
        it is kept so existing keyword calls keep working.)

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, in sorted file-name order.
        Each file keeps its own row labels, so index values repeat across files.

    Raises
    ------
    ValueError
        If the folder contains no ``.csv`` files.
    """
    # Collect the frames in a local list: appending to a module-level list made
    # every repeat call return the rows from all earlier calls as well.
    frames = []
    for name in sorted(os.listdir(input)):
        path = os.path.join(input, name)      # portable; a hard-coded '\\' separator only works on Windows
        # Skip sub-folders (e.g. .ipynb_checkpoints) and anything that is not a CSV file.
        if not (os.path.isfile(path) and name.lower().endswith('.csv')):
            continue
        frames.append(pd.read_csv(path, encoding='latin1'))
    if not frames:
        raise ValueError(f"No .csv files found in {input!r}")
    return pd.concat(frames)

In [None]:
Lem = WordNetLemmatizer()
person_lemma_list = []
lemmatized = []
for phrase in matched_patterns['Person-first']:
    x = str(phrase)
    words = x.split()
    for word in words :
        lemword = Lem.lemmatize(word)
        lemmatized.append(lemword)
    person_lemma_list.append(lemmatized)
    lemmatized = []

person_lemma_list


In [None]:
identity_lemma_list = []
lemmatized = []
for phrase in matched_patterns['Identity-first']:
    x = str(phrase)
    words = x.split()
    for word in words :
        lemword = Lem.lemmatize(word)
        lemmatized.append(lemword)
    identity_lemma_list.append(lemmatized)
    lemmatized = []

identity_lemma_list


In [None]:
# Attach the two lemma lists as new columns (assigned by position, one list per row), then display the frame.
matched_patterns['Person-first_lemmatized'] = person_lemma_list
# NOTE(review): 'Identy-first_lemmatized' looks like a misspelling of 'Identity-first_lemmatized'.
# Left as is because the name ends up in any saved output — confirm before renaming.
matched_patterns['Identy-first_lemmatized'] = identity_lemma_list
matched_patterns

In [None]:
# Save the results to CSV.
# NOTE(review): `final` is not defined in any cell shown in this notebook, so this line raises
# NameError on a fresh kernel. Presumably it was meant to be `matched_patterns` — confirm.
final.to_csv('..\\output\\text_match_results.csv') 

In [None]:
# List an output folder to check its contents.
# NOTE(review): this lists '..\\..\\2023_Second_analysis\\output', not the '..\\output' folder
# written to in the previous cell — confirm they are the same location.
print(os.listdir("..\\..\\2023_Second_analysis\\output")  )

In [None]:
# Look at the row(s) labelled 473. This also depends on the undefined `final`.
final.loc[473]


## Chart person-first and identity-first counts by year

In [None]:
# Reload the saved match results, drop rows that are entirely empty,
# and store 'Year' as a nullable integer.
person_identity_first = (
    pd.read_csv('..\\..\\2023_Second_analysis\\output\\text_match_results.csv')
    .dropna(how='all')
    .astype({'Year': 'Int64'})
)

In [None]:
person_count = person_identity_first.groupby(['Year'])['Person-first'].count()
identity_count = person_identity_first.groupby(['Year'])['Identity-first'].count()


In [None]:
person_identity_count=pd.concat([person_count,identity_count],axis=1)


In [None]:
person_identity_count.plot()
plt.show()

In [None]:
person_examples = person_identity_first.groupby(['Person-first'])['Person-first'].count()
identity_examples = person_identity_first.groupby(['Identity-first'])['Identity-first'].count()


In [None]:
person_identity_examples=pd.concat([person_examples,identity_examples],axis=1)


In [None]:
person_identity_examples.sort_values(by=['Person-first'], ascending=False)

In [None]:
person_identity_examples.sort_values(by=['Identity-first'], ascending=False)

In [None]:
person_identity_examples.notnull().sum()

In [None]:
has_person = person_identity_first[~person_identity_first['Person-first'].isnull()]
len(has_person)


In [None]:
has_identity = person_identity_first[~person_identity_first['Identity-first'].isnull()]
len(has_identity)


## Count abstracts by the structures they use

In [None]:
person_by_title = person_identity_first.groupby(['Title'])['Person-first'].count()
identity_by_title = person_identity_first.groupby(['Title'])['Identity-first'].count()
title = pd.concat([person_by_title,identity_by_title],axis=1)
title

In [None]:
title.sort_values(by=['Identity-first'], ascending=False)

In [None]:
title.sort_values(by=['Person-first'], ascending=False)

In [None]:
columns = ['Person-first','Identity-first']
filter_ = (title[columns] > 0).all(axis=1)
title[filter_]
len(title[filter_])


In [None]:
title[filter_].sort_values(by=['Person-first'], ascending=False)

In [None]:
title[filter_].sort_values(by=['Identity-first'], ascending=False)

## Word counts by part of speech


In [None]:
# Collect [text, lemma, coarse POS, fine-grained tag] for every token whose text contains
# one of the keyword strings (a substring test, so longer tokens containing a keyword also qualify).
# NOTE(review): `p_i_doc` is not defined in any cell shown in this notebook — confirm where it comes from.
POS_p_i = [
    [token.text, token.lemma_, token.pos_, token.tag_]
    for token in p_i_doc
    if any(s in token.text for s in ['autistic', 'Autistic', 'autism', 'Autism', 'ASD', 'asd', 'Asperger', 'asperger'])
]

In [None]:
# Write the collected token records to a CSV file.
# newline='' is what the csv module asks for on files handed to a writer;
# without it, extra blank lines appear between rows on Windows.
with open('..\\counts\\ESHG\\POS.csv', "w", encoding='utf8', newline='') as outfile:
    write = csv.writer(outfile)
    for item in POS_p_i:
        # NOTE(review): [item] wraps the whole record in one list, so each row is written as a single
        # field holding the list's text form. writerow(item) would give one column per element —
        # confirm which is intended.
        write.writerow([item])

In [None]:
# NOTE(review): `person_first` is not defined in any cell shown in this notebook — confirm where it comes from.
p_f_lower = [token.lower() for token in person_first]                          # lowercase every token
p_f_no_punct = [token.translate(table_punctuation) for token in p_f_lower]     # strip punctuation characters
p_f_no_space = [token for token in p_f_no_punct if token]                      # drop tokens left as empty strings

In [None]:
# Template for saving output: create the target folder if needed, then write a data frame to CSV.
# NOTE(review): `df` is not defined in any cell shown in this notebook, and 'folder/subfolder' looks
# like a placeholder path — confirm which frame and location are meant before running.
os.makedirs('folder/subfolder', exist_ok=True)  
df.to_csv('folder/subfolder/out.csv') 