# Import and get ready

In [1]:
!pip install autocorrect          
from autocorrect import Speller   # things we need for spell checking
check = Speller(lang='en')

import csv                        # csv is for importing and working with csv files

import datetime
date = datetime.date.today()

import nltk                       # get nltk 
from nltk import word_tokenize    # and some of its key functions
from nltk import sent_tokenize    
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.corpus import wordnet                    # Finally, things we need for lemmatising!
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
nltk.download('averaged_perceptron_tagger')        # Like a POS-tagger...
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('webtext')

import numpy as np
import os                         # os is a module for navigating your machine (e.g., file directories).
import pandas as pd
import statistics
import re                         # things we need for RegEx corrections

English_punctuation = "-!\"#$%&()'*+,./:;<=>?@[\]^_`{|}~''“”"      # Things for removing punctuation, stopwords and empty strings
table_punctuation = str.maketrans('','', English_punctuation)  

print("Succesfully imported necessary modules")    # The print statement is just a bit of encouragement!



[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mzyssjkc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mzyssjkc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mzyssjkc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mzyssjkc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\mzyssjkc\AppData\Roaming\nltk_data...
[nltk_data]   Package webtext is already up-to-date!


Succesfully imported necessary modules


In [None]:
print(os.listdir("..\\for_analysis\\ESHG")) # This is how to see the contents of any folders shown in the last contents check
print(os.listdir("..\\counts\\ESHG")) # This is how to see the contents of any folders shown in the last contents check


# Read in and check useful things

In [None]:
totals_df = pd.read_csv('..\\counts\\ESHG\\totals.csv', header=None)
totals_df.columns = ["filename", "abstracts"]
totals_df

In [74]:
def toascii():
    with open(r'C:\log.convert', 'r', encoding='utf8') as origfile, open(r'C:\log.toascii', 'w', encoding='ascii') as convertfile:
        for line in origfile:
            line = unidecode(line)
            convertfile.write(line)

In [6]:
raw_select_abstracts = []

with open('..\\counts\\ESHG\\select.csv', newline='', encoding = "ISO-8859-1") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        raw_select_abstracts.append(row)
        
select_abstracts = (list(filter(lambda x: x, raw_select_abstracts)))
    

In [None]:
select_df = pd.DataFrame.from_records(select_abstracts)
total_select = select_df.iloc[:,0].value_counts()


In [None]:
autism_count = select_df.iloc[:,0].value_counts().rename_axis('unique_values').to_frame('counts')

autism_count = autism_count.reset_index()
autism_count.columns = ['filename', 'autism_abstracts']

autism_count

In [None]:
merged_counts = pd.merge(totals_df, autism_count, on = 'filename')


In [None]:
merged_counts


In [None]:
merged_counts['year'] = merged_counts['filename'].str.extract(r'(\d{4})')
merged_counts


In [None]:
print(merged_counts['abstracts'].sum())
print(merged_counts['autism_abstracts'].sum())

In [None]:
print(merged_counts[['year', 'abstracts']].groupby('year').sum('abstracts'))
#print(merged_counts['autism_abstracts'].sum())

# Working with the contents of the selected abstracts

##  Dump the content of the selected abtsracts into one big string

In [7]:
bag_of_abstracts = ""

for abstract in (select_abstracts):
    bag_of_abstracts += abstract[3]
        
type(bag_of_abstracts)

str

In [8]:
bag_of_abstracts[:10]

'P08 .37 Ho'

## tokenize that big string, and transform the tokens into lowercase

In [9]:
abstract_token_word = word_tokenize(bag_of_abstracts)
abstract_token_word_lower = [word.lower() for word in abstract_token_word]

print(abstract_token_word_lower[:10]) 

['p08', '.37', 'hospital', 'â\x80\x93', 'osi', 'bilbao-basurto', 'â\x80\x93', 'osakidetza', ',', 'bilbao']


## Remove the punctuation from the lowercased tokens

In [15]:
abstract_t_w_l_np = [w.translate(table_punctuation) for w in abstract_token_word_lower]  
                                                               # Iterate over corpus_words, turning punctuation to nothing.
print(abstract_t_w_l_np[:10])       

['p08', '37', 'hospital', 'â\x80\x93', 'osi', 'bilbaobasurto', 'â\x80\x93', 'osakidetza', '', 'bilbao']


In [14]:
for token in abstract_token_word_lower:
    token = token.replace('â\x80\x93', 'â')
    
    
print(abstract_token_word_lower[:10]) 

['p08', '.37', 'hospital', 'â\x80\x93', 'osi', 'bilbao-basurto', 'â\x80\x93', 'osakidetza', ',', 'bilbao']


In [16]:
abstract_t_w_l_np = (list(filter(lambda x: x, abstract_t_w_l_np)))
print(abstract_t_w_l_np[:10])

['p08', '37', 'hospital', 'â\x80\x93', 'osi', 'bilbaobasurto', 'â\x80\x93', 'osakidetza', 'bilbao', 'spain']


## Remove stopwords

In [None]:
print(sorted(stop_words))        # just an option to check what counts as a stopword if you want to see

In [21]:
abstract_t_w_l_np_nsw = []

for word in abstract_t_w_l_np:
    if word not in stop_words:
        abstract_t_w_l_np_nsw.append(word)
        
        
print(abstract_t_w_l_np_nsw[:10])

['p08', '37', 'hospital', 'â\x80\x93', 'osi', 'bilbaobasurto', 'â\x80\x93', 'osakidetza', 'bilbao', 'spain']


# Remove weird things

In [72]:
test = ['p08', '37', 'hospital', 'â\x80\x93', 'osi', 'bilbaobasurto', 'â\x80\x93', 'osakidetza',]
clean_test =[]
import unicodedata

def remove_accents(input_str, output_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    print(''.join(c if c <= '\xff' else unidecode(c) for c in nfkd_form))

In [66]:
test = ['p08', '37', 'hospital', 'â\x80\x93', 'osi', 'bilbaobasurto', 'â\x80\x93', 'osakidetza',]
clean_test =[]
import unicodedata

def remove_accents(input_str, output_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    print( u"".join([c for c in nfkd_form if not unicodedata.combining(c)]))

''.join(c if c <= '\xff' else unidecode(c) for c in string)

In [73]:
for item in test:
    remove_accents(item, clean_test)
    


p08
37
hospital


NameError: name 'unidecode' is not defined

## Spell check the lowercased "real word" tokens

In [22]:
len(abstract_t_w_l_np_nsw)

362901

['p08',
 '37',
 'hospital',
 'â\x80\x93',
 'osi',
 'bilbaobasurto',
 'â\x80\x93',
 'osakidetza',
 'biblio',
 'spain',
 'recurrent',
 'de',
 'novo',
 'mutation',
 'second',
 'variant',
 '3genetics',
 'unit',
 'basurto',
 'university',
 'hospital',
 'â\x80\x93',
 'osakidetza',
 'unknown',
 'signï¬\x81dance',
 'swim6',
 'boy',
 'severe',
 'biblio',
 'spain',
 '4neuropediatrics',
 'unit',
 'basurto',
 'university',
 'intellectual',
 'disability',
 'microcephaly',
 'strabism',
 'hospital',
 'â\x80\x93',
 'osakidetza',
 'biblio',
 'spain',
 '5molecular',
 'genetics',
 'hyperopia',
 'laboratory',
 'genetics',
 'service',
 'biocruces',
 'health',
 'research',
 'institute',
 'crucesuniversityhospitalâ\x80\x93osakidetza',
 'barakaldo',
 'leader1',
 'gamer2',
 'praised1',
 'spell1',
 'may1',
 'spain',
 '6molecular',
 'epi',
 'genetics',
 'laboratory',
 'bioaraba',
 'national',
 'health',
 'institute',
 'arab',
 'university',
 'hospital',
 'â\x80\x93',
 '1division',
 'clinical',
 'genetics',
 'dep

In [24]:
# runs forever without finishing. Find alternative or skip it?
spell = ['wibble', 'wobble', 'nope', 'shite', 'test']
spell_test = []

for word in spell:
    spell_test.append(check(word))    

print(spell_test[:10])

['viable', 'gobble', 'note', 'white', 'test']


In [23]:
# runs forever without finishing. Find alternative or skip it?

abstract_t_w_l_np_cs = []

for word in abstract_t_w_l_np_nsw:
    abstract_t_w_l_np_cs.append(check(word))    

print(abstract_t_w_l_np_cs[:10])

KeyboardInterrupt: 

In [18]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
abstract_t_w_l_np_nsw_stem = [porter.stem(word) for word in abstract_t_w_l_np_nsw]
print(abstract_t_w_l_np_nsw_stem[:10])

['p08', '37', 'hospit', 'â\x80\x93', 'osi', 'bilbaobasurto', 'â\x80\x93', 'osakidetza', 'bilbao', 'spain']


In [19]:
from collections import Counter
counts = Counter(abstract_t_w_l_np_nsw_stem)
print(type(counts))

<class 'collections.Counter'>


In [20]:
print(counts.most_common(100))

[('gene', 3536), ('none', 3535), ('patient', 3517), ('genet', 3106), ('de', 2423), ('univers', 2211), ('disord', 2200), ('autism', 2057), ('mutat', 1878), ('asd', 1737), ('clinic', 1705), ('studi', 1582), ('syndrom', 1559), ('delet', 1557), ('associ', 1535), ('variant', 1439), ('case', 1427), ('chromosom', 1398), ('hospit', 1252), ('use', 1240), ('medic', 1236), ('phenotyp', 1227), ('analysi', 1171), ('result', 1163), ('report', 1160), ('unit', 1126), ('c', 1116), ('famili', 1099), ('j', 1091), ('e', 1034), ('genom', 1025), ('two', 983), ('franc', 978), ('spectrum', 973), ('includ', 958), ('region', 953), ('p', 952), ('present', 947), ('sequenc', 934), ('diseas', 930), ('featur', 921), ('cnv', 906), ('identifi', 882), ('caus', 872), ('sever', 854), ('show', 804), ('delay', 794), ('disabl', 784), ('human', 770), ('itali', 761), ('institut', 756), ('one', 752), ('l', 733), ('medicin', 725), ('2', 720), ('r', 714), ('intellectu', 713), ('molecular', 707), ('duplic', 694), ('cell', 687), (

## Person-first and identity first

In [None]:
!pip install spacy
import re 
import string 
import nltk 
import spacy 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 

pd.set_option('display.max_colwidth', 200)

In [None]:
!pip install spacy -q
import spacy
!python -m spacy download en_core_web_lg -q
from nltk.corpus import webtext

nlp = spacy.load('en_core_web_lg')

In [None]:
person_identity = nltk.sent_tokenize(bag_of_abstracts)


In [None]:
type(person_identity)

In [None]:
person =""

for sentence in person_identity:
    if any(s in sentence for s in ['autisic', 'Autisic', 'autism', 'Autism']):
        person += sentence


In [None]:
len(person)

In [None]:
for tok in person_doc: 
  print(tok.text, "-->",tok.dep_,"-->", tok.pos_)

In [None]:
pattern_1 = [{"TEXT": {"REGEX": "^[Aa]utistic$"}},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {"POS": "NOUN"}]

In [None]:
# Matcher class object 
matcher = Matcher(nlp.vocab) 
matcher.add("matching_1", [pattern_1]) 

autistic =[]

matches = matcher(person_doc) 
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = person_doc[start:end]  # The matched span
    autistic.append(span.text)

In [None]:
autistic_no_dups = list(set(autistic))
with open('..\\counts\\ESHG\\autism.csv', "w", encoding='ISO-8859-1') as outfile:
        write = csv.writer(outfile)
        for item in autistic_no_dups:
            write.writerow([item])



In [None]:
for item in autistic_no_dups:
    print(item)

In [None]:
pattern_2 = [{"POS": "NOUN"},
             {'LOWER': 'with'},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {"TEXT": {"REGEX": "^[Aa]utism$"}}]

In [None]:
# Matcher class object 
matcher = Matcher(nlp.vocab) 
matcher.add("matching_1", [pattern_2]) 

person_with =[]
matches = matcher(person_doc) 
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = person_doc[start:end]  # The matched span
    person_with.append(span.text)

In [None]:
person_with_no_dups = list(set(person_with))
with open('..\\counts\\ESHG\\person_with.csv', "w", encoding='ISO-8859-1') as outfile:
        write = csv.writer(outfile)
        for item in person_with_no_dups:
            write.writerow([item])

