# Import and get ready

In [None]:
%%capture

# installing the spell-checking (autocorrect, pyspellchecker) and spaCy packages via pip
# the '%%capture' at the top of this cell suppresses the output (which is normally quite long and annoying looking). 
# You can remove or comment it out if you prefer to see the output. 

!pip install autocorrect        
!pip install pyspellchecker 
!pip install spacy -q
!python -m spacy download en_core_web_lg -q

In [None]:
from autocorrect import Speller   # things we need for spell checking
check = Speller(lang='en')
import codecs
import csv                        # csv is for importing and working with csv files

import nltk                       # get nltk 

# Download the NLTK resources BEFORE anything tries to read them: on a fresh
# environment stopwords.words('english') raises LookupError if the 'stopwords'
# corpus has not been fetched yet.
nltk.download('averaged_perceptron_tagger')        # Like a POS-tagger...
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('webtext')

from nltk import word_tokenize    # and some of its key functions
from nltk import sent_tokenize    
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))       # set of English stopwords used when filtering tokens
from nltk.corpus import wordnet                    # Finally, things we need for lemmatising!
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from collections import Counter

import os                         # os is a module for navigating your machine (e.g., file directories).
import pandas as pd
pd.set_option('display.max_colwidth', 200)

import statistics
import re                         # things we need for RegEx corrections

import string 
import spacy 
import math 
#from tqdm import tqdm 

from nltk.corpus import webtext

# Load the large English spaCy pipeline (the model the install cell downloads with
# `python -m spacy download en_core_web_lg`); `nlp` is later called on the combined text.
nlp = spacy.load('en_core_web_lg')
# Raise the pipeline's max_length setting so one long combined string can be processed.
nlp.max_length = 1500000 #or any large value, as long as you don't run out of RAM

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 

# Characters stripped from tokens and matched phrases. The backslash is written
# as an escaped "\\" so the literal contains no invalid escape sequence
# (a bare "\]" triggers a SyntaxWarning on recent Python versions); the
# resulting string value is unchanged.
English_punctuation = "-!\"#$%&()'*-–+,./:;<=>?@[\\]^_`{|}~''“”"
# Translation table for str.translate(): deletes every character listed above.
table_punctuation = str.maketrans('', '', English_punctuation)

In [None]:
# Show the contents of the two ESHG folders this notebook works with.
# os.path.join builds the path with the separator of the current operating
# system, so the cell runs on Windows, macOS and Linux alike.
print(os.listdir(os.path.join("..", "for_analysis", "ESHG")))  # input folder
print(os.listdir(os.path.join("..", "counts", "ESHG")))        # counts / output folder


# Read in and organise files

In [None]:
# Read totals.csv (no header row) and name its two columns.
# The path is assembled with os.path.join so it is not tied to Windows separators.
totals_path = os.path.join('..', 'counts', 'ESHG', 'totals.csv')
totals_df = pd.read_csv(totals_path, header=None)
totals_df.columns = ["filename", "abstracts"]
#totals_df

In [None]:
# Read every row of select.csv, then drop the rows the csv reader returned as
# empty lists (blank lines in the file).
# The path is assembled with os.path.join so it is not tied to Windows separators.
select_path = os.path.join('..', 'counts', 'ESHG', 'select.csv')

with open(select_path, newline='', encoding = "utf8") as csvfile:
    raw_select_abstracts = list(csv.reader(csvfile))

select_abstracts = [row for row in raw_select_abstracts if row]

In [None]:
# One DataFrame row per selected abstract; columns are positional (0, 1, 2, ...).
select_df = pd.DataFrame.from_records(select_abstracts)

# How often each distinct value of the first column occurs.
first_column = select_df.iloc[:, 0]
total_select = first_column.value_counts()


In [None]:
# Count the selected abstracts per value of the first column, then turn the
# counts into a two-column frame: 'filename' and 'autism_abstracts'.
first_column_counts = select_df.iloc[:, 0].value_counts()
autism_count = (
    first_column_counts
    .rename_axis('unique_values')
    .to_frame('counts')
    .reset_index()
)
autism_count.columns = ['filename', 'autism_abstracts']

#autism_count

In [None]:
# Combine the per-file totals with the per-file selected-abstract counts.
# NOTE(review): this is an inner join (the pd.merge default), so a file that is
# missing from either table is dropped from merged_counts — confirm that is intended.
merged_counts = pd.merge(totals_df, autism_count, on = 'filename', how = 'inner')

# The first run of four digits in the filename is taken as the year.
year_pattern = r'(\d{4})'
merged_counts['year'] = merged_counts['filename'].str.extract(year_pattern)
merged_counts

In [None]:
# Grand totals over all merged files: all abstracts first, then the selected ones.
for count_column in ('abstracts', 'autism_abstracts'):
    print(merged_counts[count_column].sum())

In [None]:
# Total abstracts per year. GroupBy.sum() has no column argument (its first
# positional parameter is numeric_only), so the column is selected with
# [[...]] before aggregating; the result is still a one-column DataFrame.
print(merged_counts.groupby('year')[['abstracts']].sum())


In [None]:
# Selected (autism-related) abstracts per year. GroupBy.sum() has no column
# argument (its first positional parameter is numeric_only), so the column is
# selected with [[...]] before aggregating; the result is still a one-column DataFrame.
print(merged_counts.groupby('year')[['autism_abstracts']].sum())

# Working with the contents of the selected abstracts

## Dump the content of the selected abstracts into one big string

In [1]:
# Join field 3 of every selected row (presumably the abstract text — confirm
# against select.csv) into one string.
# A space is inserted between abstracts: without it the last word of one
# abstract and the first word of the next are fused ("...autism.Background..."),
# which corrupts both the word tokens and the sentence boundaries downstream.
# The abstracts are no longer printed one by one; use the optional preview below.
bag_of_abstracts = " ".join(abstract[3] for abstract in select_abstracts)

print(type(bag_of_abstracts))
#print(bag_of_abstracts[:100])

NameError: name 'select_abstracts' is not defined

## Bag-of-words
### Prepping

The bag of words approach starts by tokenizing that big string into word-tokens and then works to clean up those tokens by making them all lowercase, removing punctuation, removing whitespace, etc.

This step also involves removing stopwords, with an optional step of checking which words count as stopwords. 


In [None]:
# 1. split the big string into word tokens
abstract_token_word = word_tokenize(bag_of_abstracts)
# 2. lowercase every token
lower = list(map(str.lower, abstract_token_word))
# 3. delete punctuation characters from each token
no_punct = [token.translate(table_punctuation) for token in lower]
# 4. drop tokens that became empty strings once punctuation was removed
no_space = [token for token in no_punct if token]


In [None]:
# OPTIONAL: list, in alphabetical order, every word treated as a stopword.
stop_word_listing = sorted(stop_words)
print(stop_word_listing)

In [None]:
# Keep only the cleaned tokens that are not stopwords (order preserved).
no_stops = [word for word in no_space if word not in stop_words]


### Consolidate

The consolidation step is about trying to get as many versions of the words to be "the same" as reasonably possible. 

This means correcting spelling errors (optional), substituting synonyms (also optional) and stemming the words. 

#### Spell correction

In [None]:
from spellchecker import SpellChecker
import re

# Compiled pattern for runs of word characters; reTokenize() uses it to split text.
WORD = re.compile(r'\w+')
# Spell-checker instance from the spellchecker package, shared by the spell-correction cells.
spell = SpellChecker()

def reTokenize(doc):
    """Return every run of word characters found in `doc`, in order, as a list of strings."""
    return WORD.findall(doc)

# Small sample input for trying out spell_correct().
text = ["Hi, welcmoe to speling.","This is jsut an exapmle, but cosnider a veri big coprus."]

def spell_correct(text):
    """Spell-correct a list of documents.

    Each document is split into words with reTokenize(), every word is replaced
    by the spell checker's suggestion and lowercased, and the words are joined
    back together with single spaces.

    Parameters
    ----------
    text : iterable of str
        The documents to correct.

    Returns
    -------
    list of str
        One corrected, lowercased string per input document.
    """
    sptext = []
    for doc in text:
        corrected_words = []
        for w in reTokenize(doc):
            suggestion = spell.correction(w)
            # correction() can return None when it has no candidate for a word;
            # fall back to the original word instead of failing on None.lower().
            if suggestion is None:
                suggestion = w
            corrected_words.append(suggestion.lower())
        sptext.append(' '.join(corrected_words))
    return sptext


In [None]:
# Preview: print the spell checker's suggestion for the first ten tokens.
first_ten = no_stops[:10]
for candidate in first_ten:
    suggestion = spell.correction(candidate)
    print(suggestion)

In [None]:
# Spell-correct every token and save the result, one corrected word per line.
# This step was reported as too slow to finish, so each distinct word is now
# corrected only once and the answer is reused for all of its repeats.
# It may still be slow on a large vocabulary; skip the cell if it is not needed.
correct_spell = []          # corrected tokens, in the same order as no_stops
correction_cache = {}       # word -> its correction

spell_checked_path = os.path.join('..', 'counts', 'ESHG', 'spell_checked.txt')

with open(spell_checked_path, "w", encoding = "utf8") as outfile:
    for word in no_stops:
        if word not in correction_cache:
            # correction() can return None when it has no candidate; keep the original word then.
            correction_cache[word] = spell.correction(word) or word
        corrected = correction_cache[word]
        correct_spell.append(corrected)
        # newline-separated, so the words do not run together in the file
        outfile.write(corrected + "\n")


#### Stemming


In [None]:
from nltk.stem.porter import PorterStemmer

# Reduce every remaining token to its Porter stem (e.g. 'autistic' -> 'autist').
porter = PorterStemmer()
stemmed = list(map(porter.stem, no_stops))

### NLP analysis - count word frequencies

This step actually does the NLP work for the 'bag-of-words' approach. Here, we can find how many times the 35 most common words occur and also find the exact occurrence counts for select words. 

In [None]:
# Tally how many times each stem occurs.
counts = Counter()
counts.update(stemmed)
print(type(counts))

In [None]:
# The 35 most frequent stems with their counts, most frequent first.
top_stems = counts.most_common(35)
print(top_stems)

In [None]:
# Exact occurrence counts for the stems of interest (a stem that never occurs prints 0).
for stem_of_interest in ('autist', 'asd', 'asperg', 'autism'):
    print(counts[stem_of_interest])

## Person-first and identity first

This is the second NLP approach, this time taking the words in context instead of as isolated objects. 

It too has a prepping step, which consists of tokenising the abstracts into sentences, selecting those sentences which contain the keywords of interest, and converting the selected sentences into a spaCy object (which also part-of-speech tags and lemmatises them). 

### Prepping

In [None]:
# Split the combined abstract text into a list of sentences.
person_identity = sent_tokenize(bag_of_abstracts)
# Optional checks:
#print(type(person_identity))
#print(person_identity[:10])
#print(len(person_identity))

In [None]:
# Keep the sentences that contain at least one keyword as a (case-sensitive) substring.
keyword_variants = ('autistic', 'Autistic', 'autism', 'Autism', 'ASD', 'asd', 'Asperger', 'asperger')

select_sentences = [
    sentence
    for sentence in person_identity
    if any(keyword in sentence for keyword in keyword_variants)
]

print(select_sentences[:10])
print(len(select_sentences))

In [None]:
# Glue the selected sentences into one string: a leading blank, then every
# sentence preceded by a single space.
p_i_string = ' ' + ''.join(' ' + sentence for sentence in select_sentences)

#print(p_i_string[:100])           # optional check to see what the output looks like

In [None]:
# Run the spaCy pipeline `nlp` over the combined keyword sentences.
# The resulting object is what the Matcher cells and the part-of-speech export iterate over.
p_i_doc = nlp(p_i_string)         # convert the string to a spaCy object; 
                                  # this also POS-tags and lemmatises the words as it goes. 

### Person first pattern

The second NLP analysis examines the context of words that fit certain patterns. In this case, this means identifying, counting and examining strings of words that match a person-first pattern. 

In [None]:
def _person_first_pattern(condition_regex):
    """Build a Matcher token pattern: NOUN, 'with', up to three optional 'amod' tokens, then a token whose text matches `condition_regex`."""
    token_specs = [{"POS": "NOUN"}, {'LOWER': 'with'}]
    # three independent optional modifier slots
    token_specs.extend({'DEP': 'amod', 'OP': "?"} for _ in range(3))
    token_specs.append({"TEXT": {"REGEX": condition_regex}})
    return token_specs


# Person-first phrasing, e.g. "<noun> with autism" / "<noun> with Asperger" / "<noun> with ASD".
pattern_2 = _person_first_pattern("^[Aa]utism$")
pattern_3 = _person_first_pattern("^[Aa]sperger$")
pattern_4 = _person_first_pattern("^ASD$")

In [None]:
# Matcher class object 
matcher = Matcher(nlp.vocab) 
matcher.add("matching_1", [pattern_2, pattern_3, pattern_4]) 

person_first =[]
matches = matcher(p_i_doc) 
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = p_i_doc[start:end]  # The matched span
    person_first.append(span.text)
    
#print(person_first)

In [None]:
# Normalise the matched phrases: lowercase, delete punctuation characters,
# then discard any phrase that ended up as an empty string.
p_f_lower = list(map(str.lower, person_first))
p_f_no_punct = [phrase.translate(table_punctuation) for phrase in p_f_lower]
p_f_no_space = [phrase for phrase in p_f_no_punct if phrase]


In [None]:
# Distinct person-first phrases, sorted so the exported file is identical from run to run.
person_first_no_dups = sorted(set(p_f_no_space))

person_first_path = os.path.join('..', 'counts', 'ESHG', 'person_first.csv')

# newline='' is required by the csv module; without it every row is followed by
# a blank line on Windows. utf8 matches the identity-first export and cannot
# fail on characters outside Latin-1.
with open(person_first_path, "w", newline='', encoding='utf8') as outfile:
    write = csv.writer(outfile)
    for item in person_first_no_dups:
        write.writerow([item])      # one phrase per row

In [None]:
# Number of person-first matches: all of them first, then the distinct ones.
for phrase_list in (p_f_no_space, person_first_no_dups):
    print(len(phrase_list))

### Identity first pattern

In [None]:
def _identity_first_pattern(condition_regex):
    """Build a Matcher token pattern: a token whose text matches `condition_regex`, up to three optional 'amod' tokens, then a NOUN."""
    token_specs = [{"TEXT": {"REGEX": condition_regex}}]
    # three independent optional modifier slots
    token_specs.extend({'DEP': 'amod', 'OP': "?"} for _ in range(3))
    token_specs.append({"POS": "NOUN"})
    return token_specs


# Identity-first phrasing, e.g. "autistic <noun>" / "Asperger <noun>" / "ASD <noun>".
# These regexes are anchored at the start of the token only.
pattern_a = _identity_first_pattern("^[Aa]utistic")
pattern_b = _identity_first_pattern("^[Aa]sperger")
pattern_c = _identity_first_pattern("^ASD")

In [None]:
# Matcher class object 
matcher = Matcher(nlp.vocab) 
matcher.add("matching_1", [pattern_a, pattern_b, pattern_c]) 

identity_first =[]

In [None]:
    
# Collect the text of every span matched in p_i_doc.
# The list is rebuilt from scratch here rather than appended to, so running
# this cell more than once does not duplicate the matches.
matches = matcher(p_i_doc)
identity_first = [p_i_doc[start:end].text for match_id, start, end in matches]

#print(matches[:10])

In [None]:
# Normalise the matched phrases: lowercase, delete punctuation characters,
# then discard any phrase that ended up as an empty string.
i_f_lower = list(map(str.lower, identity_first))
i_f_no_punct = [phrase.translate(table_punctuation) for phrase in i_f_lower]
i_f_no_space = [phrase for phrase in i_f_no_punct if phrase]


In [None]:
# Distinct identity-first phrases, sorted so the exported file is identical from run to run.
identity_first_no_dups = sorted(set(i_f_no_space))

identity_first_path = os.path.join('..', 'counts', 'ESHG', 'identity_first.csv')

# newline='' is required by the csv module; without it every row is followed by
# a blank line on Windows.
with open(identity_first_path, "w", newline='', encoding='utf8') as outfile:
    write = csv.writer(outfile)
    for item in identity_first_no_dups:
        write.writerow([item])      # one phrase per row

In [None]:
# Number of identity-first matches: all of them first, then the distinct cleaned ones.
for phrase_list in (identity_first, identity_first_no_dups):
    print(len(phrase_list))

### Word counts by part of speech


In [None]:
# For every token whose text contains a keyword (case-sensitive substring),
# record [text, lemma, coarse POS, fine-grained tag].
pos_keywords = ('autistic', 'Autistic', 'autism', 'Autism', 'ASD', 'asd', 'Asperger', 'asperger')

POS_p_i = [
    [token.text, token.lemma_, token.pos_, token.tag_]
    for token in p_i_doc
    if any(keyword in token.text for keyword in pos_keywords)
]

In [None]:
# Export the keyword tokens, one row per token with four columns:
# text, lemma, POS, tag.
# Each entry of POS_p_i is already a list of fields, so it is written as a row
# directly; wrapping it in another list would put the Python repr of the whole
# list into a single cell.
pos_path = os.path.join('..', 'counts', 'ESHG', 'POS.csv')

# newline='' is required by the csv module; without it every row is followed by
# a blank line on Windows.
with open(pos_path, "w", newline='', encoding='utf8') as outfile:
    write = csv.writer(outfile)
    write.writerows(POS_p_i)