# Get ready

First, download, import, prep packages and such. 

Then, check the file location and import the .csv files. Remove any with empty text fields. 

Save a data frame with all the texts and another with only those texts that mention the keywords of interest. 

In [2]:
%%capture

# installing necessary pdf conversion packages via pip
# the '%%capture' at the top of this cell suppresses the output (which is normally quite long and annoying looking). 
# You can remove or comment it out if you prefer to see the output. 
!pip install nltk
!pip install spacy -q
!python -m spacy download en_core_web_lg -q


In [3]:
%%capture

import os                         # os is a module for navigating your machine (e.g., file directories).
import nltk                       # nltk stands for natural language tool kit and is useful for text-mining. 
from nltk import word_tokenize    # and some of its key functions
from nltk import sent_tokenize  
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.corpus import wordnet                    # Finally, things we need for lemmatising!
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
nltk.download('averaged_perceptron_tagger')        # Like a POS-tagger...
nltk.download('wordnet')
nltk.download('webtext')
from nltk.corpus import webtext

import pandas as pd
pd.set_option('display.max_colwidth', 200)
import numpy as np
import statistics
import datetime
date = datetime.date.today()

import codecs
import csv                        # csv is for importing and working with csv files

from collections import Counter

import statistics
import re                         # things we need for RegEx corrections
import matplotlib.pyplot as plt
import string 
import spacy 
from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 
nlp = spacy.load('en_core_web_lg')
nlp.max_length = 1500000 #or any large value, as long as you don't run out of RAM

import math 

English_punctuation = "-!\"#$%&()'*-–+,./:;<=>?@[\]^_`{|}~''“”"      # Things for removing punctuation, stopwords and empty strings
table_punctuation = str.maketrans('','', English_punctuation)

In [None]:
print(os.listdir("..\\results")  )

files = []
def import_results(input):
    for f in os.listdir(input):
        f = pd.read_csv(input + '\\'+ f,encoding='latin1')
        files.append(f)
    output = pd.concat(files)
    return output

In [None]:
all_results = import_results("..\\results")
len(all_results)

In [None]:
year_04 = pd.read_csv('..\\results\\ESHG2004.csv')
year_04 = year_04.iloc[:, [0,1]]
year_04

In [None]:
all_results = pd.concat([all_results, year_04])
len(all_results)

In [None]:
no_null_texts = all_results[~all_results['Text'].isnull()]
len(no_null_texts)

In [None]:
matched_texts = no_null_texts[no_null_texts['Text'].str.contains('autis|Autis|ASD|Asperger|asperger')]
len(matched_texts)

# Sent tokenisation

In [None]:
sentences  = [sent_tokenize(abstract) for abstract in matched_texts['Text'] ] # make tokenized sentences list
matched_texts['Sentence'] = sentences                                         # copy that list back into df as a new column
matched_texts = matched_texts.explode('Sentence')                             # explode to create 1 row per sentence token


In [None]:
matched_sents = matched_texts[matched_texts['Sentence'].str.contains('autis|Autis|ASD|Asperger|asperger')]
                                                                            # keep only those rows that contain the keywords
matched_sents = matched_sents.drop_duplicates()                             # drop any duplicates
len(matched_sents)                                                          # check length of remaining data frame

## Person-first pattern

In [None]:
pattern_2 = [{"POS": "NOUN"},
             {'LOWER': 'with'},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {"TEXT": {"REGEX": "^[Aa]utism$"}}]

pattern_3 = [{"POS": "NOUN"},
             {'LOWER': 'with'},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {"TEXT": {"REGEX": "^[Aa]sperger$"}}]

pattern_4 = [{"POS": "NOUN"},
             {'LOWER': 'with'},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {"TEXT": {"REGEX": "^ASD$"}}]

# Matcher class object 
matcher = Matcher(nlp.vocab) 
matcher.add("matching_1", [pattern_2, pattern_3, pattern_4]) 

In [None]:
def find_pattern1_match(input):
    thingy = nlp(input)
    match = matcher(thingy)
    if match == []:
        out_value = ''
    else:
        hold_multi_spans = []
        for match_id, start, end in match:
                string_id = nlp.vocab.strings[match_id]  # Get string representation
                span = thingy[start:end]  # The matched span
                hold_multi_spans.append(span)
        out_value = hold_multi_spans
    return out_value



In [None]:
matched_sents['Person-first'] = matched_sents.apply(lambda row: find_pattern1_match(row.Sentence), axis = 1)


## Identity first pattern

In [None]:
pattern_a = [{"TEXT": {"REGEX": "^[Aa]utistic"}},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {"POS": "NOUN"}]

pattern_b = [{"TEXT": {"REGEX": "^[Aa]sperger"}},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {"POS": "NOUN"}]

pattern_c = [{"TEXT": {"REGEX": "^ASD"}},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {'DEP':'amod', 'OP':"?"},
             {"POS": "NOUN"}]

# Matcher class object 
matcher = Matcher(nlp.vocab) 
matcher.add("matching_2", [pattern_a, pattern_b, pattern_c]) 


In [None]:
matched_sents['Identity-first'] = matched_sents.apply(lambda row: find_pattern1_match(row.Sentence), axis = 1)

In [None]:
len(matched_sents)

In [None]:
matched_sents = matched_sents.explode('Person-first')                             # explode to create 1 row per sentence token
len(matched_sents)

In [None]:
matched_sents = matched_sents.explode('Identity-first')                             # explode to create 1 row per sentence token
len(matched_sents)

In [None]:
final = matched_sents[(matched_sents['Person-first'] != '') | (matched_sents['Identity-first'] != '')]
len(final)

In [None]:
final

In [None]:
final.to_csv('..\\output\\text_match_results.csv') 

In [1]:
print(os.listdir("..\\..\\2023_Second_analysis\\output")  )

NameError: name 'os' is not defined

## Chart person first or identity first by year

In [None]:
person_identity_first = pd.read_csv('..\\..\\2023_Second_analysis\\output\\text_match_results.csv')
person_identity_first = person_identity_first.dropna(how='all')
person_identity_first['Year'] = person_identity_first['Year'].astype('Int64')

In [None]:
person_count = person_identity_first.groupby(['Year'])['Person-first'].count()
identity_count = person_identity_first.groupby(['Year'])['Identity-first'].count()


In [None]:
person_identity_count=pd.concat([person_count,identity_count],axis=1)


In [None]:
person_identity_count.plot()
plt.show()

In [None]:
person_examples = person_identity_first.groupby(['Person-first'])['Person-first'].count()
identity_examples = person_identity_first.groupby(['Identity-first'])['Identity-first'].count()


In [None]:
person_identity_examples=pd.concat([person_examples,identity_examples],axis=1)


In [None]:
person_identity_examples.sort_values(by=['Person-first'], ascending=False)

In [None]:
person_identity_examples.sort_values(by=['Identity-first'], ascending=False)

In [None]:
person_identity_examples.notnull().sum()

In [None]:
has_person = person_identity_first[~person_identity_first['Person-first'].isnull()]
len(has_person)


In [None]:
has_identity = person_identity_first[~person_identity_first['Identity-first'].isnull()]
len(has_identity)


## Count abstracts by the structures they use

In [None]:
person_by_title = person_identity_first.groupby(['Title'])['Person-first'].count()
identity_by_title = person_identity_first.groupby(['Title'])['Identity-first'].count()
title = pd.concat([person_by_title,identity_by_title],axis=1)
title

In [None]:
title.sort_values(by=['Identity-first'], ascending=False)

In [None]:
title.sort_values(by=['Person-first'], ascending=False)

In [None]:
columns = ['Person-first','Identity-first']
filter_ = (title[columns] > 0).all(axis=1)
title[filter_]
len(title[filter_])


In [None]:
title[filter_].sort_values(by=['Person-first'], ascending=False)

In [None]:
title[filter_].sort_values(by=['Identity-first'], ascending=False)

## Word counts by part of speech


In [None]:
POS_p_i = []

for token in p_i_doc:
    this_token = [token.text, token.lemma_, token.pos_, token.tag_]
    if any (s in token.text for s in ['autistic', 'Autistic', 'autism', 'Autism', 'ASD', 'asd', 'Asperger', 'asperger']):
        POS_p_i.append(this_token)

In [None]:
with open('..\\counts\\ESHG\\POS.csv', "w", encoding='utf8') as outfile:
        write = csv.writer(outfile)
        for item in POS_p_i:
            write.writerow([item])

In [None]:
p_f_lower = [word.lower() for word in person_first]     # make those tokens lowercase
p_f_no_punct = [w.translate(table_punctuation) for w in p_f_lower] # remove the punctuation
p_f_no_space = (list(filter(lambda x: x, p_f_no_punct)))           # remove any extra whitespace

In [None]:
#for saving output
os.makedirs('folder/subfolder', exist_ok=True)  
df.to_csv('folder/subfolder/out.csv') 