# Preliminary work to clean the texts

## Install and import necessary things

Start off by installing the required packages (if you don't already have them installed) and then importing all required packages. 

In [8]:
%%capture

# installing necessary pdf conversion packages via pip
# the '%%capture' at the top of this cell suppresses the output (which is normally quite long and annoying looking). 
# You can remove or comment it out if you prefer to see the output. 

!pip install autocorrect          

In [9]:
import os                         # os is a module for navigating your machine (e.g., file directories).
import nltk                       # nltk stands for natural language tool kit and is useful for text-mining. 
from nltk import word_tokenize    # and some of its key functions
from nltk import sent_tokenize    
import numpy as np
import pandas as pd
import csv                        # csv is for importing and working with csv files
import statistics
import datetime
date = datetime.date.today()

from autocorrect import Speller   # things we need for spell checking
check = Speller(lang='en')

import re                         # things we need for RegEx corrections

# Split the strings written in the .txt file 

## Define the splitting functions

So far, the contents of each .txt file are one loooooooooooooooong string. We want to split that string into multiple strings at specified points, according to patterns or regular expressions that are relevant to the specific text. So, I define a function that splits a string at every match of a regular expression and returns the non-empty pieces: the matched text (when the pattern contains a capturing group, as the patterns below do) together with the stretches of text in between. 

I then define a function that takes an input folder, an output folder to store the split files, a second output folder, and a pre-compiled regular expression. This function looks in the input folder, splits the text of each file at the matches of the regular expression, and writes each match together with the text that follows it as one row of a new file saved in the first output folder. It also records each filename and the number of matches to the regular expression in a new .csv file, which is saved in the second output folder. 

In [10]:
def split_with_separators(regex, s):
    """Split ``s`` at every match of ``regex`` and return the non-empty pieces.

    ``regex`` is a compiled pattern. The pieces produced by ``regex.split`` are
    returned in their original order, with empty strings and ``None`` entries
    dropped. When the pattern contains a capturing group, the captured separator
    text is part of the result, sitting between the stretches of text it
    separated.
    """
    return [piece for piece in regex.split(s) if piece]

def split_text(input, output1, output2, regex):
    """Split every text file in ``input`` into one row per match of ``regex``.

    Each line of each input file is searched for matches of ``regex``. Every
    match starts a new row made of the matched text, a single space, and the
    text that follows it up to the next match (or the end of the line). Text
    that precedes the first match on a line is discarded.

    Args:
        input: Folder holding the files to split (read as UTF-8).
        output1: Existing folder that receives one ``<name>.txt`` per input
            file, where ``<name>`` is the input filename without a trailing
            ``.txt``.
        output2: Existing folder that receives ``totals.csv``, holding one
            ``name, number_of_matches`` row per input file.
        regex: Compiled regular expression marking the start of each row.

    Raises:
        OSError: If a folder is missing or a file cannot be read or written.
    """
    totals = []
    # Sorted so that totals.csv comes out in the same order on every run.
    for filename in sorted(os.listdir(input)):
        source_path = os.path.join(input, filename)
        if not os.path.isfile(source_path):
            continue  # a sub-folder cannot be opened as a text file
        name = filename[:-len(".txt")] if filename.endswith(".txt") else filename
        target_path = os.path.join(output1, name + ".txt")
        match_count = 0
        with open(source_path, "r", encoding="utf-8") as f:
            # Opened once per input file: reopening it in "w" mode for every
            # line would throw away the rows of all earlier lines.
            with open(target_path, "w", encoding="utf-8") as fp:
                for line in f:
                    text = line.rstrip("\r\n")
                    # Zero-length matches carry no separator text; ignore them.
                    found = [m for m in regex.finditer(text) if m.group(0)]
                    # Pair each match with its successor so the row can run
                    # up to where the next match begins. Nothing is deleted
                    # while looping, so every match produces exactly one row.
                    for current, following in zip(found, found[1:] + [None]):
                        stop = len(text) if following is None else following.start()
                        row_contents = current.group(0) + " " + text[current.end():stop]
                        fp.write("%s\n" % row_contents)
                    match_count += len(found)
        totals.append([name, match_count])
    # newline="" stops the csv module's own line endings from being translated
    # a second time, which would leave a blank row after every record on Windows.
    with open(os.path.join(output2, "totals.csv"), "w", encoding="utf-8", newline="") as out_total:
        csv.writer(out_total).writerows(totals)

## Test the splitting function

Let's apply these splitting functions to the Test folder. To start, we check the contents of the input folder, define the regular expression, run the function with the relevant arguments, and check the output folder. 

For this test, the regular expression is simply the letter 'e'. 

In [4]:
# List the test texts that the splitting function will read.
os.listdir("..\\output_texts\\Test")  

['input_pdf_1.txt', 'input_pdf_2.txt', 'input_pdf_3.txt']

In [37]:
# The lazy '.*?' is free to match nothing, so this pattern matches just the single letter 'e'.
regex_test = re.compile(r"(e.*?)")

# Every backslash in the paths is doubled so that none of them is read as an escape sequence.
split_text("..\\output_texts\\Test", "..\\for_analysis\\Test", "..\\counts\\Test", regex_test)


In [38]:
# Confirm that the split files and totals.csv were written for the test run.
print(os.listdir("..\\for_analysis\\Test"))
print(os.listdir("..\\counts\\Test"))

['input_pdf_1.txt', 'input_pdf_2.txt', 'input_pdf_3.txt']
['totals.csv']


All seems well, but you should probably inspect the actual files to be sure things look the way you expect. 

## Run the splitting function on the target files

This time, the regular expression is more complicated and is meant to capture the way that a limited number of capital letters followed by one or more digits mark the start of each abstract. 

In [11]:
# List the ESHG abstract texts that the splitting function will read.
os.listdir("..\\output_texts\\ESHG")  

['2001abstractICHG.txt',
 '2002Abstracts.txt',
 '2003Abstracts.txt',
 '2004.txt',
 '2004Abstracts.txt',
 '2005Abstracts.txt',
 '2006Abstracts.txt',
 '2007Abstracts.txt',
 '2008Abstracts.txt',
 '2009Abstracts.txt',
 '2010Abstracts.txt',
 '2011Abstracts.txt',
 '2012Abstracts.txt',
 '2013Abstracts.txt',
 '2014Abstracts.txt',
 '2015Abstracts.txt',
 '2016Abstracts.txt',
 '2017 electronic posters.txt',
 '2017 oral presentations.txt',
 '2017 posters.txt',
 '2018 electronic posters.txt',
 '2018 EMPAG.txt',
 '2018 oral presentation.txt',
 '2018 posters.txt',
 '2019 oral presentation.txt',
 '2019 posters.txt',
 '2019 posters2.txt',
 '2020 eposters.txt',
 '2020 interactive eposter.txt',
 '2020 oral presentation.txt',
 '2021 eposters.txt',
 '2021 oral presentations.txt']

In [14]:
# One of the capital letters P, L, S, C or E followed by one or more digits.
# Inside [...] a '|' is a literal pipe character rather than "or", so the
# letters are listed without separators.
regex_ESHG = re.compile(r"([PLSCE]\d+)")

# Every backslash in the paths is doubled so that none of them is read as an escape sequence.
split_text("..\\output_texts\\ESHG", "..\\for_analysis\\ESHG", "..\\counts\\ESHG", regex_ESHG)

In [15]:
# Confirm that the split files and totals.csv were written for the ESHG run.
print(os.listdir("..\\for_analysis\\ESHG"))
print(os.listdir("..\\counts\\ESHG"))

['2001abstractICHG.txt', '2002Abstracts.txt', '2003Abstracts.txt', '2004.txt', '2004Abstracts.txt', '2005Abstracts.txt', '2006Abstracts.txt', '2007Abstracts.txt', '2008Abstracts.txt', '2009Abstracts.txt', '2010Abstracts.txt', '2011Abstracts.txt', '2012Abstracts.txt', '2013Abstracts.txt', '2014Abstracts.txt', '2015Abstracts.txt', '2016Abstracts.txt', '2017 electronic posters.txt', '2017 oral presentations.txt', '2017 posters.txt', '2018 electronic posters.txt', '2018 EMPAG.txt', '2018 oral presentation.txt', '2018 posters.txt', '2019 oral presentation.txt', '2019 posters.txt', '2019 posters2.txt', '2020 eposters.txt', '2020 interactive eposter.txt', '2020 oral presentation.txt', '2021 eposters.txt', '2021 oral presentations.txt']
['autism.csv', 'identity_first.csv', 'person_with.csv', 'POS.csv', 'select.csv', 'totals.csv']


# Check and return only select abstracts

## Define the checking function

Now, each abstract should be a row of its own within each .txt file. But not all of the abstracts will be relevant to the research question, so we need to remove the irrelevant rows and keep the relevant ones. 

The first step to doing that is to define a function that takes an input folder, an output folder, and a list of keywords as arguments. The function opens the files in the input folder, searches through each row in the current file for matches to the list of keywords, and adds the name of the file, the matching keyword, the row number, and the contents of the row that contains the match to a list. Finally, the function eliminates duplicates in that list and writes it to a .csv file in the output folder. 

For this research question, the keywords of interest should catch 'autistic', 'autism', 'asperger's' and 'aspergers' regardless of whether they start with an upper or lowercase letter, plus 'ASD'. 

Note: this function is not applied to the Test files because the function is not so very slow now (.txt files are much faster to work with than .pdf) and because the test and target files are now so very different. 

In [43]:
def match_strings_in_text(input, output, list_of_strings, encoding='ISO-8859-1'):
    """Collect every line that contains a keyword and save the hits as a csv.

    Each file in ``input`` is read line by line. For every keyword in
    ``list_of_strings`` that occurs in a line (a case-sensitive substring
    test), one record ``(name, keyword, line_number, line)`` is kept, where
    ``name`` is the filename without a trailing ``.txt``, ``line_number``
    starts at 1 and ``line`` has its trailing whitespace removed. Duplicate
    records are dropped and the remainder is written to ``select.csv`` in
    ``output``, in the order in which the records were found.

    Args:
        input: Folder holding the files to search.
        output: Existing folder that receives ``select.csv``.
        list_of_strings: Keywords to look for.
        encoding: Text encoding used both to read the input files and to write
            ``select.csv``. The default is the value this function has always
            used; ``split_text`` writes its files as UTF-8, so ``'utf-8'`` can
            be passed to read those without re-interpreting their bytes.

    Raises:
        OSError: If a folder is missing or a file cannot be read or written.
    """
    list_of_results = []
    # Sorted so that the rows of select.csv come out in the same order on every run.
    for filename in sorted(os.listdir(input)):
        source_path = os.path.join(input, filename)
        if not os.path.isfile(source_path):
            continue  # a sub-folder cannot be opened as a text file
        name = filename[:-len(".txt")] if filename.endswith(".txt") else filename
        with open(source_path, "r", encoding=encoding) as read_obj:
            for line_number, line in enumerate(read_obj, start=1):
                # A line is recorded once for each keyword it contains.
                for string_to_search in list_of_strings:
                    if string_to_search in line:
                        list_of_results.append((name, string_to_search, line_number, line.rstrip()))
    # dict.fromkeys removes duplicates like set() does, but keeps first-seen order.
    no_dups_results = list(dict.fromkeys(list_of_results))
    # newline="" stops the csv module's own line endings from being translated
    # a second time, which would leave a blank row after every record on Windows.
    with open(os.path.join(output, "select.csv"), "w", encoding=encoding, newline="") as outfile:
        csv.writer(outfile).writerows(no_dups_results)



In [44]:
 # The keyword test is case-sensitive, so both capitalisations of each keyword are listed.
 match_strings_in_text('..\\for_analysis\\ESHG', "..\\counts\\ESHG", ['autis', 'Autis', 'ASD', 'Asperger', 'asperger',])

Always sensible to double check the output. 

In [45]:
# select.csv should now be listed in the counts folder.
print(os.listdir("..\\counts\\ESHG"))

['select.csv', 'totals.csv']
