# Word frequencies


Now that we have the abstracts in two nice neat .csv files, we need to install and import the required packages, load the .csv files, and then we can get on with the first part of the analysis.

## Get ready 

As always, we start with a couple of code cells that load up and nickname some useful packages, then check file locations, then import files and check them. 


In [1]:
%%capture

# Install NLTK (the Natural Language Toolkit, used below for text mining) via pip.
# The '%%capture' at the top of this cell suppresses the output (which is normally quite long and annoying looking). 
# You can remove or comment it out if you prefer to see the output. 
!pip install nltk


In [2]:
%%capture

import os                         # os is a module for navigating your machine (e.g., file directories).
import nltk                       # nltk stands for natural language tool kit and is useful for text-mining. 
from nltk import word_tokenize    # and some of its key functions

nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.corpus import wordnet                    # Finally, things we need for lemmatising!
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
nltk.download('averaged_perceptron_tagger')        # Like a POS-tagger...
nltk.download('wordnet')
nltk.download('webtext')
from nltk.corpus import webtext

import pandas as pd
pd.set_option('display.max_colwidth', 200)
import numpy as np
import statistics
import datetime
date = datetime.date.today()

import codecs
import csv                        # csv is for importing and working with csv files

from collections import Counter

import statistics
import re                         # things we need for RegEx corrections
import matplotlib.pyplot as plt
import string 

import math 

# Characters to strip from every token (ASCII punctuation plus the en dash and
# curly double quotes). The backslash is written as '\\' because '\]' is not a
# valid escape sequence and triggers a SyntaxWarning on recent Python versions;
# the resulting string is unchanged.
English_punctuation = "-!\"#$%&()'*-–+,./:;<=>?@[\\]^_`{|}~''“”"      # Things for removing punctuation, stopwords and empty strings
# Translation table that deletes every character listed above (see str.translate).
table_punctuation = str.maketrans('', '', English_punctuation)

In [3]:
# List the contents of the 'output' folder to check it is not empty and holds
# the expected files. os.path.join picks the right path separator for the
# current platform (a literal "..\\output" only resolves on Windows).
print(os.listdir(os.path.join("..", "output")))

# For reference - presumably how the two abstract files were written in an earlier step:
#no_null_texts.to_csv('..\\output\\all_abstracts_no_null_texts.csv')

#no_nans_matched_texts.to_csv('..\\output\\matched_abstracts_no_null_texts.csv') 

['all_abstracts_no_null_texts.csv', 'matched_abstracts_no_null_texts.csv', 'text_match_results.csv']


## Import

Having checked the contents of the output folder and seen the files we expected to see, we can now import and check them. 

In [10]:
# Load the two abstract tables. Paths are assembled with os.path.join so the
# notebook also runs on macOS/Linux (a literal '..\\output\\...' is Windows-only).
all_texts = pd.read_csv(os.path.join('..', 'output', 'all_abstracts_no_null_texts.csv'))          # one for all of the texts and then
matched_texts = pd.read_csv(os.path.join('..', 'output', 'matched_abstracts_no_null_texts.csv'))  # one for just those that match the keyword

In [11]:
# Sanity check: print the row count of each table (all texts first, then the
# keyword matches), one per line, to confirm they match what we expect.
print(len(all_texts), len(matched_texts), sep='\n')

33979
906


## Get some basic stats about how texts are spread out over time

We know that all of the rows in the files have at least two columns with contents - 'Year' and 'Text'. This means that it is probably useful to build a small table (and, later, a plot) that counts rows by year. Let's do that now!

In [19]:
# Tally the number of rows per year. The result is a Series keyed by year;
# printed as-is it has no column headers and is ordered by count, not by year.
all_counts_by_year = all_texts.loc[:, 'Year'].value_counts()
print(all_counts_by_year)

2013.0    2373
2001.0    2336
2014.0    2240
2004.0    2205
2016.0    2040
2011.0    1967
2015.0    1951
2008.0    1896
2012.0    1871
2010.0    1716
2009.0    1704
2021.0    1570
2007.0    1541
2005.0    1520
2019.0    1437
2006.0    1422
2002.0    1266
2003.0     996
2020.0     693
2017.0     651
2018.0     584
Name: Year, dtype: int64


In [20]:
# Turn the value_counts() Series into a tidy two-column table ('Year', 'Counts').
# The count column is named explicitly via reset_index(name=...) instead of
# renaming a column called "Year": newer pandas names the value_counts result
# "count", so that rename would silently do nothing there.
all_counts_by_year = all_counts_by_year.rename_axis('Year').reset_index(name='Counts')
all_counts_by_year = all_counts_by_year.sort_values(by=['Year'])           # chronological order
print(all_counts_by_year)                                                  # Let's just check it worked. 

      Year  Counts
1   2001.0    2336
16  2002.0    1266
17  2003.0     996
3   2004.0    2205
13  2005.0    1520
15  2006.0    1422
12  2007.0    1541
7   2008.0    1896
10  2009.0    1704
9   2010.0    1716
5   2011.0    1967
8   2012.0    1871
0   2013.0    2373
2   2014.0    2240
6   2015.0    1951
4   2016.0    2040
19  2017.0     651
20  2018.0     584
14  2019.0    1437
18  2020.0     693
11  2021.0    1570


In [None]:
# Keep only the rows that actually have an abstract. The source frame is
# 'all_texts' (loaded from the .csv above); 'all_results' is not defined
# anywhere in this notebook, so referencing it raised a NameError.
no_null_texts = all_texts[all_texts['Text'].notnull()]
len(no_null_texts)

In [None]:
# Rows per year for all texts, as a two-column table ('Year', 'All') in
# chronological order. reset_index(name=...) names the count column directly,
# which works whatever name pandas gives the value_counts() result.
no_null_counts_by_year = (
    no_null_texts['Year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='All')
    .sort_values(by=['Year'])
)
# The sorted table used to be stored only under this shorter name; keep it as
# an alias so any code that refers to it still works.
no_null_counts = no_null_counts_by_year

In [None]:
# Select the rows whose text mentions any of the keywords of interest
# (the pattern is a regular expression of '|'-separated alternatives).
matched_texts = no_null_texts.loc[
    no_null_texts['Text'].str.contains('autis|Autis|ASD|Asperger|asperger')
]
len(matched_texts)

In [None]:
# Rows per year for the keyword-matching texts, as a two-column table
# ('Year', 'Matches') in chronological order. reset_index(name=...) names the
# count column directly, which works whatever name pandas gives the
# value_counts() result.
matched_counts_by_year = (
    matched_texts['Year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Matches')
    .sort_values(by=['Year'])
)

In [None]:
# Combine the two per-year tables (a left join keeps every year from the
# all-texts table), order by year, index by year, and plot one line per column.
counts_year = (
    no_null_counts_by_year
    .merge(matched_counts_by_year, on='Year', how='left')
    .sort_values(by='Year')
    .set_index('Year')
)
counts_year.plot()
plt.show()

In [None]:
# Show the year-indexed table of merged counts that the plot above was drawn from.
print(counts_year)

# Count word frequencies 
## Bag of words

Proceed through the 'bag of words' steps for the data frames with all texts and then again for the data frame with only the texts that mention the keywords of interest. This approach finds word frequencies for all years together. 

In [None]:
def bag_of_words_analysis(input, how_many):
    """Return the most frequent word stems across all texts in a data frame.

    All entries of the 'Text' column are pooled, tokenised, lower-cased and
    stripped of punctuation; empty tokens, pure-digit tokens and English stop
    words are dropped, and the remaining tokens are Porter-stemmed and counted.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame with a 'Text' column. Missing texts are skipped.
    how_many : int
        Number of top stems to return.

    Returns
    -------
    list of (str, int)
        ``(stem, count)`` pairs, most frequent first.
    """
    # Join with a space: gluing the texts together directly would fuse the last
    # word of one abstract with the first word of the next into a bogus token.
    # str.join also avoids rebuilding an ever-growing string on every row.
    pooled_text = " ".join(input['Text'].dropna().astype(str))

    # Lower-case each token and delete its punctuation characters (lazily).
    cleaned_tokens = (
        token.lower().translate(table_punctuation)
        for token in word_tokenize(pooled_text)
    )
    # Drop tokens left empty by the punctuation removal, bare numbers and
    # stop words, then reduce what is left to its stem.
    stems = [
        porter.stem(token)
        for token in cleaned_tokens
        if token and not token.isdigit() and token not in stop_words
    ]
    return Counter(stems).most_common(how_many)

In [None]:
# The 15 most common word stems across every row of no_null_texts.
bag_of_words_analysis(no_null_texts, 15)

In [None]:
# The 15 most common word stems in the keyword-matching texts only.
bag_of_words_analysis(matched_texts, 15)

### Word Frequency by year

In [None]:

def track_word_over_time(input, target_word):
    """Count how often one word stem occurs in each year's texts.

    For every distinct value in the 'Year' column, the texts of that year are
    pooled, tokenised, lower-cased and stripped of punctuation; empty tokens,
    pure-digit tokens and English stop words are dropped, and the remaining
    tokens are Porter-stemmed and counted.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame with 'Year' and 'Text' columns. Missing texts are skipped.
    target_word : str
        The term to count. It is compared against the stemmed, lower-cased
        tokens as given, so pass the stem itself (e.g. 'diseas', not
        'disease').

    Returns
    -------
    pandas.DataFrame
        One row per distinct year, in order of first appearance, with columns
        'Year' and ``str(target_word)`` (the count for that year).
    """
    years = input['Year'].drop_duplicates()
    target_counts = []
    for year in years:
        year_texts = input.loc[input['Year'] == year, 'Text'].dropna().astype(str)
        # Join with a space: gluing the texts together directly would fuse the
        # last word of one abstract with the first word of the next.
        year_bag = " ".join(year_texts)

        # Lower-case each token and delete its punctuation characters (lazily).
        cleaned_tokens = (
            token.lower().translate(table_punctuation)
            for token in word_tokenize(year_bag)
        )
        # Drop empty tokens, bare numbers and stop words; count the stems.
        stem_counts = Counter(
            porter.stem(token)
            for token in cleaned_tokens
            if token and not token.isdigit() and token not in stop_words
        )
        # A Counter returns 0 for a stem that never occurred this year.
        target_counts.append(stem_counts[target_word])

    target_word_by_year = pd.DataFrame(
        list(zip(years, target_counts)),
        columns=['Year', str(target_word)],
    )
    return target_word_by_year


In [None]:
# Per-year counts for each autism-related stem (the terms are given in their
# stemmed form, which is what the tracker compares against).
ASD, autism, autistic, asperger = (
    track_word_over_time(no_null_texts, stem)
    for stem in ('asd', 'autism', 'autist', 'asperger')
)

# Line the four tables up on 'Year' so each stem becomes one column.
target_words = ASD.merge(autism, on='Year')
target_words = target_words.merge(asperger, on='Year')
target_words = target_words.merge(autistic, on='Year')

In [None]:
# Index by year, order the rows by year, then draw one line per stem.
target_words = target_words.set_index('Year').sort_values(by=['Year'])
target_words.plot(kind='line')
plt.show()

In [None]:
# Per-year counts for four stems describing how a diagnosis is framed
# (again given in their stemmed form).
diseas, disord, condition, syndrom = (
    track_word_over_time(no_null_texts, stem)
    for stem in ('diseas', 'disord', 'condit', 'syndrom')
)

# Line the four tables up on 'Year' so each stem becomes one column.
target_words_2 = diseas.merge(disord, on='Year')
target_words_2 = target_words_2.merge(syndrom, on='Year')
target_words_2 = target_words_2.merge(condition, on='Year')

In [None]:
# Index by year, order the rows by year, then draw one line per stem.
target_words_2 = target_words_2.set_index('Year').sort_values(by=['Year'])
target_words_2.plot(kind='line')
plt.show()

### Word frequency by session code
