# Get ready

First, install and import the required packages and set up the helpers used below. 

Then, check the file location and import the .csv files. Remove any rows with empty text fields. 

Save a data frame with all the texts and another with only those texts that mention the keywords of interest. 

In [1]:
%%capture

# installing the necessary text-mining package (nltk) via pip
# the '%%capture' at the top of this cell suppresses the output (which is normally quite long and annoying looking). 
# You can remove or comment it out if you prefer to see the output. 
!pip install nltk


In [2]:
%%capture

import os                         # os is a module for navigating your machine (e.g., file directories).
import nltk                       # nltk stands for natural language tool kit and is useful for text-mining. 
from nltk import word_tokenize    # and some of its key functions

nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.corpus import wordnet                    # Finally, things we need for lemmatising!
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
nltk.download('averaged_perceptron_tagger')        # Like a POS-tagger...
nltk.download('wordnet')
nltk.download('webtext')
from nltk.corpus import webtext

import pandas as pd
pd.set_option('display.max_colwidth', 200)
import numpy as np
import statistics
import datetime
date = datetime.date.today()

import codecs
import csv                        # csv is for importing and working with csv files

from collections import Counter

import statistics
import re                         # things we need for RegEx corrections
import matplotlib.pyplot as plt
import string 

import math 

# Things for removing punctuation, stopwords and empty strings.
# Characters deleted from tokens before counting: ASCII punctuation plus the
# en dash and typographic double quotes. The backslash is written as an
# explicit "\\" escape; the previous "\]" is an invalid escape sequence that
# newer Python versions warn about. The resulting string is unchanged.
English_punctuation = "-!\"#$%&()'*-–+,./:;<=>?@[\\]^_`{|}~''“”"
# Translation table for str.translate that removes every character above.
table_punctuation = str.maketrans('', '', English_punctuation)

In [3]:
# Show which files are present in the results folder (path is relative to the
# notebook's working directory and uses a Windows-style separator).
print(os.listdir("..\\results")  )

# Kept at module level for backward compatibility: after a call to
# import_results it holds the data frames read by that call.
files = []


def import_results(input):
    """Read every file in a directory as CSV and stack them into one frame.

    Args:
        input: Path of the directory to read. Every entry returned by
            os.listdir(input) is passed to pandas.read_csv with
            encoding='latin1'.

    Returns:
        A pandas DataFrame holding the rows of all files, concatenated in
        os.listdir order.

    Raises:
        ValueError: from pandas.concat when the directory has no entries.

    Note:
        Earlier versions appended to the module-level ``files`` list without
        ever resetting it, so a second call returned every row twice. The
        frames are now collected locally and ``files`` is overwritten
        instead of extended.
    """
    # os.path.join supplies the platform's separator instead of a
    # hard-coded '\\'.
    frames = [
        pd.read_csv(os.path.join(input, name), encoding='latin1')
        for name in os.listdir(input)
    ]
    files[:] = frames  # replace, don't extend, so repeated calls stay correct
    return pd.concat(frames)

['2004.csv', 'ESHG2001abstractICHG.csv', 'ESHG2002Abstracts.csv', 'ESHG2003Abstracts.csv', 'ESHG2005Abstracts.csv', 'ESHG2006Abstracts.csv', 'ESHG2007Abstracts.csv', 'ESHG2008Abstracts.csv', 'ESHG2009Abstracts.csv', 'ESHG2010Abstracts.csv', 'ESHG2011Abstracts.csv', 'ESHG2012Abstracts.csv', 'ESHG2013Abstracts.csv', 'ESHG2014Abstracts.csv', 'ESHG2015Abstracts.csv', 'ESHG2016Abstracts.csv', 'ESHG2017 electronic posters.csv', 'ESHG2017 oral presentation.csv', 'ESHG2018 electronic posters.csv', 'ESHG2018 oral presentation.csv', 'ESHG2019 electronic posters.csv', 'ESHG2019 oral presentation.csv', 'ESHG2020 electronic posters.csv', 'ESHG2020 oral presentation.csv', 'ESHG2021 electronic posters.csv', 'ESHG2021 oral presentation.csv']


In [None]:
# Convert the plain-text 2004 file into 'log.csv', writing one CSV row per
# non-empty input line underneath a ('title', 'intro') header.
with open('..\\..\\2022_First_analysis\\for_analysis\\ESHG\\2004.txt', 'r') as in_file:
    # Lazily strip surrounding whitespace and skip lines that end up empty.
    stripped = (line.strip() for line in in_file)
    # NOTE(review): a stripped line contains no '\n', so split("\n") yields a
    # single-element row under the two-column header -- confirm this is the
    # intended layout.
    lines = (line.split("\n") for line in stripped if line)
    # The csv module expects files it writes to be opened with newline='';
    # without it, platforms that translate '\n' to '\r\n' on write end up
    # with an extra blank row after every record.
    with open('log.csv', 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(('title', 'intro'))
        writer.writerows(lines)
        
        
#read_file.to_csv (r'Path where the CSV will be saved\File name.csv', index=None)

In [None]:
# Read every file in the results folder into one combined data frame, then
# display how many rows were loaded.
all_results = import_results("..\\results")
len(all_results)

In [None]:
# Number of imported rows per year as a two-column frame ('Year', 'Counts'),
# sorted by year.
# The counts column is named explicitly through reset_index(name=...) instead
# of renaming a column called "Year": the name pandas gives the value_counts()
# result depends on the pandas version ('Year' before 2.0, 'count' from 2.0
# on), so the rename silently did nothing on newer pandas.
all_counts_by_year = (
    all_results['Year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Counts')
    .sort_values(by=['Year'])
)

In [None]:
# Keep only the rows whose 'Text' value is present, then display how many
# rows remain.
no_null_texts = all_results.loc[all_results['Text'].notnull()]
len(no_null_texts)

In [None]:
# Number of rows with text per year as a two-column frame ('Year', 'All'),
# sorted by year.
# The counts column is named through reset_index(name=...) because the name of
# the value_counts() result differs between pandas versions ('Year' before
# 2.0, 'count' from 2.0 on), which made the old column rename a no-op on newer
# pandas.
no_null_counts_by_year = (
    no_null_texts['Year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='All')
    .sort_values(by=['Year'])
)
# The sorted frame used to be stored only under this shorter name, leaving
# no_null_counts_by_year unsorted; keep the short name as an alias.
no_null_counts = no_null_counts_by_year

In [None]:
# Select the rows whose text matches any of the keyword alternatives in the
# pattern, then display how many rows matched.
matched_texts = no_null_texts.loc[
    no_null_texts['Text'].str.contains('autis|Autis|ASD|Asperger|asperger')
]
len(matched_texts)

In [None]:
# Number of keyword-matching rows per year as a two-column frame
# ('Year', 'Matches'), sorted by year.
# The counts column is named through reset_index(name=...) because the name of
# the value_counts() result differs between pandas versions ('Year' before
# 2.0, 'count' from 2.0 on), which made the old column rename a no-op on newer
# pandas.
matched_counts_by_year = (
    matched_texts['Year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Matches')
    .sort_values(by=['Year'])
)

In [None]:
# Combine the per-year totals with the per-year keyword matches and plot both.
# The left join keeps every year present in the totals frame, including years
# that have no row in the matches frame.
counts_year = (
    pd.merge(no_null_counts_by_year, matched_counts_by_year, on='Year', how='left')
    .sort_values(by='Year')
    .set_index('Year')
)
counts_year.plot()
plt.show()

In [None]:
# Print the merged per-year table that the plot above is drawn from.
print(counts_year)

# Count word frequencies 
## Bag of words

Proceed through the 'bag of words' steps for the data frame with all texts and then again for the data frame with only the texts that mention the keywords of interest. This approach finds word frequencies for all years together. 

In [None]:
def bag_of_words_analysis(input, how_many):
    """Return the most frequent stemmed words over all texts in a data frame.

    Every value of the 'Text' column is tokenised with word_tokenize. Each
    token is lower-cased and stripped of the characters in table_punctuation;
    empty tokens, all-digit tokens and stop words are dropped; the rest are
    stemmed with the Porter stemmer and counted.

    Args:
        input: Data frame with a 'Text' column of strings.
        how_many: Number of (stem, count) pairs to return.

    Returns:
        A list of (stem, count) tuples, most frequent first, as produced by
        Counter.most_common(how_many).
    """
    # Join with a space so the last word of one text and the first word of
    # the next remain separate tokens; plain concatenation fused them into
    # one bogus word at every text boundary.
    tokens = word_tokenize(" ".join(input['Text']))
    stems = []
    for token in tokens:
        token = token.lower().translate(table_punctuation)
        # Skip tokens that were pure punctuation, plain numbers or stop words.
        if not token or token.isdigit() or token in stop_words:
            continue
        stems.append(porter.stem(token))
    return Counter(stems).most_common(how_many)

In [None]:
# Top 13 stems across all rows that have text.
bag_of_words_analysis(no_null_texts, 13)

In [None]:
# Top 13 stems across only the keyword-matching rows.
bag_of_words_analysis(matched_texts, 13)

### Word Frequency by year

In [None]:

def track_word_over_time(input, target_word):
    """Count, year by year, how often one stemmed word occurs in the texts.

    For each distinct value of the 'Year' column, the texts of that year are
    tokenised with word_tokenize. Each token is lower-cased and stripped of
    the characters in table_punctuation; empty tokens, all-digit tokens and
    stop words are dropped; the rest are Porter-stemmed. The occurrences of
    target_word among those stems are then counted.

    Args:
        input: Data frame with 'Year' and 'Text' columns.
        target_word: The word to count. It is compared against stems as-is
            (it is not stemmed here), so pass the stemmed form.

    Returns:
        A data frame with one row per year, in order of first appearance,
        with columns 'Year' and str(target_word).
    """
    years = input['Year'].drop_duplicates()
    target_counts = []
    for year in years:
        year_texts = input['Text'][input['Year'] == year]
        # Join with a space so words at the boundary between two texts are
        # not fused into a single token, as plain concatenation did.
        tokens = word_tokenize(" ".join(year_texts))
        stems = []
        for token in tokens:
            token = token.lower().translate(table_punctuation)
            # Skip pure punctuation, plain numbers and stop words.
            if not token or token.isdigit() or token in stop_words:
                continue
            stems.append(porter.stem(token))
        target_counts.append(Counter(stems)[target_word])

    target_word_by_year = pd.DataFrame(
        list(zip(years, target_counts)),
        columns=['Year', str(target_word)],
    )
    return target_word_by_year


In [None]:
# Per-year counts for three stems, then a single frame with one column per
# stem, joined on 'Year'.
gene = track_word_over_time(no_null_texts, 'gene')
autism = track_word_over_time(no_null_texts, 'autism')
mutat = track_word_over_time(no_null_texts, 'mutat')

target_words = pd.merge(
    pd.merge(gene, autism, on='Year'),
    mutat,
    on='Year',
)

In [None]:
# Use the year as the index so it becomes the x axis, then draw one line per
# stem.
target_words = target_words.set_index('Year')
target_words.plot(kind='line')
plt.show()

In [None]:
# Per-year counts for two more stems; the arguments are passed in their
# stemmed spelling.
diseas = track_word_over_time(no_null_texts, 'diseas')
disord = track_word_over_time(no_null_texts, 'disord')


In [None]:
# Per-year counts for two further stems, then a combined frame joined on
# 'Year'.
# NOTE(review): `condition` is computed here but is not part of the merge
# below -- confirm whether it was meant to be included.
condition = track_word_over_time(no_null_texts, 'condit')
syndrom = track_word_over_time(no_null_texts, 'syndrom')

target_words_2 = pd.merge(
    pd.merge(diseas, disord, on='Year'),
    syndrom,
    on='Year',
)

In [None]:
# Display the combined per-year table for the second group of stems.
target_words_2

In [None]:
# Use the year as the index of the second comparison frame and draw one line
# per stem.
# This previously indexed `target_words` instead of `target_words_2`, so it
# would have plotted the first group of stems -- and it raised KeyError
# because 'Year' had already been moved into the index of `target_words` by
# the earlier plotting cell.
target_words_2 = target_words_2.set_index('Year')
target_words_2.plot.line()
plt.show()

### Word frequency by session code
