In [93]:
#Imports
import json
import pyconll
import pandas as pd
import os
import numpy as np
import re
from tqdm.notebook import trange, tqdm
import matplotlib
import matplotlib.pyplot as plt
import openpyxl
import seaborn as sns
import math
from scipy.stats import norm

In [4]:
#Constants
JSON_PATH = "Parsed"

In [5]:
#Other necessary variables
books = {}

In [6]:
#Loading the conllus (jsons) as Dataframes

for file in os.listdir(JSON_PATH):
    #Opening json contents
    with open(JSON_PATH+"/"+file) as json_file:
        #Transform into dataframe
        df = pd.read_json(json_file)
        #Append as dict juuuuust in case we need the metadata
        #Clip at 17 as the format for the filenames are standardized
        books[file[:17]] = df


In [7]:
#Functions which extract data from a dictionary to get data on sentences

#Function that removes PUNCT
def getNoPunct(sentences: dict) -> dict:
    no_punct = {}
    #Remove the rows from each dataframe what are classified as PUNCT
    for key in sentences:
        df = sentences[key]
        no_punct[key] = df[df.upos != "PUNCT"]
    return no_punct

#Function that takes in a dictionary of [book_name, conllu_dataframe] and returns a dictionary with [book_name, sentences_dataframe]
def getTokenData(books: dict) -> dict:
    return_dict = {}
    with tqdm(range(len(books.keys())), desc="Extracting sentences...") as pbar:
        #For key-value pair in dict
        for key in books:
            #Init a new array for sentences
            sentence_dfs = []
            df = books[key]
            
            #Only care about the sentences
            for sentence in df['sentences']:
                #Add dfs created from sentences to list
                sentence_dfs.append(pd.DataFrame.from_dict(sentence['tokens']))
            #Map book_name to a dataframe from all its sentences
            sentece_df = pd.concat(sentence_dfs, ignore_index=True)
            return_dict[key]=sentece_df
            #Update pbar
            pbar.update(1)
    return return_dict

#Function which returns a dictionary [book_name, lemma_freq_pivot_table]
def getLemmaFrequencies(sentences: dict) -> dict:
    lemma_freqs = {}
    for key in sentences:
        lemma_freqs[key] = getLemmaFreq(sentences[key])
    return lemma_freqs

#Return dataframe with lemmas in descending order (ignoring PUNCT and non alnums)
def getLemmaFreq(df: pd.DataFrame) -> pd.DataFrame:
    #Get rid of PUNCT
    no_punct = df[df.upos != "PUNCT"].copy()
    #Make all into strings
    no_punct['lemma'] = no_punct['lemma'].apply(lambda x: str(x))
    #Remove non-alnums
    no_punct['lemma'] = no_punct['lemma'].apply(lambda x: ''.join(filter(str.isalnum, x)))
    #Filter rows with nothing in them
    no_punct = no_punct[no_punct.text != '']
    #Return a pivot_table turned DataFrame that counts the occurances of each lemma and sorts them in descending order
    return pd.DataFrame.pivot_table(no_punct, columns='lemma', aggfunc='size').sort_values(ascending=False).reset_index().rename(columns={0: "frequency"})

#Get frequencies of words (not PUNCT, all to lowercase, and removed all non alnums)
def getWordFrequencies(sentences: dict) -> dict:
    word_freqs = {}
    for key in sentences:
        df = sentences[key]
        #Get rid of PUNCT
        no_punct = df[df.upos != "PUNCT"].copy()
        #Make words lowercase
        no_punct['text'] = no_punct['text'].apply(lambda x: x.lower())
        #Remove non-alnums
        no_punct['text'] = no_punct['text'].apply(lambda x: ''.join(filter(str.isalnum, x)))
        #Filter rows with nothing in them
        no_punct = no_punct[no_punct.text != '']
        #Map book_name to pivot table
        word_freqs[key] = pd.DataFrame.pivot_table(no_punct, columns='text', aggfunc='size').sort_values(ascending=False).reset_index().rename(columns={0: "frequency"})
    return word_freqs

#Get amount of non-PUNCT tokens in sentences
def getWordAmounts(sentences: dict) -> dict:
    word_amounts = {}
    for key in sentences:
        df = sentences[key]
        #Get rid of PUNCT
        no_punct = df[df.upos != "PUNCT"]
        word_amounts[key] = len(no_punct)
    return word_amounts

#Get PoS frequencies
def getPOSFrequencies(sentences: dict) -> dict:
    pos_freqs = {}

    for key in sentences:
        #Map book_name to pivot table
        pos_freqs[key] = pd.DataFrame.pivot_table(sentences[key], columns='upos', aggfunc='size').sort_values(ascending=False).reset_index().rename(columns={0: "frequency"})

    return pos_freqs

In [17]:
#Functions to get metrics from sentences

#Function the get the average length of the unique lemmas in the sentenes
def getAvgLen(data: dict, column: str) -> dict:
    avg_lens = {}
    for key in data:
        i = 1
        total_len = 0
        df = data[key]
        #For each lemma count the length and add one to counter
        for lemma in df[column]:
            #Only care about strings
            if type(lemma) is str:
                total_len += len(lemma)
                i += 1
        #If no lemmas were found (should never happen but just in case), we make the avg_len be 0
        if i==1:
            avg_lens[key] = 0
        else:
            #Map book_name to avg lemma length
            avg_lens[key] = total_len/(i-1.0)
    return avg_lens

#Function to calculate DP (deviation of proportions) of all the words in the corpus
def getDP(v: dict, f_df: pd.DataFrame, s: dict) -> pd.DataFrame:

    #First get the minimum s
    min_s = 1
    for key in s:
        if s[key] < min_s:
            min_s = s[key]
    #For corpus parts that are length 1
    if min_s == 1:
        min_s = 0

    v_series = {}
    #Transform v into more usable form
    for key in v:
        v_df = v[key]
        ser = v_df[v_df.columns[1]]
        ser.index = v_df[v_df.columns[0]]
        v_series[key] = ser
    
    texts = []
    DP = []
    DP_norm = []
    with tqdm(range(len(f_df.index)), desc="DP calculations") as pbar:

        #Loop through every single word in the corpus
        for k in range(len(f_df.index)):
            #Get the freq of the word in the whole corpus
            word = f_df.iloc[k, 0]
            f = f_df.iloc[k, 1]
            abs_sum = 0
            #For each document in the corpus
            for key in v_series:
                #Freq of word in document. Set to 0 if not found
                v_i = 0
                try:
                    v_i = v_series[key].loc[word]*1.0
                except:
                    v_i = 0.0
                #Comparative size of document to whole corpus
                s_i = s[key]
                #Calculate the abs_sum used in calculating DP as written by Gries [2020]
                abs_sum += abs(((v_i)/f)-s_i)
            #Append word to list
            texts.append(word)
            #Calculate and append DP
            dp = 0.5*abs_sum
            DP.append(dp)
            #Append DP_norm to list (alltho with how many documents we have, the normalization doesn't work very well at all)
            DP_norm.append(dp/(1-min_s))
            #Update pbar
            pbar.update(1)
    return pd.DataFrame({'text': texts, 'DP': DP, 'DP_norm': DP_norm})

#Function to get contextual diversity
def getCD(v: dict):
    #Get number of books
    books_num = len(v.keys())
    word_series = []
    #For each dataframe attached to a book, look for a frequency list and gather all the words in a list
    for key in v:
        v_df = v[key]
        word_series.append(v_df[v_df.columns[0]])
    #Add all words to a new dataframe
    series = pd.concat(word_series, ignore_index=True)
    #Create pivot table to count in how many books does a word appear in
    CD_raw = series.value_counts()
    #Return Contextual Diversity by dividing the number of appearances by the total number of books
    return CD_raw.apply(lambda x: x/books_num)

In [9]:
#Functions to do with sub-corpora

#Simple function to get sub_corpora from the whole package based on the target age group
#Naming conventions are ISBN_age-group_register, where age-group is an int [1,3]
def getSubCorp(corp: dict, num: int) -> dict:
    sub_corp = {}
    for key in corp:
        if key.find('_'+str(num)+'_') != -1:
            sub_corp[key] = corp[key]
    return sub_corp

#Combine sub-corp dicts into one dict
def combineSubCorpDicts(corps: list) -> dict:
    whole = corps[0].copy()
    for i in range(1, len(corps)):
        whole.update(corps[i])
    return whole

#Takes in a list of dataframes (or series) and combines them together
def combineSubCorpsData(corps: list):
    dfs = []
    for df in corps:
        dfs.append(df)
    combined = pd.concat(dfs)
    if type(combined) is pd.DataFrame:
        return combined.groupby(combined.columns[0])[combined.columns[1]].sum().reset_index()
    else:
        return combined.groupby(level=0).sum().reset_index()

In [10]:
#Move to working with just sentence data
#Whole corpus
sentences = getTokenData(books)

Extracting sentences...:   0%|          | 0/56 [00:00<?, ?it/s]

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x77dfb8b4a830>>
Traceback (most recent call last):
  File "/home/tenojo/miniconda3/envs/Test/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [11]:
#Subcorpora based on the target age groups
sentences_1 = getSubCorp(sentences, 1)
sentences_2 = getSubCorp(sentences, 2)
sentences_3 = getSubCorp(sentences, 3)

#Versions of sentences for more meaningful data
sentences_no_punct_1 = getNoPunct(sentences_1)
sentences_no_punct_2 = getNoPunct(sentences_2)
sentences_no_punct_3 = getNoPunct(sentences_3)
sentences_no_punct = combineSubCorpDicts([sentences_no_punct_1, sentences_no_punct_2, sentences_no_punct_3])

In [12]:
#Count lemma frequencies

lemma_freqs_1 = getLemmaFrequencies(sentences_1)
lemma_freqs_2 = getLemmaFrequencies(sentences_2)
lemma_freqs_3 = getLemmaFrequencies(sentences_3)

lemma_freqs = combineSubCorpDicts([lemma_freqs_1, lemma_freqs_2, lemma_freqs_3])

#Count word frequencies

word_freqs_1 = getWordFrequencies(sentences_1)
word_freqs_2 = getWordFrequencies(sentences_2)
word_freqs_3 = getWordFrequencies(sentences_3)

word_freqs = combineSubCorpDicts([word_freqs_1, word_freqs_2, word_freqs_3])

#Just for interest's sake, info on how many tokens (non-punct) are in each book

word_amounts_1 = getWordAmounts(sentences_1)
word_amounts_2 = getWordAmounts(sentences_2)
word_amounts_3 = getWordAmounts(sentences_3)

word_amounts = combineSubCorpDicts([word_amounts_1, word_amounts_2, word_amounts_3])

In [13]:
#Count the average uniq lemma lengths
avg_uniq_lemma_lens_1 = getAvgLen(lemma_freqs_1, 'lemma')
avg_uniq_lemma_lens_2 = getAvgLen(lemma_freqs_2, 'lemma')
avg_uniq_lemma_lens_3 = getAvgLen(lemma_freqs_3, 'lemma')
avg_uniq_lemma_lens = getAvgLen(lemma_freqs, 'lemma')
#print(avg_uniq_lemma_lens)

#Count the average uniq word lengths
avg_uniq_word_lens_1 = getAvgLen(word_freqs_1, 'text')
avg_uniq_word_lens_2 = getAvgLen(word_freqs_2, 'text')
avg_uniq_word_lens_3 = getAvgLen(word_freqs_3, 'text')
avg_uniq_word_lens = getAvgLen(word_freqs, 'text')
#print(avg_uniq_word_lens)

#Count the average lemma lengths
avg_lemma_lens_1 = getAvgLen(sentences_no_punct_1, 'lemma')
avg_lemma_lens_2 = getAvgLen(sentences_no_punct_2, 'lemma')
avg_lemma_lens_3 = getAvgLen(sentences_no_punct_3, 'lemma')
avg_lemma_lens = getAvgLen(sentences_no_punct, 'lemma')
#print(avg_lemma_lens)

#Count the average word lengths
avg_word_lens_1 = getAvgLen(sentences_no_punct_1, 'text')
avg_word_lens_2 = getAvgLen(sentences_no_punct_2, 'text')
avg_word_lens_3 = getAvgLen(sentences_no_punct_3, 'text')
avg_word_lens = getAvgLen(sentences_no_punct, 'text')
#print(avg_word_lens)


#Combining results into dfs

avg_uniq_lens_df_1 = pd.DataFrame.from_dict([avg_uniq_lemma_lens_1, avg_uniq_word_lens_1]).transpose().rename(columns={0: 'Unique lemmas avg length', 1: 'Unique words avg length'})
avg_uniq_lens_df_2 = pd.DataFrame.from_dict([avg_uniq_lemma_lens_2, avg_uniq_word_lens_2]).transpose().rename(columns={0: 'Unique lemmas avg length', 1: 'Unique words avg length'})
avg_uniq_lens_df_3 = pd.DataFrame.from_dict([avg_uniq_lemma_lens_3, avg_uniq_word_lens_3]).transpose().rename(columns={0: 'Unique lemmas avg length', 1: 'Unique words avg length'})
avg_uniq_lens_df = pd.DataFrame.from_dict([avg_uniq_lemma_lens, avg_uniq_word_lens]).transpose().rename(columns={0: 'Unique lemmas avg length', 1: 'Unique words avg length'})


avg_lens_df_1 = pd.DataFrame.from_dict([avg_lemma_lens_1, avg_word_lens_1]).transpose().rename(columns={0: 'All lemmas avg length', 1: 'All words avg length'})
avg_lens_df_2 = pd.DataFrame.from_dict([avg_lemma_lens_2, avg_word_lens_2]).transpose().rename(columns={0: 'All lemmas avg length', 1: 'All words avg length'})
avg_lens_df_3 = pd.DataFrame.from_dict([avg_lemma_lens_3, avg_word_lens_3]).transpose().rename(columns={0: 'All lemmas avg length', 1: 'All words avg length'})
avg_lens_df = pd.DataFrame.from_dict([avg_lemma_lens, avg_word_lens]).transpose().rename(columns={0: 'Unique lemmas avg length', 1: 'Unique words avg length'})

In [14]:
#Functions for getting values for different variables used in metrics

#Function for getting the total length of the corpus
def getL(word_amounts: dict) -> int:
    l = 0
    for key in word_amounts:
        l += word_amounts[key]
    return l
#Function for getting how big each part is in relation to the total size of the corpus
def getS(word_amounts: dict, l: int) -> dict:
    s = {}
    for key in word_amounts:
        s[key] = (word_amounts[key]*1.0)/l
    return s

#Get the total frequencies of passed freq_data in the corpus
def getTotal(freq_data: dict) -> pd.DataFrame:
    dfs = []
    #Add all dataframes to list
    for key in freq_data:
        dfs.append(freq_data[key])
    #Concat all dataframes together
    df = pd.concat(dfs, ignore_index=True)
    #Return a dataframe containing text in one column and total freq in collection in the other
    return df.groupby(df.columns[0])['frequency'].sum().reset_index()

In [18]:
#Constants to be used in different measures

#The length of the corpus in words (no PUNCT)
l_1 = getL(word_amounts_1)
l_2 = getL(word_amounts_2)
l_3 = getL(word_amounts_3)
l = l_1+l_2+l_3
#The length of the corpus in parts
n = len(sentences.keys())
#The percentages of the n corpus part sizes
s_1 = getS(word_amounts_1, l_1)
s_2 = getS(word_amounts_2, l_2)
s_3 = getS(word_amounts_3, l_3)
s = getS(word_amounts, l)
#The overall frequencies of words in corpus
f_words_1 = getTotal(word_freqs_1)
f_words_2 = getTotal(word_freqs_2)
f_words_3 = getTotal(word_freqs_3)
f_words = getTotal(word_freqs)
#The overall frequencies of lemmas in corpus
f_lemmas_1 = getTotal(lemma_freqs_1)
f_lemmas_2 = getTotal(lemma_freqs_2)
f_lemmas_3 = getTotal(lemma_freqs_3)
f_lemmas = getTotal(lemma_freqs)
#The frequencies of words in each corpus part
v_words = word_freqs
#The frequencies of lemmas in each corpus part
v_lemmas = lemma_freqs

#print(f_words.sort_values(by='frequency', ascending=False))
#print(f_lemmas.sort_values(by='frequency', ascending=False))

In [19]:
#Whole corpus
lemma_DP = getDP(v_lemmas, f_lemmas, s)
#Sub-corpora
lemma_DP_1 = getDP(lemma_freqs_1, f_lemmas_1, s_1)
lemma_DP_2 = getDP(lemma_freqs_2, f_lemmas_2, s_2)
lemma_DP_3 = getDP(lemma_freqs_3, f_lemmas_3, s_3)
#Whole corpus
word_DP = getDP(v_words, f_words, s)
#Sub-corpora
word_DP_1 = getDP(word_freqs_1, f_words_1, s_1)
word_DP_2 = getDP(word_freqs_2, f_words_2, s_2)
word_DP_3 = getDP(word_freqs_3, f_words_3, s_3)

DP calculations:   0%|          | 0/87030 [00:00<?, ?it/s]

DP calculations:   0%|          | 0/9410 [00:00<?, ?it/s]

DP calculations:   0%|          | 0/38232 [00:00<?, ?it/s]

DP calculations:   0%|          | 0/67062 [00:00<?, ?it/s]

DP calculations:   0%|          | 0/202732 [00:00<?, ?it/s]

DP calculations:   0%|          | 0/18650 [00:00<?, ?it/s]

DP calculations:   0%|          | 0/87211 [00:00<?, ?it/s]

DP calculations:   0%|          | 0/156177 [00:00<?, ?it/s]

In [64]:
#Testing outputs
test = 0
for key in s:
    test += s[key]
#print(test)

#with pd.ExcelWriter("Data/lemma_DP.xlsx") as writer:
#    lemma_DP.to_excel(writer)

#lemma_DP['frequency'] = f_lemmas['frequency']

#lemma_DP = lemma_DP.drop(columns=['frequency'])

#print(lemma_DP.sort_values(by='DP', ascending=True))
#ax = lemma_DP.sort_values(by='DP_norm').plot.hist(bins=100)

#print(len(lemma_DP[lemma_DP['DP_norm']>0.9]))

In [20]:
#Getting CD

#Whole corpus
word_CD = getCD(v_words)
#Sub-corpora
word_CD_1 = getCD(word_freqs_1)
word_CD_2 = getCD(word_freqs_2)
word_CD_3 = getCD(word_freqs_3)

#Whole corpus
lemma_CD = getCD(v_lemmas)
#Sub-corpora
lemma_CD_1 = getCD(lemma_freqs_1)
lemma_CD_2 = getCD(lemma_freqs_2)
lemma_CD_3 = getCD(lemma_freqs_3)

#print(word_CD)

#print(lemma_CD[lemma_CD < 0.05].count())

In [21]:
#Get POS frequencies

#Count POS frequencies

pos_freqs_per_book = getPOSFrequencies(sentences)

pos_freqs_1 = getTotal(getSubCorp(pos_freqs_per_book, 1))
pos_freqs_2 = getTotal(getSubCorp(pos_freqs_per_book, 2))
pos_freqs_3 = getTotal(getSubCorp(pos_freqs_per_book, 3))

pos_freqs_corpus = getTotal(pos_freqs_per_book)

#print(pos_freqs_corpus.sort_values(by='frequency', ascending=False))

In [22]:
#Writing all data into one big xlsx-file
def writeDataToXlsx(name, f_words, f_lemmas, pos_freqs, lemma_DP, word_DP, lemma_CD, word_CD, avg_uniq_lens_df, avg_lens_df):
    with pd.ExcelWriter("Data/"+name+".xlsx") as writer:
        f_words.to_excel(writer, sheet_name="Word frequencies")
        f_lemmas.to_excel(writer, sheet_name="Lemma frequencies")
        pos_freqs.to_excel(writer, sheet_name="POS frequencies")
        lemma_DP.to_excel(writer, sheet_name="Lemma dispersion")
        word_DP.to_excel(writer, sheet_name="Word dispersion")
        lemma_CD.to_excel(writer, sheet_name="Lemma contextual diversity")
        word_CD.to_excel(writer, sheet_name="Word contextual diversity")
        avg_uniq_lens_df.to_excel(writer, sheet_name="Average unique lengths by book")
        avg_lens_df.to_excel(writer, sheet_name="Average lengths by book")

In [23]:
#Commencing the writing part
#writeDataToXlsx("Initial56_whole", f_words, f_lemmas, pos_freqs_corpus, lemma_DP, word_DP, lemma_CD, word_CD, avg_uniq_lens_df, avg_lens_df)
#writeDataToXlsx("Initial56_1", f_words_1, f_lemmas_1, pos_freqs_1, lemma_DP_1, word_DP_1, lemma_CD_1, word_CD_1, avg_uniq_lens_df_1, avg_lens_df_1)
#writeDataToXlsx("Initial56_2", f_words_2, f_lemmas_2, pos_freqs_2, lemma_DP_2, word_DP_2, lemma_CD_2, word_CD_2, avg_uniq_lens_df_2, avg_lens_df_2)
#writeDataToXlsx("Initial56_3", f_words_3, f_lemmas_3, pos_freqs_3, lemma_DP_3, word_DP_3, lemma_CD_3, word_CD_3, avg_uniq_lens_df_3, avg_lens_df_3)

In [None]:
#Sorting out some differences and distributions

# zipf = log10((*word_freq*/10000000+*1*)/(*total_token_count*/10000000 + *unique_words*/10000000))+3.0



#print(f_lemmas)

#t = (f_lemmas.frequency).apply(lambda x: math.log10(x/1000000))

#m,k = norm.fit(f_lemmas.frequency)

#log_l = np.log(np.prod(norm.pdf(f_lemmas.frequency, m, s)))

#print(log_l)


#ax = plt.hist(lemma_DP.DP_norm, range=[0, 1])

TypeError: unsupported operand type(s) for /: 'float' and 'dict'