In [8]:
import pandas as pd
import glob
import os
from datetime import datetime
import numpy as np
from PyPDF2 import PdfReader

In [23]:
# Step 1: PDF files (output spelling tests letterster)
# /vol/bigdata3/datasets3/dutch_child_audio/letterster/spelling/letterster_dictations/01_pdf

pdf_dir = '/vol/bigdata3/datasets3/dutch_child_audio/letterster/spelling/letterster_dictations/01_pdf'
pdf_file_list = glob.glob(os.path.join(pdf_dir, '*.pdf'))

reader = PdfReader(pdf_file_list[0])
page1 = reader.pages[0]
txt_file_page1 = page1.extract_text().split('\n')

page2 = reader.pages[1]
txt_file_page2 = page2.extract_text().split('\n')[1:]
txt_file_page2[0] = txt_file_page2[0].replace('https://app.lexipoort.nl/nl/#/products/letterster/students 2/2', '')

txt_file_page1 + txt_file_page2


['11-03-2024 11:23',
 'https://app.lexipoort.nl/nl/#/products/letterster/students 1/2Letterster toetsresultaten',
 '4 oktober 2023, 09:58',
 'Aantal hervattingen: 1',
 'Verstreken tijd: 00:17:46',
 'Woord Ingevuld antwoord Goed/fout',
 'panda pandaa Fout',
 'benieuwd deniewt Fout',
 'strooi strooj Fout',
 'moeilijk meojlek Fout',
 'drinkflessen drinkvlesen Fout',
 'gedaan gedaan Goed',
 'pijl pel Fout',
 'springt sprinkt Fout',
 'trouw traow Fout',
 'kraaien kraajen Fout',
 'bessenvla desevlaa Fout',
 'bravo praavoo Fout',
 'puree puireej Fout',
 'toch tog Fout',
 'deur deur Goed',
 'denkt denkt Goed',
 'schrijven schven Fout',
 'reuzen reusen Fout',
 'armpje armpje Goed',
 'fiets viets Fout',
 'huis huis Goed',
 'handdoek handeok Fout',
 'voorrang vorang Fout',
 'vliegt vliegt Goed',
 'schuw schuiw Fout',
 'slurf slurv Fout',
 'augurk aowgurk Fout',
 'jassen jase Fout',
 'spaarpot spaarpot Goed',
 'zout saut Fout',
 'strand strant Fout',
 'koopt koopt Goed',
 'autobank outoowdank Fout

In [4]:
# Step 2: Upload pdf to https://www.pdfforge.org/online/nl/extract-text and download as txt files
# Upload these txt files to: /vol/bigdata3/datasets3/dutch_child_audio/letterster/spelling/letterster_dictations/02_txt

txt_dir = '/vol/bigdata3/datasets3/dutch_child_audio/letterster/spelling/letterster_dictations/02_txt'
txt_file_list = glob.glob(os.path.join(txt_dir, '*.txt'))

In [5]:
# Create output directories
dictation_dir = '/vol/bigdata3/datasets3/dutch_child_audio/letterster/spelling/letterster_dictations/03_dictations'
error_cat_dir = '/vol/bigdata3/datasets3/dutch_child_audio/letterster/spelling/letterster_dictations/04_error_categories'

if not os.path.isdir(dictation_dir):
    os.makedirs(dictation_dir)

if not os.path.isdir(error_cat_dir):
    os.makedirs(error_cat_dir)

In [6]:
# Define default functions

def parseDictationString(item):
    target = item.split(' ', 1)[0]
    correct = item.split(' ', -1)[-1]
    realized = item[len(target):len(item)-len(correct)]
    return [target, realized, correct]

def parseErrorCategoryString(item):
    error_cat_rank = item.split(' ')[0]
    error_cat_nr = item.split(' ')[1]
    error_cat_description = item.split(' ', 2)[-1]
    return [error_cat_rank, error_cat_nr, error_cat_description]

def parseTxt(data, author, testID):
    filename = author + '_' + testID + '.csv'

    # Meta data
    date_exported = data[0]
    link = data[1]
    date_administration = data[3].replace(' ', '-')
    attempts = data[3]
    duration = data[4]

    filename = author + '_' + testID + '_' + date_administration + '.tsv'

    # Dictation data
    page1 = [parseDictationString(data[idx]) for idx in np.arange(7, 44, 1)]
    page2 = [parseDictationString(data[idx]) for idx in np.arange(46, 62, 1)]
    dictationDF = pd.DataFrame(page1 + page2, columns = ['target', 'realized', 'correct'])
    dictationDF.to_csv(os.path.join(dictation_dir, filename), sep= '\t')

    # Error categories
    errorCategoryDF = pd.DataFrame([parseErrorCategoryString(data[idx]) for idx in np.arange(64, 64+12, 1)], columns = ['error_rank', 'error_cat_nr', 'error_cat_description'])
    errorCategoryDF.to_csv(os.path.join(error_cat_dir, filename), sep= '\t')

# Read one txt file
txt_file = txt_file_list[0]
with open(txt_file, 'r') as f:
    data = [x[:-1] for x in f.readlines()]

print(data)

# Parse the txt file
author = 'test'
testID = 'test'
parseTxt(data, author, testID)

['11-03-2024 11:23', 'https://app.lexipoort.nl/nl/#/products/letterster/students 1/2', 'Letterster toetsresultaten', '4 oktober 2023, 09:58', 'Aantal hervattingen: 1', 'Verstreken tijd: 00:17:46', 'Woord Ingevuld antwoord Goed/fout', 'panda pandaa Fout', 'benieuwd deniewt Fout', 'strooi strooj Fout', 'moeilijk meojlek Fout', 'drinkflessen drinkvlesen Fout', 'gedaan gedaan Goed', 'pijl pel Fout', 'springt sprinkt Fout', 'trouw traow Fout', 'kraaien kraajen Fout', 'bessenvla desevlaa Fout', 'bravo praavoo Fout', 'puree puireej Fout', 'toch tog Fout', 'deur deur Goed', 'denkt denkt Goed', 'schrijven schven Fout', 'reuzen reusen Fout', 'armpje armpje Goed', 'fiets viets Fout', 'huis huis Goed', 'handdoek handeok Fout', 'voorrang vorang Fout', 'vliegt vliegt Goed', 'schuw schuiw Fout', 'slurf slurv Fout', 'augurk aowgurk Fout', 'jassen jase Fout', 'spaarpot spaarpot Goed', 'zout saut Fout', 'strand strant Fout', 'koopt koopt Goed', 'autobank outoowdank Fout', 'pleintje plantje Fout', 'prei 

In [80]:
def dateString2dateTime(datetime_str):

    # datetime_str = '09/19/22 13:55:26'

    datetime_str = datetime_str.replace('-', '/')

    datetime_object = datetime.strptime(datetime_str, '%d/%m/%y %H:%M')

    print(type(datetime_object))
    print(datetime_object)  # printed in default format

    return datetime_object

dateString2dateTime(data[0])

ValueError: time data '11/03/2024 11:23' does not match format '%d/%m/%y %H:%M'

In [32]:
dateString2dateTime('09-19-22 13:55:26')

ValueError: time data '09/19/22 13:55:26' does not match format '%d/%m/%y %H:%M'