In [1]:
import xml.etree.ElementTree as et
import os
import pandas as pd
import numpy as np

# Load Data

In [2]:
writers_path = "writers.pkl.compressed"
if os.path.exists(writers_path):
    # NOTE: unpickling can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    writers = pd.read_pickle(writers_path, compression="gzip")
else:

    def _optional_text(node, tag):
        """Return the text of child element `tag`, or None when it is absent."""
        child = node.find(tag)
        return None if child is None else child.text

    # Rows are gathered in a dict keyed by writer id and turned into a
    # DataFrame once at the end; growing the frame with `.loc` per file copies
    # it on every insert. A repeated id keeps its first position and its last
    # values, which is what repeated `.loc[id] = ...` assignment did.
    writer_rows = {}
    for root, dirs, files in os.walk("Writers/"):
        for filename in files:
            writer_xml = et.parse(os.path.join(root, filename)).getroot()

            # The id is the file name without its last four characters
            # (presumably the ".xml" extension -- confirm against the data).
            writer_id = filename[:-4]
            birth_year = _optional_text(writer_xml, "birthYear")

            writer_rows[writer_id] = {
                # Read unguarded: a file without <handedness> raises
                # AttributeError here.
                "handedness": writer_xml.find("handedness").text,
                "sex": _optional_text(writer_xml, "sex"),
                "education": _optional_text(writer_xml, "education"),
                "birth_year": None if birth_year is None else int(birth_year),
                # Filled in later, once the writepads have been read.
                "word_count": 0,
            }

    writers = pd.DataFrame(
        list(writer_rows.values()),
        index=list(writer_rows),
        columns=["handedness", "sex", "education", "birth_year", "word_count"],
    ).astype(
        {
            # Nullable integer dtype: birth_year may be missing, which a plain
            # int32 column cannot represent.
            "birth_year": "Int32",
            "word_count": "int32",
        }
    )
    writers.to_pickle(writers_path, compression="gzip")
writers


Unnamed: 0,handedness,sex,education,birth_year,word_count
007cfec5-01f1-40c8-8c99-ff5fd4076cd8,Right,Man,Bachelor,1380,0
00806490-2371-478c-a820-76bdb049075a,Right,Man,UpperSecondary,1380,0
0106d1f0-c034-43ae-9e52-6c2d47a59fe4,Left,Woman,Bachelor,1379,14
0160bf44-b78c-4fa3-9da4-14344fa52474,Right,Man,Master,1357,0
01d3e75e-096b-4d8f-a14d-48f55b8bff92,Right,Man,Bachelor,1376,61
...,...,...,...,...,...
fdade6eb-ac47-4de0-baee-c3c6b692c6b7,Right,Man,Bachelor,1378,14
fe35f6fe-4003-4f7b-8b8b-9eda43251ca3,Right,Woman,Bachelor,1379,0
fe5cace8-0e34-4079-8276-129a4aa02c9d,Right,Man,Bachelor,1370,0
fe8be55e-b3e5-437a-9e3a-1cf8f626af6a,Left,Man,Bachelor,1376,141


In [3]:
ground_truths_path = "ground_truths.pkl.compressed"
if os.path.exists(ground_truths_path):
    # NOTE: unpickling can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    ground_truths = pd.read_pickle(ground_truths_path, compression="gzip")
else:
    # text id -> number of words; converted to a DataFrame once at the end
    # instead of enlarging the frame one row at a time.
    word_count_by_text = {}
    for root, dirs, files in os.walk("GroundTruths/"):
        for filename in files:
            truth_xml = et.parse(os.path.join(root, filename)).getroot()

            for text in truth_xml.findall("text"):
                if truth_xml.attrib["type"] == "Text":
                    # `split()` with no argument splits on any whitespace run,
                    # so doubled, leading or trailing spaces and line breaks do
                    # not produce empty "words"; an empty <content> element
                    # (text is None) counts as zero words.
                    content = text.find("content").text or ""
                    word_count = len(content.split())
                else:
                    # Other ground-truth types: count <word> descendants.
                    word_count = len(text.findall(".//word"))

                word_count_by_text[int(text.attrib["id"])] = word_count

    ground_truths = pd.DataFrame(
        {"word_count": pd.Series(word_count_by_text, dtype="int32")}
    )
    ground_truths.to_pickle(ground_truths_path, compression="gzip")
ground_truths


Unnamed: 0,word_count
1,4
2,15
3,9
4,28
5,19
...,...
71875,7
71876,9
71877,8
71878,8


In [4]:
xml_namespaces = {"inkml": "http://www.w3.org/2003/InkML"}
writepads_path = "writepads.pkl.compressed"
writepads_cached = os.path.exists(writepads_path)
if writepads_cached:
    # NOTE: unpickling can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    writepads = pd.read_pickle(writepads_path, compression="gzip")
else:

    def _annotation_text(ink, field):
        """Return the text of `inkml:annotationXML/<field>` under `ink`."""
        return ink.find(f"inkml:annotationXML/{field}", xml_namespaces).text

    # Rows are gathered in a dict keyed by writepad id and turned into a
    # DataFrame once at the end instead of enlarging the frame per file.
    writepad_rows = {}
    for root, dirs, files in os.walk("Writepads/"):
        for filename in files:
            ink = et.parse(os.path.join(root, filename)).getroot()

            # The id is the file name without its last six characters
            # (presumably the ".inkml" extension -- confirm against the data).
            writepad_id = int(filename[:-6])
            writepad_rows[writepad_id] = {
                # The first nine characters of the writerId text are dropped
                # (presumably a fixed prefix -- confirm against the data).
                "writer_id": _annotation_text(ink, "writerId")[9:],
                "input": _annotation_text(ink, "input"),
                "type": _annotation_text(ink, "type"),
                "hand": _annotation_text(ink, "hand"),
                "truth_id": int(_annotation_text(ink, "truthId")),
            }

    writepads = pd.DataFrame(
        list(writepad_rows.values()),
        index=list(writepad_rows),
        columns=["writer_id", "input", "type", "hand", "truth_id"],
    ).astype({"truth_id": "int32"})
    writepads.to_pickle(writepads_path, compression="gzip")

# Per-writer word totals are derived from the complete writepad table and
# *assigned*, never accumulated with `+=`: re-running this cell, or rebuilding
# only one of the caches, can therefore not double-count words.
# `.loc` with a list raises KeyError if a truth id is missing from ground_truths.
words_per_writepad = pd.Series(
    ground_truths.loc[writepads["truth_id"].tolist(), "word_count"].to_numpy(),
    index=writepads["writer_id"].to_numpy(),
)
words_per_writer = words_per_writepad.groupby(level=0).sum()
unknown_writers = words_per_writer.index.difference(writers.index)
if len(unknown_writers) > 0:
    raise KeyError(
        f"Writepads reference writers missing from `writers`: {list(unknown_writers)[:5]}"
    )
writers["word_count"] = (
    words_per_writer.reindex(writers.index, fill_value=0).astype("int32")
)
if not writepads_cached:
    # Persist the updated totals alongside the freshly built writepad cache.
    writers.to_pickle(writers_path, compression="gzip")
writepads


Unnamed: 0,writer_id,input,type,hand,truth_id
10,92bacc45-ecd3-4cb5-9f16-9b8c176cc3a2,Mouse,Text,Right,2
1000,240d6437-ee3d-4fa4-9d1e-737b5aacf205,Touch,Text,Left,162
1001,240d6437-ee3d-4fa4-9d1e-737b5aacf205,Touch,Text,Left,163
10122,4266463d-a7ec-4186-b6b9-04168d7416be,Touch,Text,Right,1354
10123,0b732db5-efeb-4f30-9b53-0dc38b9074b3,Touch,Text,Right,1354
...,...,...,...,...,...
9974,7753c4ce-dfd5-4067-8583-bf28e5896d70,Touch,WordGroup,Right,44205
9975,7753c4ce-dfd5-4067-8583-bf28e5896d70,Touch,WordGroup,Right,44206
9976,7753c4ce-dfd5-4067-8583-bf28e5896d70,Touch,WordGroup,Right,44207
9990,0daeb39f-217b-4b9b-870d-fad81b72efdb,Pen,WordGroup,Right,44208


# Utils

In [5]:
def calculate_value_counts(data, labels, missing_label="Unknown"):
    """Format the percentage share of each label in `data` as text.

    Each label is matched against the values of `data` *by value*, so a
    category that does not occur in `data` is reported as 0% without shifting
    the percentages of the other labels.

    Args:
        data: pandas Series of categorical values; may contain missing values.
        labels: labels to report, in output order.
        missing_label: the label (if present in `labels`) that receives the
            share of missing (NaN/None) entries of `data`.

    Returns:
        One "<label>: <percent>%" line per label, joined with newlines and
        without a trailing newline. Percentages are rounded to whole numbers
        and are relative to all entries of `data`, missing ones included.
    """
    total = len(data)
    # An empty input has no shares to report; avoid dividing by zero.
    scale = 100 / total if total else 0

    # value_counts() leaves missing values out; they are counted separately.
    known_share = (data.value_counts() * scale).round().astype(int)
    share_by_value = dict(zip(known_share.index, known_share.to_numpy()))
    missing_share = int(np.round(data.isna().sum() * scale))

    lines = []
    for label in labels:
        if label == missing_label:
            value = missing_share
        else:
            value = int(share_by_value.get(label, 0))
        lines.append(f'{label}: {value}%')
    return '\n'.join(lines)

def calculate_value_counts_direct(value_counts, labels):
    """Format pre-computed counts as whole-number percentages, one per label.

    Args:
        value_counts: pandas Series of counts whose index positions
            0..len(labels)-1 correspond to `labels`. It is not modified.
        labels: labels to report, in output order.

    Returns:
        One "<label>: <percent>%" line per label, joined with newlines and
        without a trailing newline. A position missing from the index is
        reported as 0%, as is every label when the counts sum to zero.
    """
    total = np.sum(value_counts)
    # Guard the all-zero (or empty) case, which would otherwise yield NaN
    # percentages that cannot be cast to int.
    scale = 100 / total if total else 0
    # Build a new Series rather than using `*=`, which would overwrite the
    # caller's data with percentages.
    percentages = (value_counts * scale).round().astype(int)
    known_positions = percentages.index

    lines = []
    for position, label in enumerate(labels):
        value = int(percentages.loc[position]) if position in known_positions else 0
        lines.append(f'{label}: {value}%')
    return '\n'.join(lines)

# Stats

### General Counts

In [6]:
# Overall corpus size, then the subset of writers who contributed at least
# one word; the per-writer statistics below are computed on that subset.
total_word_count = writers['word_count'].sum()
print(f"Total Word Count: {total_word_count}")
has_words = writers["word_count"] > 0
valid_writers = writers[has_words]
print(f"Total Writer Count: {len(valid_writers)}")

Total Word Count: 81191
Total Writer Count: 226


### Type

In [7]:
# Share of written words per writepad type.
labels = ['Text', 'WordGroup']
word_counts = ground_truths["word_count"]
counts = dict.fromkeys(labels, 0)
# Columns are read by name (not by position in `.values`), and the loop
# variable no longer shadows the builtin `input`.
for pad_type, truth_id in zip(writepads["type"], writepads["truth_id"]):
    if pad_type in counts:
        counts[pad_type] += word_counts.loc[truth_id]
print(calculate_value_counts_direct(pd.Series([counts[label] for label in labels]), labels))

Text: 59%
WordGroup: 41%


### Input

In [8]:
# Share of written words per input device.
labels = ['Mouse', 'Pen', 'Touch', 'Touchpad', 'TouchPen']
word_counts = ground_truths["word_count"]
counts = dict.fromkeys(labels, 0)
# Columns are read by name (not by position in `.values`), and the loop
# variable no longer shadows the builtin `input`.
for input_device, truth_id in zip(writepads["input"], writepads["truth_id"]):
    if input_device in counts:
        counts[input_device] += word_counts.loc[truth_id]
print(calculate_value_counts_direct(pd.Series([counts[label] for label in labels]), labels))

Mouse: 19%
Pen: 24%
Touch: 37%
Touchpad: 2%
TouchPen: 18%


### Hand

In [9]:
# Share of written words per writing hand recorded on the writepad.
labels = ['Left', 'Right']
word_counts = ground_truths["word_count"]
counts = dict.fromkeys(labels, 0)
# Columns are read by name (not by position in `.values`), and the loop
# variable no longer shadows the builtin `input`.
for writing_hand, truth_id in zip(writepads["hand"], writepads["truth_id"]):
    if writing_hand in counts:
        counts[writing_hand] += word_counts.loc[truth_id]
print(calculate_value_counts_direct(pd.Series([counts[label] for label in labels]), labels))

Left: 12%
Right: 88%


### Handedness

In [10]:
# Percentage of contributing writers per declared handedness.
labels = sorted(['Both', 'Left', 'Right'])
handedness_summary = calculate_value_counts(valid_writers["handedness"], labels)
print(handedness_summary)

Both: 1%
Left: 14%
Right: 85%


### Sex

In [11]:
# Percentage of contributing writers per recorded sex; the trailing
# 'Unknown' entry reports writers whose sex is missing.
labels = sorted(['Man', 'Woman'])
labels.append('Unknown')
sex_summary = calculate_value_counts(valid_writers["sex"], labels)
print(sex_summary)

Man: 64%
Woman: 33%
Unknown: 3%


### Education

In [12]:
# Percentage of contributing writers per education level; the trailing
# "Unknown" entry reports writers whose education is missing ("None" is a
# regular label here, distinct from a missing value).
labels = sorted(
    [
        "Bachelor",
        "Doctoral",
        "LowerSecondary",
        "Master",
        "None",
        "Primary",
        "ShortCycleTertiary",
        "UpperSecondary",
    ]
)
labels.append("Unknown")
education_summary = calculate_value_counts(valid_writers["education"], labels)
print(education_summary)


Bachelor: 51%
Doctoral: 3%
LowerSecondary: 2%
Master: 14%
None: 1%
Primary: 1%
ShortCycleTertiary: 6%
UpperSecondary: 18%
Unknown: 3%


### Word - Writer

In [13]:
# Number of writers per word-count bucket. Each label is the lower edge of a
# half-open bucket [edge, next edge); the last bucket is open-ended.
bins = [0, 150, 300, 600, 1200, 2400]
# +inf is appended as the final edge so that the last bucket is [2400, inf)
# and the one before it is [1200, 2400). np.histogram closes only the final
# bin on the right, so histogramming up to 2400 and then adding a separate
# ">= 2400" count would count a writer with exactly 2400 words twice.
user, _ = np.histogram(
    np.asarray(valid_writers["word_count"], dtype=float), bins=bins + [np.inf]
)
hist = (user, bins)
labels = [f'{edge}' for edge in bins[:-1]] + [f'{bins[-1]}+']

result = '\n'.join(
    ['word, writer'] + [f'{label}, {count:.0f}' for label, count in zip(labels, user)]
)
print('Normal')
print(result)

# Reverse cumulative sum: writers with at least `edge` words.
writers_at_least = user[::-1].cumsum()[::-1]
result = '\n'.join(
    ['word, writer']
    + [f'{label}, {count:.0f}' for label, count in zip(labels, writers_at_least)]
)
print()
print('Aggregated')
print(result)

Normal
word, writer
0, 116
150, 17
300, 45
600, 39
1200, 6
2400+, 3

Aggregated
word, writer
0, 226
150, 110
300, 93
600, 48
1200, 9
2400+, 3


### Age - Writer

In [14]:
# Number of writers per age decade, for writers with a recorded birth year.
# 1401 is the reference year ages are computed against (presumably the Solar
# Hijri year of the analysis -- the birth years in this data are 13xx; confirm).
birth_years = valid_writers['birth_year']
year_users = 1401 - birth_years[pd.notna(birth_years)]
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
# +inf is appended as the final edge so that the last bucket is [100, inf)
# and the one before it is [90, 100). np.histogram closes only the final bin
# on the right, so histogramming up to 100 and then adding a separate
# ">= 100" count would count a writer aged exactly 100 twice.
user, _ = np.histogram(np.asarray(year_users, dtype=float), bins=bins + [np.inf])
hist = (user, bins)
labels = [f'{edge}' for edge in bins[:-1]] + [f'{bins[-1]}+']
result = 'age, writer\n' + ''.join(
    f'{label}, {count}\n' for label, count in zip(labels, user)
)
print(result)

age, writer
0, 0
10, 41
20, 134
30, 27
40, 11
50, 2
60, 1
70, 0
80, 1
90, 0
100+, 0

