# Example code for DigiDiaDem Dataset

This notebook contains example code for working with the dataset and reproducing the experiments described in [The DigiDiaDem Speech-Cognitive Dataset: Initial Experiments on Detecting Cognitive Impairments from Speech](https://example.com).

The dataset used in this notebook is publicly available [here](http://hdl.handle.net/11234/1-6043).

In [None]:
# import all the required packages and objects
import json
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier

In [2]:
# download the dataset bundle and unpack it (outer archive first, then the
# inner DigiDiademSpeechCognitiveDataset.zip that it contains)
#   -f  fail on HTTP errors instead of saving an error page as allzip.zip
#   -L  follow redirects issued by the repository server
# the URL is quoted so that the shell never treats `?` as a glob character
!curl -fL -o allzip.zip "https://lindat.mff.cuni.cz/repository/server/api/core/items/f4db9410-a3e9-4d8f-81ab-7c7dea4206bf/allzip?handleId=11234/1-6043"
!unzip -o allzip.zip
!unzip -o DigiDiademSpeechCognitiveDataset.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1059k    0 1059k    0     0  9841k      0 --:--:-- --:--:-- --:--:-- 9899k
Archive:  allzip.zip
  inflating: DigiDiademSpeechCognitiveDataset.zip  
  inflating: DigiDiaDemSpeechCognitiveDataset.md  
Archive:  DigiDiademSpeechCognitiveDataset.zip
  inflating: ddd.yaml                
  inflating: expert_features_zipformer_lm-extra06.json  
  inflating: expert_scores.json      
  inflating: metadata_20251031.json  
  inflating: recordings_20251031.json  
  inflating: sessions_20251031.json  
  inflating: test_20251031.json      
  inflating: train_20251031.json     
  inflating: transcriptions_annotation_20251031.json  
  inflating: transcriptions_zipformer_20251031.json  
  inflating: transcriptions_zipformer_lm-extra06_20251031.json  


In [71]:
# pre-load all of the required data for dataset construction

def _load_json(path):
    """Parse the JSON file at `path` and return the decoded Python object."""
    # JSON text is UTF-8 by specification; pin the encoding so that loading
    # does not depend on the platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# per-screening metadata, indexed by screening id (get_xy reads the labels from it)
meta = pd.DataFrame(_load_json("metadata_20251031.json")).set_index("screening_id")
# expert features, indexed by screening id as well
features_raw = pd.DataFrame(_load_json("expert_features_zipformer_lm-extra06.json")).set_index("screening_id")
# screening ids that define the train / test split (presumably plain lists of ids)
train_ids = _load_json("train_20251031.json")
test_ids = _load_json("test_20251031.json")

In [73]:
# define helper functions

def get_xy(feature_names, target):
    """Build train/test feature matrices and label vectors for one experiment.

    Reads the notebook-level globals ``meta``, ``features_raw``, ``train_ids``
    and ``test_ids``; none of them is modified.

    Parameters
    ----------
    feature_names : list of str
        Columns of ``features_raw`` to keep as model inputs.
    target : {"0vs23", "0vs1vs2vs3"}
        Labelling scheme applied to ``meta["kobar_kategorizace_definitivni"]``.
        "0vs23" maps category 0 to class 0 and categories 2 and 3 to class 1
        (rows with any other category are dropped); "0vs1vs2vs3" keeps the
        four categories 0-3 unchanged.

    Returns
    -------
    x_train, y_train, x_test, y_test
        Feature frames and label series restricted to ``train_ids`` /
        ``test_ids``, sorted by screening id. Each ``x``/``y`` pair covers
        exactly the same screening ids.

    Raises
    ------
    KeyError
        If ``target`` is not a known scheme, or a requested feature column
        does not exist.
    """
    target_maps = {"0vs23": {0: 0, 2: 1, 3: 1}, "0vs1vs2vs3": {0: 0, 1: 1, 2: 2, 3: 3}}
    if target not in target_maps:
        raise KeyError(f"unknown target {target!r}; expected one of {sorted(target_maps)}")
    label_map = target_maps[target]

    # keep only rows whose category takes part in this scheme, then recode
    labels = meta["kobar_kategorizace_definitivni"]
    targets = labels[labels.isin(list(label_map))].map(label_map)

    # Restrict BOTH sides to screenings that have a label and a feature row.
    # Filtering only the features would leave labels without a matching row
    # and make x and y differ in length.
    common_ids = features_raw.index.intersection(targets.index)
    features = features_raw.loc[common_ids].sort_index()
    targets = targets.loc[common_ids].sort_index()

    x_train = features.loc[features.index.isin(train_ids)][feature_names]
    x_test = features.loc[features.index.isin(test_ids)][feature_names]
    y_train = targets.loc[targets.index.isin(train_ids)]
    y_test = targets.loc[targets.index.isin(test_ids)]
    return x_train, y_train, x_test, y_test

def train_and_eval_logreg(x_train, y_train, x_test, y_test):
    """Fit an impute -> scale -> logistic-regression pipeline and score it.

    The pipeline (mean imputation, standard scaling, liblinear logistic
    regression with a fixed random state) is fitted on ``(x_train, y_train)``;
    its predictions for ``x_test`` are compared against ``y_test``.

    Returns the text of ``classification_report``, with zero-division cases
    reported as 0.0.
    """
    steps = [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(random_state=42, max_iter=1000, solver="liblinear")),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(x_train, y_train)
    y_pred = pipeline.predict(x_test)
    return classification_report(y_test, y_pred, zero_division=0.0)

def train_and_eval_histgradboost(x_train, y_train, x_test, y_test):
    """Fit a histogram gradient boosting classifier and score it.

    The classifier (fixed random state, early stopping disabled) is fitted
    directly on ``(x_train, y_train)`` with no imputation or scaling step;
    its predictions for ``x_test`` are compared against ``y_test``.

    Returns the text of ``classification_report``, with zero-division cases
    reported as 0.0.
    """
    classifier = HistGradientBoostingClassifier(random_state=42, early_stopping=False)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    return classification_report(y_test, y_pred, zero_division=0.0)

def sep():
    """Print a visual separator: 80 '=' characters followed by a blank line."""
    print("=" * 80, end="\n\n")

In [48]:
###############################################################
#                           TASK 01                           #
###############################################################

# feature columns that enter this experiment
task_features = ["expertFeatures_1_task1 Correctly repeated numbers"]

# train/test inputs and labels for the binary "0vs23" target
# (category 0 -> class 0, categories 2 and 3 -> class 1)
x_train, y_train, x_test, y_test = get_xy(feature_names=task_features, target="0vs23")

# peek at the first rows of the training inputs
print("[Task 01] training data (sample):")
display(x_train.head())
sep()

# LOGISTIC REGRESSION: fit on the train split, report on the test split
report = train_and_eval_logreg(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 01] classification report (Logistic Regression)")
print(report)
sep()

# HISTOGRAM GRADIENT BOOSTING: same splits, same protocol
report = train_and_eval_histgradboost(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 01] classification report (Histogram Gradient Boosting)")
print(report)

[Task 01] training data (sample):


Unnamed: 0_level_0,expertFeatures_1_task1 Correctly repeated numbers
screening_id,Unnamed: 1_level_1
scr-29XBdG3UN32Lm22zGeq9AV,9.0
scr-2CMKXBDSreZoAkLDhcK6a8,9.0
scr-2UvzBxvgymGYCpdn2VnLUN,9.0
scr-2XfEpW35qx7wMfVFxJ9Tdp,9.0
scr-2Xn65aWeAY3yHaAvy7Hd9y,9.0



[Task 01] classification report (Logistic Regression)
              precision    recall  f1-score   support

           0       0.70      1.00      0.82        42
           1       0.00      0.00      0.00        18

    accuracy                           0.70        60
   macro avg       0.35      0.50      0.41        60
weighted avg       0.49      0.70      0.58        60


[Task 01] classification report (Histogram Gradient Boosting)
              precision    recall  f1-score   support

           0       0.71      0.98      0.82        42
           1       0.50      0.06      0.10        18

    accuracy                           0.70        60
   macro avg       0.60      0.52      0.46        60
weighted avg       0.64      0.70      0.60        60



In [59]:
###############################################################
#                           TASK 02                           #
###############################################################

# feature columns that enter this experiment
task_features = ["expertFeatures_2_task2 Correctly repeated characters"]

# train/test inputs and labels for the binary "0vs23" target
# (category 0 -> class 0, categories 2 and 3 -> class 1)
x_train, y_train, x_test, y_test = get_xy(feature_names=task_features, target="0vs23")

# peek at the first rows of the training inputs
print("[Task 02] training data (sample):")
display(x_train.head())
sep()

# LOGISTIC REGRESSION: fit on the train split, report on the test split
report = train_and_eval_logreg(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 02] classification report (Logistic Regression)")
print(report)
sep()

# HISTOGRAM GRADIENT BOOSTING: same splits, same protocol
report = train_and_eval_histgradboost(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 02] classification report (Histogram Gradient Boosting)")
print(report)

[Task 02] training data (sample):


Unnamed: 0_level_0,expertFeatures_2_task2 Correctly repeated characters
screening_id,Unnamed: 1_level_1
scr-29XBdG3UN32Lm22zGeq9AV,4.0
scr-2CMKXBDSreZoAkLDhcK6a8,4.0
scr-2UvzBxvgymGYCpdn2VnLUN,4.0
scr-2XfEpW35qx7wMfVFxJ9Tdp,4.0
scr-2Xn65aWeAY3yHaAvy7Hd9y,3.0



[Task 02] classification report (Logistic Regression)
              precision    recall  f1-score   support

           0       0.70      1.00      0.82        42
           1       0.00      0.00      0.00        18

    accuracy                           0.70        60
   macro avg       0.35      0.50      0.41        60
weighted avg       0.49      0.70      0.58        60


[Task 02] classification report (Histogram Gradient Boosting)
              precision    recall  f1-score   support

           0       0.70      1.00      0.82        42
           1       0.00      0.00      0.00        18

    accuracy                           0.70        60
   macro avg       0.35      0.50      0.41        60
weighted avg       0.49      0.70      0.58        60



In [60]:
###############################################################
#                           TASK 03                           #
###############################################################

# feature columns that enter this experiment
task_features = [
    "expertFeatures_3_task3 Character match ratio",
    "expertFeatures_4_task3 Correctly repeated words",
]

# train/test inputs and labels for the binary "0vs23" target
# (category 0 -> class 0, categories 2 and 3 -> class 1)
x_train, y_train, x_test, y_test = get_xy(feature_names=task_features, target="0vs23")

# peek at the first rows of the training inputs
print("[Task 03] training data (sample):")
display(x_train.head())
sep()

# LOGISTIC REGRESSION: fit on the train split, report on the test split
report = train_and_eval_logreg(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 03] classification report (Logistic Regression)")
print(report)
sep()

# HISTOGRAM GRADIENT BOOSTING: same splits, same protocol
report = train_and_eval_histgradboost(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 03] classification report (Histogram Gradient Boosting)")
print(report)

[Task 03] training data (sample):


Unnamed: 0_level_0,expertFeatures_3_task3 Character match ratio,expertFeatures_4_task3 Correctly repeated words
screening_id,Unnamed: 1_level_1,Unnamed: 2_level_1
scr-29XBdG3UN32Lm22zGeq9AV,0.449,6.0
scr-2CMKXBDSreZoAkLDhcK6a8,0.9434,9.0
scr-2UvzBxvgymGYCpdn2VnLUN,0.4227,10.0
scr-2XfEpW35qx7wMfVFxJ9Tdp,0.6792,9.0
scr-2Xn65aWeAY3yHaAvy7Hd9y,1.0,10.0



[Task 03] classification report (Logistic Regression)
              precision    recall  f1-score   support

           0       0.79      0.88      0.83        42
           1       0.62      0.44      0.52        18

    accuracy                           0.75        60
   macro avg       0.70      0.66      0.67        60
weighted avg       0.74      0.75      0.74        60


[Task 03] classification report (Histogram Gradient Boosting)
              precision    recall  f1-score   support

           0       0.86      0.90      0.88        42
           1       0.75      0.67      0.71        18

    accuracy                           0.83        60
   macro avg       0.81      0.79      0.79        60
weighted avg       0.83      0.83      0.83        60



In [61]:
###############################################################
#                           TASK 04m                          #
###############################################################

# feature columns that enter this experiment
task_features = [
    "expertFeatures_5_task4 Sentence count",
    "expertFeatures_6_task4 First person verb proportion",
    "expertFeatures_7_task4 Meaningful words ratio",
    "expertFeatures_8_task4 Pronoun to noun ratio",
    "expertFeatures_9_task4 Count of repeated meaningful words",
    "expertFeatures_10_task4 Unique words to total words",
]

# train/test inputs and labels for the binary "0vs23" target
# (category 0 -> class 0, categories 2 and 3 -> class 1)
x_train, y_train, x_test, y_test = get_xy(feature_names=task_features, target="0vs23")

# peek at the first rows of the training inputs
print("[Task 04m] training data (sample):")
display(x_train.head())
sep()

# LOGISTIC REGRESSION: fit on the train split, report on the test split
report = train_and_eval_logreg(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 04m] classification report (Logistic Regression)")
print(report)
sep()

# HISTOGRAM GRADIENT BOOSTING: same splits, same protocol
report = train_and_eval_histgradboost(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 04m] classification report (Histogram Gradient Boosting)")
print(report)

[Task 04m] training data (sample):


Unnamed: 0_level_0,expertFeatures_5_task4 Sentence count,expertFeatures_6_task4 First person verb proportion,expertFeatures_7_task4 Meaningful words ratio,expertFeatures_8_task4 Pronoun to noun ratio,expertFeatures_9_task4 Count of repeated meaningful words,expertFeatures_10_task4 Unique words to total words
screening_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
scr-29XBdG3UN32Lm22zGeq9AV,6.0,0.0,0.57377,0.173913,6.0,0.769231
scr-2CMKXBDSreZoAkLDhcK6a8,3.0,0.166667,0.647059,0.071429,3.0,0.896552
scr-2UvzBxvgymGYCpdn2VnLUN,10.0,0.0,0.59596,0.028571,13.0,0.638554
scr-2XfEpW35qx7wMfVFxJ9Tdp,15.0,0.043478,0.587302,0.242424,17.0,0.615385
scr-2Xn65aWeAY3yHaAvy7Hd9y,15.0,0.0,0.538462,0.217391,8.0,0.836066



[Task 04m] classification report (Logistic Regression)
              precision    recall  f1-score   support

           0       0.78      0.86      0.82        42
           1       0.57      0.44      0.50        18

    accuracy                           0.73        60
   macro avg       0.68      0.65      0.66        60
weighted avg       0.72      0.73      0.72        60


[Task 04m] classification report (Histogram Gradient Boosting)
              precision    recall  f1-score   support

           0       0.79      0.79      0.79        42
           1       0.50      0.50      0.50        18

    accuracy                           0.70        60
   macro avg       0.64      0.64      0.64        60
weighted avg       0.70      0.70      0.70        60



In [62]:
###############################################################
#                           TASK 04s                          #
###############################################################

# feature columns that enter this experiment
task_features = [
    "expertFeatures_11_task4 Named object count",
    "expertFeatures_12_task4 Described object relation count",
    "expertFeatures_13_task4 Distinct topic count",
    "expertFeatures_14_task4 Description trajectory length",
    "expertFeatures_15_task4 Objects in water count",
    "expertFeatures_16_task4 Objects in sky count",
    "expertFeatures_17_task4 Objects on land count",
    "expertFeatures_18_task4 Explicit child danger mentioned",
    "expertFeatures_19_task4 Explicit animal danger mentioned",
]

# train/test inputs and labels for the binary "0vs23" target
# (category 0 -> class 0, categories 2 and 3 -> class 1)
x_train, y_train, x_test, y_test = get_xy(feature_names=task_features, target="0vs23")

# peek at the first rows of the training inputs
print("[Task 04s] training data (sample):")
display(x_train.head())
sep()

# LOGISTIC REGRESSION: fit on the train split, report on the test split
report = train_and_eval_logreg(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 04s] classification report (Logistic Regression)")
print(report)
sep()

# HISTOGRAM GRADIENT BOOSTING: same splits, same protocol
report = train_and_eval_histgradboost(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 04s] classification report (Histogram Gradient Boosting)")
print(report)

[Task 04s] training data (sample):


Unnamed: 0_level_0,expertFeatures_11_task4 Named object count,expertFeatures_12_task4 Described object relation count,expertFeatures_13_task4 Distinct topic count,expertFeatures_14_task4 Description trajectory length,expertFeatures_15_task4 Objects in water count,expertFeatures_16_task4 Objects in sky count,expertFeatures_17_task4 Objects on land count,expertFeatures_18_task4 Explicit child danger mentioned,expertFeatures_19_task4 Explicit animal danger mentioned
screening_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
scr-29XBdG3UN32Lm22zGeq9AV,21.0,13.0,14.0,0.866285,7.0,3.0,5.0,0.0,1.0
scr-2CMKXBDSreZoAkLDhcK6a8,13.0,8.0,13.0,0.594183,3.0,2.0,3.0,1.0,0.0
scr-2UvzBxvgymGYCpdn2VnLUN,33.0,5.0,24.0,2.319715,8.0,5.0,11.0,0.0,1.0
scr-2XfEpW35qx7wMfVFxJ9Tdp,24.0,5.0,16.0,1.277062,5.0,5.0,9.0,1.0,1.0
scr-2Xn65aWeAY3yHaAvy7Hd9y,21.0,4.0,14.0,1.562818,8.0,3.0,4.0,1.0,1.0



[Task 04s] classification report (Logistic Regression)
              precision    recall  f1-score   support

           0       0.81      0.83      0.82        42
           1       0.59      0.56      0.57        18

    accuracy                           0.75        60
   macro avg       0.70      0.69      0.70        60
weighted avg       0.75      0.75      0.75        60


[Task 04s] classification report (Histogram Gradient Boosting)
              precision    recall  f1-score   support

           0       0.77      0.81      0.79        42
           1       0.50      0.44      0.47        18

    accuracy                           0.70        60
   macro avg       0.64      0.63      0.63        60
weighted avg       0.69      0.70      0.69        60



In [63]:
###############################################################
#                           TASK 05                           #
###############################################################

# feature columns that enter this experiment
task_features = [
    "expertFeatures_20_task5 Total recalled words count",
    "expertFeatures_21_task5 Distinct objects recalled count",
    "expertFeatures_22_task5 Repeated recalled words count",
]

# train/test inputs and labels for the binary "0vs23" target
# (category 0 -> class 0, categories 2 and 3 -> class 1)
x_train, y_train, x_test, y_test = get_xy(feature_names=task_features, target="0vs23")

# peek at the first rows of the training inputs
print("[Task 05] training data (sample):")
display(x_train.head())
sep()

# LOGISTIC REGRESSION: fit on the train split, report on the test split
report = train_and_eval_logreg(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 05] classification report (Logistic Regression)")
print(report)
sep()

# HISTOGRAM GRADIENT BOOSTING: same splits, same protocol
report = train_and_eval_histgradboost(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 05] classification report (Histogram Gradient Boosting)")
print(report)

[Task 05] training data (sample):


Unnamed: 0_level_0,expertFeatures_20_task5 Total recalled words count,expertFeatures_21_task5 Distinct objects recalled count,expertFeatures_22_task5 Repeated recalled words count
screening_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
scr-29XBdG3UN32Lm22zGeq9AV,42.0,11.0,9.0
scr-2CMKXBDSreZoAkLDhcK6a8,25.0,11.0,4.0
scr-2UvzBxvgymGYCpdn2VnLUN,28.0,9.0,0.0
scr-2XfEpW35qx7wMfVFxJ9Tdp,20.0,3.0,2.0
scr-2Xn65aWeAY3yHaAvy7Hd9y,24.0,11.0,4.0



[Task 05] classification report (Logistic Regression)
              precision    recall  f1-score   support

           0       0.85      0.95      0.90        42
           1       0.85      0.61      0.71        18

    accuracy                           0.85        60
   macro avg       0.85      0.78      0.80        60
weighted avg       0.85      0.85      0.84        60


[Task 05] classification report (Histogram Gradient Boosting)
              precision    recall  f1-score   support

           0       0.85      0.98      0.91        42
           1       0.92      0.61      0.73        18

    accuracy                           0.87        60
   macro avg       0.89      0.79      0.82        60
weighted avg       0.87      0.87      0.86        60



In [64]:
###############################################################
#                           TASK 06                           #
###############################################################

# feature columns that enter this experiment
task_features = [
    "expertFeatures_23_task6 Correctly named pictures count",
    "expertFeatures_24_task6 Total naming reaction time",
]

# train/test inputs and labels for the binary "0vs23" target
# (category 0 -> class 0, categories 2 and 3 -> class 1)
x_train, y_train, x_test, y_test = get_xy(feature_names=task_features, target="0vs23")

# peek at the first rows of the training inputs
print("[Task 06] training data (sample):")
display(x_train.head())
sep()

# LOGISTIC REGRESSION: fit on the train split, report on the test split
report = train_and_eval_logreg(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 06] classification report (Logistic Regression)")
print(report)
sep()

# HISTOGRAM GRADIENT BOOSTING: same splits, same protocol
report = train_and_eval_histgradboost(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 06] classification report (Histogram Gradient Boosting)")
print(report)

[Task 06] training data (sample):


Unnamed: 0_level_0,expertFeatures_23_task6 Correctly named pictures count,expertFeatures_24_task6 Total naming reaction time
screening_id,Unnamed: 1_level_1,Unnamed: 2_level_1
scr-29XBdG3UN32Lm22zGeq9AV,15.0,73.88
scr-2CMKXBDSreZoAkLDhcK6a8,17.0,57.04
scr-2UvzBxvgymGYCpdn2VnLUN,16.0,59.6
scr-2XfEpW35qx7wMfVFxJ9Tdp,17.0,57.68
scr-2Xn65aWeAY3yHaAvy7Hd9y,19.0,50.8



[Task 06] classification report (Logistic Regression)
              precision    recall  f1-score   support

           0       0.79      0.90      0.84        42
           1       0.67      0.44      0.53        18

    accuracy                           0.77        60
   macro avg       0.73      0.67      0.69        60
weighted avg       0.75      0.77      0.75        60


[Task 06] classification report (Histogram Gradient Boosting)
              precision    recall  f1-score   support

           0       0.81      0.90      0.85        42
           1       0.69      0.50      0.58        18

    accuracy                           0.78        60
   macro avg       0.75      0.70      0.72        60
weighted avg       0.77      0.78      0.77        60



In [65]:
###############################################################
#                           TASK 07                           #
###############################################################

# feature columns that enter this experiment
task_features = ["expertFeatures_25_task7 Correctly recalled pictures count"]

# train/test inputs and labels for the binary "0vs23" target
# (category 0 -> class 0, categories 2 and 3 -> class 1)
x_train, y_train, x_test, y_test = get_xy(feature_names=task_features, target="0vs23")

# peek at the first rows of the training inputs
print("[Task 07] training data (sample):")
display(x_train.head())
sep()

# LOGISTIC REGRESSION: fit on the train split, report on the test split
report = train_and_eval_logreg(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 07] classification report (Logistic Regression)")
print(report)
sep()

# HISTOGRAM GRADIENT BOOSTING: same splits, same protocol
report = train_and_eval_histgradboost(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 07] classification report (Histogram Gradient Boosting)")
print(report)

[Task 07] training data (sample):


Unnamed: 0_level_0,expertFeatures_25_task7 Correctly recalled pictures count
screening_id,Unnamed: 1_level_1
scr-29XBdG3UN32Lm22zGeq9AV,8.0
scr-2CMKXBDSreZoAkLDhcK6a8,4.0
scr-2UvzBxvgymGYCpdn2VnLUN,8.0
scr-2XfEpW35qx7wMfVFxJ9Tdp,1.0
scr-2Xn65aWeAY3yHaAvy7Hd9y,5.0



[Task 07] classification report (Logistic Regression)
              precision    recall  f1-score   support

           0       0.88      0.90      0.89        42
           1       0.76      0.72      0.74        18

    accuracy                           0.85        60
   macro avg       0.82      0.81      0.82        60
weighted avg       0.85      0.85      0.85        60


[Task 07] classification report (Histogram Gradient Boosting)
              precision    recall  f1-score   support

           0       0.92      0.81      0.86        42
           1       0.65      0.83      0.73        18

    accuracy                           0.82        60
   macro avg       0.79      0.82      0.80        60
weighted avg       0.84      0.82      0.82        60



In [66]:
###############################################################
#                           TASK 08                           #
###############################################################

# feature columns that enter this experiment
task_features = [
    "expertFeatures_26_task8 Total word count",
    "expertFeatures_27_task8 Animal word count",
    "expertFeatures_28_task8 Repeated animals count",
]

# train/test inputs and labels for the binary "0vs23" target
# (category 0 -> class 0, categories 2 and 3 -> class 1)
x_train, y_train, x_test, y_test = get_xy(feature_names=task_features, target="0vs23")

# peek at the first rows of the training inputs
print("[Task 08] training data (sample):")
display(x_train.head())
sep()

# LOGISTIC REGRESSION: fit on the train split, report on the test split
report = train_and_eval_logreg(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 08] classification report (Logistic Regression)")
print(report)
sep()

# HISTOGRAM GRADIENT BOOSTING: same splits, same protocol
report = train_and_eval_histgradboost(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 08] classification report (Histogram Gradient Boosting)")
print(report)

[Task 08] training data (sample):


Unnamed: 0_level_0,expertFeatures_26_task8 Total word count,expertFeatures_27_task8 Animal word count,expertFeatures_28_task8 Repeated animals count
screening_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
scr-29XBdG3UN32Lm22zGeq9AV,20.0,18.0,0.0
scr-2CMKXBDSreZoAkLDhcK6a8,16.0,10.0,0.0
scr-2UvzBxvgymGYCpdn2VnLUN,19.0,12.0,0.0
scr-2XfEpW35qx7wMfVFxJ9Tdp,8.0,6.0,0.0
scr-2Xn65aWeAY3yHaAvy7Hd9y,11.0,12.0,0.0



[Task 08] classification report (Logistic Regression)
              precision    recall  f1-score   support

           0       0.93      0.90      0.92        42
           1       0.79      0.83      0.81        18

    accuracy                           0.88        60
   macro avg       0.86      0.87      0.86        60
weighted avg       0.89      0.88      0.88        60


[Task 08] classification report (Histogram Gradient Boosting)
              precision    recall  f1-score   support

           0       0.91      0.93      0.92        42
           1       0.82      0.78      0.80        18

    accuracy                           0.88        60
   macro avg       0.87      0.85      0.86        60
weighted avg       0.88      0.88      0.88        60



In [67]:
###############################################################
#                           TASK 09                           #
###############################################################

# feature columns that enter this experiment
task_features = [
    "expertFeatures_29_task9 Percentage of repeated-recalled sentence characters",
    "expertFeatures_30_task9 Correct recalled sentence words count",
]

# train/test inputs and labels for the binary "0vs23" target
# (category 0 -> class 0, categories 2 and 3 -> class 1)
x_train, y_train, x_test, y_test = get_xy(feature_names=task_features, target="0vs23")

# peek at the first rows of the training inputs
print("[Task 09] training data (sample):")
display(x_train.head())
sep()

# LOGISTIC REGRESSION: fit on the train split, report on the test split
report = train_and_eval_logreg(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 09] classification report (Logistic Regression)")
print(report)
sep()

# HISTOGRAM GRADIENT BOOSTING: same splits, same protocol
report = train_and_eval_histgradboost(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 09] classification report (Histogram Gradient Boosting)")
print(report)

[Task 09] training data (sample):


Unnamed: 0_level_0,expertFeatures_29_task9 Percentage of repeated-recalled sentence characters,expertFeatures_30_task9 Correct recalled sentence words count
screening_id,Unnamed: 1_level_1,Unnamed: 2_level_1
scr-29XBdG3UN32Lm22zGeq9AV,0.283,2.0
scr-2CMKXBDSreZoAkLDhcK6a8,0.4528,4.0
scr-2UvzBxvgymGYCpdn2VnLUN,0.2073,0.0
scr-2XfEpW35qx7wMfVFxJ9Tdp,0.1698,0.0
scr-2Xn65aWeAY3yHaAvy7Hd9y,0.3623,5.0



[Task 09] classification report (Logistic Regression)
              precision    recall  f1-score   support

           0       0.88      0.90      0.89        42
           1       0.76      0.72      0.74        18

    accuracy                           0.85        60
   macro avg       0.82      0.81      0.82        60
weighted avg       0.85      0.85      0.85        60


[Task 09] classification report (Histogram Gradient Boosting)
              precision    recall  f1-score   support

           0       0.89      0.95      0.92        42
           1       0.87      0.72      0.79        18

    accuracy                           0.88        60
   macro avg       0.88      0.84      0.85        60
weighted avg       0.88      0.88      0.88        60



In [68]:
###############################################################
#                           TASK 10                           #
###############################################################

# feature columns that enter this experiment
task_features = ["expertFeatures_31_task10 Word similarity score"]

# train/test inputs and labels for the binary "0vs23" target
# (category 0 -> class 0, categories 2 and 3 -> class 1)
x_train, y_train, x_test, y_test = get_xy(feature_names=task_features, target="0vs23")

# peek at the first rows of the training inputs
print("[Task 10] training data (sample):")
display(x_train.head())
sep()

# LOGISTIC REGRESSION: fit on the train split, report on the test split
report = train_and_eval_logreg(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 10] classification report (Logistic Regression)")
print(report)
sep()

# HISTOGRAM GRADIENT BOOSTING: same splits, same protocol
report = train_and_eval_histgradboost(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)
print("[Task 10] classification report (Histogram Gradient Boosting)")
print(report)

[Task 10] training data (sample):


Unnamed: 0_level_0,expertFeatures_31_task10 Word similarity score
screening_id,Unnamed: 1_level_1
scr-29XBdG3UN32Lm22zGeq9AV,0.804
scr-2CMKXBDSreZoAkLDhcK6a8,0.8768
scr-2UvzBxvgymGYCpdn2VnLUN,0.7285
scr-2XfEpW35qx7wMfVFxJ9Tdp,
scr-2Xn65aWeAY3yHaAvy7Hd9y,0.523



[Task 10] classification report (Logistic Regression)
              precision    recall  f1-score   support

           0       0.70      0.93      0.80        42
           1       0.25      0.06      0.09        18

    accuracy                           0.67        60
   macro avg       0.47      0.49      0.44        60
weighted avg       0.56      0.67      0.58        60


[Task 10] classification report (Histogram Gradient Boosting)
              precision    recall  f1-score   support

           0       0.69      0.98      0.81        42
           1       0.00      0.00      0.00        18

    accuracy                           0.68        60
   macro avg       0.35      0.49      0.41        60
weighted avg       0.49      0.68      0.57        60

