In [18]:
import json
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [21]:
# Load the labels, the expert features and the predefined train/test split,
# then build row-aligned X / y objects for both splits.
DATA_DIR = "../../datasets/export-lindat/lindat-20251031_json"

# JSON is UTF-8 by specification; the encoding is passed explicitly so that
# decoding does not depend on the platform's default locale.
with open(f"{DATA_DIR}/metadata_20251031.json", "r", encoding="utf-8") as f:
    meta = pd.DataFrame(json.load(f)).set_index("screening_id")

target_map = {0: 0, 2: 1, 3: 1}  # map targets: 0 -> 0 and (2, 3) -> 1
target_key = "kobar_kategorizace_definitivni"
raget_key = target_key  # alias: keeps the original (misspelled) name usable
# Keep only rows whose label is a key of `target_map`, then binarise the label.
targets = meta.loc[meta[target_key].isin(target_map), target_key].map(target_map).sort_index()

with open(f"{DATA_DIR}/expert_features_zipformer_lm-extra06.json", "r", encoding="utf-8") as f:
    features = pd.DataFrame(json.load(f)).set_index("screening_id")
# Drop feature rows that have no usable label; sort so the row order is deterministic.
features = features.loc[features.index.intersection(targets.index)].sort_index()

with open(f"{DATA_DIR}/train_20251031.json", "r", encoding="utf-8") as f:
    train_ids = json.load(f)
with open(f"{DATA_DIR}/test_20251031.json", "r", encoding="utf-8") as f:
    test_ids = json.load(f)

X_train = features.loc[features.index.isin(train_ids)]
X_test = features.loc[features.index.isin(test_ids)]

# Labels are selected by the index of the matching feature frame, so X and y
# have the same rows in the same order. (Every id in `features` is also in
# `targets` because of the intersection above, so this lookup cannot miss.)
y_train = targets.loc[X_train.index]
y_test = targets.loc[X_test.index]

In [22]:
# Classification pipeline shared by all task cells (each one calls `model.fit`):
# mean-impute missing values -> standardise -> logistic regression.
model = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        (
            "logreg",
            LogisticRegression(solver="liblinear", max_iter=1000, random_state=42),
        ),
    ]
)

In [26]:
# TASK 01
# Fit the shared pipeline on the task-1 expert feature only and print the
# classification metrics on the test split.
# The column list is called `task_features` so that it does not overwrite the
# `features` DataFrame created by the data-loading cell.
task_features = ["expertFeatures_1_task1 Correctly repeated numbers"]

X_train_task = X_train[task_features]
# Look the labels up by the feature rows' own index: scikit-learn pairs X and y
# by position, so y must follow X's row order exactly.
y_train_task = y_train.loc[X_train_task.index]

X_test_task = X_test[task_features]
y_test_task = y_test.loc[X_test_task.index]

model.fit(X_train_task, y_train_task)
predicted = model.predict(X_test_task)
# zero_division=0.0: report 0 (without a warning) for a class that is never predicted.
report = classification_report(y_test_task, predicted, zero_division=0.0)
print(report)

              precision    recall  f1-score   support

           0       0.70      1.00      0.82        42
           1       0.00      0.00      0.00        18

    accuracy                           0.70        60
   macro avg       0.35      0.50      0.41        60
weighted avg       0.49      0.70      0.58        60



In [27]:
# TASK 02
# Fit the shared pipeline on the task-2 expert feature only and print the
# classification metrics on the test split.
# The column list is called `task_features` so that it does not overwrite the
# `features` DataFrame created by the data-loading cell.
task_features = ["expertFeatures_2_task2 Correctly repeated characters"]

X_train_task = X_train[task_features]
# Look the labels up by the feature rows' own index: scikit-learn pairs X and y
# by position, so y must follow X's row order exactly.
y_train_task = y_train.loc[X_train_task.index]

X_test_task = X_test[task_features]
y_test_task = y_test.loc[X_test_task.index]

model.fit(X_train_task, y_train_task)
predicted = model.predict(X_test_task)
# zero_division=0.0: report 0 (without a warning) for a class that is never predicted.
report = classification_report(y_test_task, predicted, zero_division=0.0)
print(report)

              precision    recall  f1-score   support

           0       0.70      1.00      0.82        42
           1       0.00      0.00      0.00        18

    accuracy                           0.70        60
   macro avg       0.35      0.50      0.41        60
weighted avg       0.49      0.70      0.58        60



In [28]:
# TASK 03
# Fit the shared pipeline on the two task-3 expert features and print the
# classification metrics on the test split.
# The column list is called `task_features` so that it does not overwrite the
# `features` DataFrame created by the data-loading cell.
task_features = [
    "expertFeatures_3_task3 Character match ratio",
    "expertFeatures_4_task3 Correctly repeated words",
]

X_train_task = X_train[task_features]
# Look the labels up by the feature rows' own index: scikit-learn pairs X and y
# by position, so y must follow X's row order exactly.
y_train_task = y_train.loc[X_train_task.index]

X_test_task = X_test[task_features]
y_test_task = y_test.loc[X_test_task.index]

model.fit(X_train_task, y_train_task)
predicted = model.predict(X_test_task)
# zero_division=0.0: report 0 (without a warning) for a class that is never predicted.
report = classification_report(y_test_task, predicted, zero_division=0.0)
print(report)

              precision    recall  f1-score   support

           0       0.79      0.88      0.83        42
           1       0.62      0.44      0.52        18

    accuracy                           0.75        60
   macro avg       0.70      0.66      0.67        60
weighted avg       0.74      0.75      0.74        60



In [44]:
# TASK 04 - morphological analysis
# Fit the shared pipeline on the task-4 morphological expert features and print
# the classification metrics on the test split.
# The column list is called `task_features` so that it does not overwrite the
# `features` DataFrame created by the data-loading cell.
task_features = [
    "expertFeatures_5_task4 Sentence count",
    "expertFeatures_6_task4 First person verb proportion",
    "expertFeatures_7_task4 Meaningful words ratio",
    "expertFeatures_8_task4 Pronoun to noun ratio",
    "expertFeatures_9_task4 Count of repeated meaningful words",
    "expertFeatures_10_task4 Unique words to total words",
]

X_train_task = X_train[task_features]
# Look the labels up by the feature rows' own index: scikit-learn pairs X and y
# by position, so y must follow X's row order exactly.
y_train_task = y_train.loc[X_train_task.index]

X_test_task = X_test[task_features]
y_test_task = y_test.loc[X_test_task.index]

model.fit(X_train_task, y_train_task)
predicted = model.predict(X_test_task)
# zero_division=0.0: report 0 (without a warning) for a class that is never predicted.
report = classification_report(y_test_task, predicted, zero_division=0.0)
print(report)

              precision    recall  f1-score   support

           0       0.78      0.86      0.82        42
           1       0.57      0.44      0.50        18

    accuracy                           0.73        60
   macro avg       0.68      0.65      0.66        60
weighted avg       0.72      0.73      0.72        60



In [31]:
# TASK 04 - semantic analysis
# Fit the shared pipeline on the task-4 semantic expert features and print the
# classification metrics on the test split.
# The column list is called `task_features` so that it does not overwrite the
# `features` DataFrame created by the data-loading cell.
task_features = [
    "expertFeatures_11_task4 Named object count",
    "expertFeatures_12_task4 Described object relation count",
    "expertFeatures_13_task4 Distinct topic count",
    "expertFeatures_14_task4 Description trajectory length",
    "expertFeatures_15_task4 Objects in water count",
    "expertFeatures_16_task4 Objects in sky count",
    "expertFeatures_17_task4 Objects on land count",
    "expertFeatures_18_task4 Explicit child danger mentioned",
    "expertFeatures_19_task4 Explicit animal danger mentioned",
]

X_train_task = X_train[task_features]
# Look the labels up by the feature rows' own index: scikit-learn pairs X and y
# by position, so y must follow X's row order exactly.
y_train_task = y_train.loc[X_train_task.index]

X_test_task = X_test[task_features]
y_test_task = y_test.loc[X_test_task.index]

model.fit(X_train_task, y_train_task)
predicted = model.predict(X_test_task)
# zero_division=0.0: report 0 (without a warning) for a class that is never predicted.
report = classification_report(y_test_task, predicted, zero_division=0.0)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.83      0.82        42
           1       0.59      0.56      0.57        18

    accuracy                           0.75        60
   macro avg       0.70      0.69      0.70        60
weighted avg       0.75      0.75      0.75        60



In [32]:
# TASK 05
# Fit the shared pipeline on the three task-5 expert features and print the
# classification metrics on the test split.
# The column list is called `task_features` so that it does not overwrite the
# `features` DataFrame created by the data-loading cell.
task_features = [
    "expertFeatures_20_task5 Total recalled words count",
    "expertFeatures_21_task5 Distinct objects recalled count",
    "expertFeatures_22_task5 Repeated recalled words count",
]

X_train_task = X_train[task_features]
# Look the labels up by the feature rows' own index: scikit-learn pairs X and y
# by position, so y must follow X's row order exactly.
y_train_task = y_train.loc[X_train_task.index]

X_test_task = X_test[task_features]
y_test_task = y_test.loc[X_test_task.index]

model.fit(X_train_task, y_train_task)
predicted = model.predict(X_test_task)
# zero_division=0.0: report 0 (without a warning) for a class that is never predicted.
report = classification_report(y_test_task, predicted, zero_division=0.0)
print(report)

              precision    recall  f1-score   support

           0       0.85      0.95      0.90        42
           1       0.85      0.61      0.71        18

    accuracy                           0.85        60
   macro avg       0.85      0.78      0.80        60
weighted avg       0.85      0.85      0.84        60



In [33]:
# TASK 06
# Fit the shared pipeline on the two task-6 expert features and print the
# classification metrics on the test split.
# The column list is called `task_features` so that it does not overwrite the
# `features` DataFrame created by the data-loading cell.
task_features = [
    "expertFeatures_23_task6 Correctly named pictures count",
    "expertFeatures_24_task6 Total naming reaction time",
]

X_train_task = X_train[task_features]
# Look the labels up by the feature rows' own index: scikit-learn pairs X and y
# by position, so y must follow X's row order exactly.
y_train_task = y_train.loc[X_train_task.index]

X_test_task = X_test[task_features]
y_test_task = y_test.loc[X_test_task.index]

model.fit(X_train_task, y_train_task)
predicted = model.predict(X_test_task)
# zero_division=0.0: report 0 (without a warning) for a class that is never predicted.
report = classification_report(y_test_task, predicted, zero_division=0.0)
print(report)

              precision    recall  f1-score   support

           0       0.79      0.90      0.84        42
           1       0.67      0.44      0.53        18

    accuracy                           0.77        60
   macro avg       0.73      0.67      0.69        60
weighted avg       0.75      0.77      0.75        60



In [34]:
# TASK 07
# Fit the shared pipeline on the task-7 expert feature only and print the
# classification metrics on the test split.
# The column list is called `task_features` so that it does not overwrite the
# `features` DataFrame created by the data-loading cell.
task_features = ["expertFeatures_25_task7 Correctly recalled pictures count"]

X_train_task = X_train[task_features]
# Look the labels up by the feature rows' own index: scikit-learn pairs X and y
# by position, so y must follow X's row order exactly.
y_train_task = y_train.loc[X_train_task.index]

X_test_task = X_test[task_features]
y_test_task = y_test.loc[X_test_task.index]

model.fit(X_train_task, y_train_task)
predicted = model.predict(X_test_task)
# zero_division=0.0: report 0 (without a warning) for a class that is never predicted.
report = classification_report(y_test_task, predicted, zero_division=0.0)
print(report)

              precision    recall  f1-score   support

           0       0.88      0.90      0.89        42
           1       0.76      0.72      0.74        18

    accuracy                           0.85        60
   macro avg       0.82      0.81      0.82        60
weighted avg       0.85      0.85      0.85        60



In [35]:
# TASK 08
# Fit the shared pipeline on the three task-8 expert features and print the
# classification metrics on the test split.
# The column list is called `task_features` so that it does not overwrite the
# `features` DataFrame created by the data-loading cell.
task_features = [
    "expertFeatures_26_task8 Total word count",
    "expertFeatures_27_task8 Animal word count",
    "expertFeatures_28_task8 Repeated animals count",
]

X_train_task = X_train[task_features]
# Look the labels up by the feature rows' own index: scikit-learn pairs X and y
# by position, so y must follow X's row order exactly.
y_train_task = y_train.loc[X_train_task.index]

X_test_task = X_test[task_features]
y_test_task = y_test.loc[X_test_task.index]

model.fit(X_train_task, y_train_task)
predicted = model.predict(X_test_task)
# zero_division=0.0: report 0 (without a warning) for a class that is never predicted.
report = classification_report(y_test_task, predicted, zero_division=0.0)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.90      0.92        42
           1       0.79      0.83      0.81        18

    accuracy                           0.88        60
   macro avg       0.86      0.87      0.86        60
weighted avg       0.89      0.88      0.88        60



In [36]:
# TASK 09
# Fit the shared pipeline on the two task-9 expert features and print the
# classification metrics on the test split.
# The column list is called `task_features` so that it does not overwrite the
# `features` DataFrame created by the data-loading cell.
task_features = [
    "expertFeatures_29_task9 Percentage of repeated-recalled sentence characters",
    "expertFeatures_30_task9 Correct recalled sentence words count",
]

X_train_task = X_train[task_features]
# Look the labels up by the feature rows' own index: scikit-learn pairs X and y
# by position, so y must follow X's row order exactly.
y_train_task = y_train.loc[X_train_task.index]

X_test_task = X_test[task_features]
y_test_task = y_test.loc[X_test_task.index]

model.fit(X_train_task, y_train_task)
predicted = model.predict(X_test_task)
# zero_division=0.0: report 0 (without a warning) for a class that is never predicted.
report = classification_report(y_test_task, predicted, zero_division=0.0)
print(report)

              precision    recall  f1-score   support

           0       0.88      0.90      0.89        42
           1       0.76      0.72      0.74        18

    accuracy                           0.85        60
   macro avg       0.82      0.81      0.82        60
weighted avg       0.85      0.85      0.85        60



In [39]:
# TASK 10
# Fit the shared pipeline on the task-10 expert feature only and print the
# classification metrics on the test split.
# The column list is called `task_features` so that it does not overwrite the
# `features` DataFrame created by the data-loading cell.
task_features = ["expertFeatures_31_task10 Word similarity score"]

X_train_task = X_train[task_features]
# Look the labels up by the feature rows' own index: scikit-learn pairs X and y
# by position, so y must follow X's row order exactly.
y_train_task = y_train.loc[X_train_task.index]

X_test_task = X_test[task_features]
y_test_task = y_test.loc[X_test_task.index]

model.fit(X_train_task, y_train_task)
predicted = model.predict(X_test_task)
# zero_division=0.0: report 0 (without a warning) for a class that is never predicted.
report = classification_report(y_test_task, predicted, zero_division=0.0)
print(report)

              precision    recall  f1-score   support

           0       0.70      0.93      0.80        42
           1       0.25      0.06      0.09        18

    accuracy                           0.67        60
   macro avg       0.47      0.49      0.44        60
weighted avg       0.56      0.67      0.58        60

