In [1]:
pip install conllu

Note: you may need to restart the kernel to use updated packages.


In [None]:
import torch
from transformers import BertModel, BertTokenizerFast
from datasets import load_dataset
from tqdm.auto import tqdm
import numpy as np
import os
import csv
import pickle
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error, r2_score



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Output folder used by this notebook; created up front if it does not exist.
EMBEDDING_DIR = "./bert_mrc_embeddings"
os.makedirs(EMBEDDING_DIR, exist_ok=True)

# Fetch the 'train' split of the MRC dataset.
print("Loading MRC Psycholinguistic Database...")
mrc_dataset = load_dataset("StephanAkkerman/MRC-psycholinguistic-database")['train']

# Word column plus the three probing targets, each converted to a float array.
words = mrc_dataset['Word']
img, conc, nsyl = (
    np.array(mrc_dataset[column], dtype=float)
    for column in ('Imageability', 'Concreteness', 'Number of Syllables')
)

# Keep only the rows where all three targets are non-NaN.
# NOTE(review): NaN is the only value treated as "missing" here. If the dataset
# encodes missing ratings with a sentinel (e.g. 0), those rows pass this filter
# unchanged — confirm against the dataset card.
valid = ~(np.isnan(img) | np.isnan(conc) | np.isnan(nsyl))
words = [word for word, keep in zip(words, valid) if keep]
img, conc, nsyl = img[valid], conc[valid], nsyl[valid]

print(f"Total valid words: {len(words)}")


Loading MRC Psycholinguistic Database...
Total valid words: 150000


In [4]:
print("Loading BERT...")

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# output_hidden_states=True is required because the probing code reads
# `output.hidden_states` from every forward pass.
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
model.eval()  # inference mode

# Left as the final expression so the cell still displays the model summary.
model.to(device)

Loading BERT...


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [5]:
def get_single_word_embedding_per_layer(word, model, tokenizer, device, pooling="cls"):
    """Embed one word in isolation and return one vector per hidden-state layer.

    The word is tokenized on its own (no sentence context), passed through
    ``model`` under ``torch.no_grad()``, and a single vector is extracted from
    every tensor in ``output.hidden_states``.

    Args:
        word: Text to embed.
        model: Callable invoked as ``model(**encoded)``; its result must expose
            ``hidden_states``, a sequence of tensors indexed as
            ``[batch, position, hidden]``.
        tokenizer: Callable invoked as ``tokenizer(word, return_tensors='pt')``;
            its result must support ``.to(device)`` and ``**`` unpacking.
        device: Device the encoded inputs are moved to.
        pooling: How to reduce the token axis.
            ``"cls"`` (default, the original behaviour) takes position 0.
            NOTE: with a BERT tokenizer position 0 is the ``[CLS]`` token, and at
            ``hidden_states[0]`` (the embedding output) that vector does not
            depend on the word at all — consistent with the R² ≈ 0 this notebook
            logs for layer 0.
            ``"mean"`` averages positions ``1 .. -2``; this assumes the tokenizer
            wraps the input in exactly one leading and one trailing special
            token (as BERT tokenizers do for a single sequence).

    Returns:
        A list of 1-D numpy arrays (one per entry of ``hidden_states``), or
        ``None`` if the word could not be embedded. A ``RuntimeWarning`` is
        emitted in the latter case so failures are visible.

    Raises:
        ValueError: If ``pooling`` is not ``"cls"`` or ``"mean"``.
    """
    import warnings

    if pooling not in ("cls", "mean"):
        raise ValueError(f"pooling must be 'cls' or 'mean', got {pooling!r}")

    try:
        encoded = tokenizer(word, return_tensors='pt').to(device)
        with torch.no_grad():
            output = model(**encoded)
        hidden_states = output.hidden_states

        if pooling == "cls":
            vectors = [layer[0, 0, :] for layer in hidden_states]
        else:
            # Nothing to average if only the two special tokens are present.
            if hidden_states[0].shape[1] <= 2:
                raise ValueError("no sub-word tokens between the special tokens")
            vectors = [layer[0, 1:-1, :].mean(dim=0) for layer in hidden_states]

        return [vec.cpu().numpy() for vec in vectors]
    except Exception as exc:
        # Best-effort contract is kept (callers skip None), but the failure is
        # reported, and KeyboardInterrupt/SystemExit are no longer swallowed.
        warnings.warn(f"Could not embed {word!r}: {exc!r}", RuntimeWarning, stacklevel=2)
        return None

In [6]:
def get_embeddings_for_set(word_list, label_list, desc):
    """Embed every word in ``word_list`` and stack the vectors layer by layer.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``device``. Words
    for which ``get_single_word_embedding_per_layer`` returns ``None`` are
    dropped together with their label, so the returned labels stay aligned
    with the rows of the returned tensors.

    Args:
        word_list: Indexable sequence of words to embed.
        label_list: Indexable sequence of labels, parallel to ``word_list``.
        desc: Prefix shown on the progress bar (``"<desc> Embeddings"``).

    Returns:
        ``(final, all_labels)`` where ``final`` is a list with one float32
        tensor per layer (one row per successfully embedded word) and
        ``all_labels`` holds the labels of those words, in order.

    Raises:
        ValueError: If no word could be embedded (including an empty
            ``word_list``), since there is nothing to stack.
    """
    # Created on the first successful embedding so the number of layers comes
    # from the model output instead of a hard-coded 13.
    per_layer = None
    all_labels = []
    for i in tqdm(range(len(word_list)), desc=f"{desc} Embeddings"):
        emb = get_single_word_embedding_per_layer(word_list[i], model, tokenizer, device)
        if emb is None:
            continue
        if per_layer is None:
            per_layer = [[] for _ in emb]
        for bucket, vec in zip(per_layer, emb):
            bucket.append(vec)
        all_labels.append(label_list[i])

    if per_layer is None:
        # np.vstack([]) would raise an opaque ValueError here; say what happened.
        raise ValueError(f"No embeddings could be computed for the {desc!r} set")

    final = [torch.tensor(np.vstack(bucket), dtype=torch.float32) for bucket in per_layer]
    return final, all_labels

In [7]:
def probing_cv(words, labels, feature_name, n_splits=5, random_state=42):
    """Layer-wise linear probing with shuffled K-fold cross-validation.

    For every fold the words are embedded with ``get_embeddings_for_set``, then
    a separate ``LinearRegression`` is fitted per layer on the training
    embeddings and scored on the held-out fold.

    Args:
        words: Indexable sequence of words.
        labels: Indexable sequence of numeric targets, parallel to ``words``.
        feature_name: Name of the target, used only in progress messages.
        n_splits: Number of cross-validation folds (default 5).
        random_state: Seed for both the K-fold shuffle and the train/dev split
            (default 42).

    Returns:
        Dict mapping ``"layer_<i>"`` to ``{'mae': [...], 'r2': [...]}`` with one
        entry per fold. A fold whose training labels are all identical records
        ``mae=inf`` and ``r2=0.0`` for every layer.

    Note:
        Embeddings are recomputed for every fold and every call even though
        they depend only on the word; caching them per word would avoid most of
        the runtime.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    results = {}

    for fold, (train_val_idx, test_idx) in enumerate(kf.split(words)):
        print(f"\nFold {fold+1}/{n_splits} - {feature_name}")

        # 20% of the non-test portion is set aside as a dev split. Nothing below
        # consumes it, but it is still carved out so the training set — and
        # therefore the reported scores — stays exactly as before.
        train_pos, _ = train_test_split(
            np.arange(len(train_val_idx)), test_size=0.2, random_state=random_state
        )
        train_idx = train_val_idx[train_pos]

        train_words = [words[i] for i in train_idx]
        train_labels = [labels[i] for i in train_idx]
        test_words = [words[i] for i in test_idx]
        test_labels = [labels[i] for i in test_idx]

        # Labels are returned again because words that fail to embed are dropped.
        train_embs, train_labels = get_embeddings_for_set(train_words, train_labels, "Train")
        test_embs, test_labels = get_embeddings_for_set(test_words, test_labels, "Test")

        y_train = train_labels
        y_test = test_labels
        # Same for every layer, so evaluated once per fold.
        degenerate = len(np.unique(y_train)) < 2

        # training and evaluating on each layer
        for layer in range(len(train_embs)):
            print(f"  ➤ Layer {layer}")
            scores = results.setdefault(f"layer_{layer}", {'mae': [], 'r2': []})

            if degenerate:
                scores['mae'].append(float('inf'))
                scores['r2'].append(0.0)
                continue

            X_train = train_embs[layer].numpy()
            X_test = test_embs[layer].numpy()

            reg = LinearRegression()
            reg.fit(X_train, y_train)
            preds = reg.predict(X_test)
            mae = mean_absolute_error(y_test, preds)
            r2 = r2_score(y_test, preds)
            scores['mae'].append(mae)
            scores['r2'].append(r2)
            print(f"     MAE: {mae:.4f}, R²: {r2:.4f}")
    return results


In [8]:
# Probe imageability; the numpy targets are handed over as a plain Python list.
results_img = probing_cv(words, labels=img.tolist(), feature_name="Imageability")


Fold 1/5 - Imageability


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [08:00<00:00, 199.73it/s]
Test Embeddings: 100%|██████████| 30000/30000 [02:30<00:00, 198.70it/s]


  ➤ Layer 0
     MAE: 51.2839, R²: -0.0001
  ➤ Layer 1
     MAE: 54.2580, R²: 0.1907
  ➤ Layer 2
     MAE: 53.7592, R²: 0.2194
  ➤ Layer 3
     MAE: 53.8679, R²: 0.2430
  ➤ Layer 4
     MAE: 53.5519, R²: 0.2652
  ➤ Layer 5
     MAE: 51.5576, R²: 0.3099
  ➤ Layer 6
     MAE: 51.9300, R²: 0.3058
  ➤ Layer 7
     MAE: 51.6158, R²: 0.3151
  ➤ Layer 8
     MAE: 50.8079, R²: 0.3320
  ➤ Layer 9
     MAE: 50.3583, R²: 0.3353
  ➤ Layer 10
     MAE: 50.9663, R²: 0.3258
  ➤ Layer 11
     MAE: 51.6647, R²: 0.3185
  ➤ Layer 12
     MAE: 55.7515, R²: 0.2335

Fold 2/5 - Imageability


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [08:19<00:00, 192.25it/s]
Test Embeddings: 100%|██████████| 30000/30000 [03:25<00:00, 146.02it/s]


  ➤ Layer 0
     MAE: 52.0825, R²: -0.0000
  ➤ Layer 1
     MAE: 54.7867, R²: 0.2037
  ➤ Layer 2
     MAE: 54.1992, R²: 0.2336
  ➤ Layer 3
     MAE: 54.2255, R²: 0.2591
  ➤ Layer 4
     MAE: 54.0932, R²: 0.2809
  ➤ Layer 5
     MAE: 52.5136, R²: 0.3229
  ➤ Layer 6
     MAE: 52.7665, R²: 0.3192
  ➤ Layer 7
     MAE: 51.9924, R²: 0.3374
  ➤ Layer 8
     MAE: 51.4204, R²: 0.3447
  ➤ Layer 9
     MAE: 50.4325, R²: 0.3559
  ➤ Layer 10
     MAE: 50.9046, R²: 0.3493
  ➤ Layer 11
     MAE: 51.5964, R²: 0.3386
  ➤ Layer 12
     MAE: 55.6332, R²: 0.2514

Fold 3/5 - Imageability


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [07:55<00:00, 201.80it/s]
Test Embeddings: 100%|██████████| 30000/30000 [02:30<00:00, 199.66it/s]


  ➤ Layer 0
     MAE: 52.1242, R²: -0.0000
  ➤ Layer 1
     MAE: 54.9681, R²: 0.1952
  ➤ Layer 2
     MAE: 54.7848, R²: 0.2184
  ➤ Layer 3
     MAE: 54.4095, R²: 0.2446
  ➤ Layer 4
     MAE: 54.3027, R²: 0.2658
  ➤ Layer 5
     MAE: 52.5310, R²: 0.3116
  ➤ Layer 6
     MAE: 52.5120, R²: 0.3113
  ➤ Layer 7
     MAE: 52.1284, R²: 0.3180
  ➤ Layer 8
     MAE: 51.7424, R²: 0.3237
  ➤ Layer 9
     MAE: 50.9838, R²: 0.3371
  ➤ Layer 10
     MAE: 51.1983, R²: 0.3321
  ➤ Layer 11
     MAE: 51.9540, R²: 0.3223
  ➤ Layer 12
     MAE: 56.0743, R²: 0.2385

Fold 4/5 - Imageability


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [07:58<00:00, 200.48it/s]
Test Embeddings: 100%|██████████| 30000/30000 [02:29<00:00, 200.72it/s]


  ➤ Layer 0
     MAE: 51.4620, R²: -0.0000
  ➤ Layer 1
     MAE: 54.1330, R²: 0.2010
  ➤ Layer 2
     MAE: 53.9678, R²: 0.2242
  ➤ Layer 3
     MAE: 53.6938, R²: 0.2545
  ➤ Layer 4
     MAE: 53.1925, R²: 0.2813
  ➤ Layer 5
     MAE: 51.7397, R²: 0.3167
  ➤ Layer 6
     MAE: 51.7697, R²: 0.3150
  ➤ Layer 7
     MAE: 51.6010, R²: 0.3231
  ➤ Layer 8
     MAE: 50.7643, R²: 0.3383
  ➤ Layer 9
     MAE: 50.0000, R²: 0.3483
  ➤ Layer 10
     MAE: 50.6962, R²: 0.3356
  ➤ Layer 11
     MAE: 51.6485, R²: 0.3247
  ➤ Layer 12
     MAE: 55.5617, R²: 0.2410

Fold 5/5 - Imageability


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [08:00<00:00, 199.62it/s]
Test Embeddings: 100%|██████████| 30000/30000 [02:29<00:00, 200.83it/s]


  ➤ Layer 0
     MAE: 51.9495, R²: -0.0000
  ➤ Layer 1
     MAE: 54.7232, R²: 0.1976
  ➤ Layer 2
     MAE: 54.1514, R²: 0.2302
  ➤ Layer 3
     MAE: 53.9971, R²: 0.2546
  ➤ Layer 4
     MAE: 53.8138, R²: 0.2781
  ➤ Layer 5
     MAE: 52.4509, R²: 0.3159
  ➤ Layer 6
     MAE: 52.2145, R²: 0.3221
  ➤ Layer 7
     MAE: 51.8911, R²: 0.3329
  ➤ Layer 8
     MAE: 51.1995, R²: 0.3480
  ➤ Layer 9
     MAE: 50.7933, R²: 0.3541
  ➤ Layer 10
     MAE: 51.1169, R²: 0.3476
  ➤ Layer 11
     MAE: 51.8209, R²: 0.3375
  ➤ Layer 12
     MAE: 55.8348, R²: 0.2505


In [9]:
# Probe concreteness; the numpy targets are handed over as a plain Python list.
results_conc = probing_cv(words, labels=conc.tolist(), feature_name="Concreteness")


Fold 1/5 - Concreteness


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [07:58<00:00, 200.50it/s]
Test Embeddings: 100%|██████████| 30000/30000 [02:31<00:00, 198.49it/s]


  ➤ Layer 0
     MAE: 44.8743, R²: -0.0001
  ➤ Layer 1
     MAE: 48.8615, R²: 0.1717
  ➤ Layer 2
     MAE: 48.5401, R²: 0.1977
  ➤ Layer 3
     MAE: 48.7515, R²: 0.2208
  ➤ Layer 4
     MAE: 48.4890, R²: 0.2457
  ➤ Layer 5
     MAE: 47.0873, R²: 0.2893
  ➤ Layer 6
     MAE: 47.3708, R²: 0.2863
  ➤ Layer 7
     MAE: 47.0740, R²: 0.2964
  ➤ Layer 8
     MAE: 46.2736, R²: 0.3155
  ➤ Layer 9
     MAE: 45.9339, R²: 0.3198
  ➤ Layer 10
     MAE: 46.6175, R²: 0.3085
  ➤ Layer 11
     MAE: 46.9910, R²: 0.3030
  ➤ Layer 12
     MAE: 50.2884, R²: 0.2194

Fold 2/5 - Concreteness


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [07:59<00:00, 200.26it/s]
Test Embeddings: 100%|██████████| 30000/30000 [02:30<00:00, 199.82it/s]


  ➤ Layer 0
     MAE: 45.2801, R²: -0.0000
  ➤ Layer 1
     MAE: 49.1835, R²: 0.1808
  ➤ Layer 2
     MAE: 48.8234, R²: 0.2078
  ➤ Layer 3
     MAE: 49.0018, R²: 0.2344
  ➤ Layer 4
     MAE: 48.8974, R²: 0.2583
  ➤ Layer 5
     MAE: 47.8924, R²: 0.2953
  ➤ Layer 6
     MAE: 48.0836, R²: 0.2945
  ➤ Layer 7
     MAE: 47.2897, R²: 0.3149
  ➤ Layer 8
     MAE: 46.6379, R²: 0.3251
  ➤ Layer 9
     MAE: 45.8310, R²: 0.3356
  ➤ Layer 10
     MAE: 46.3436, R²: 0.3286
  ➤ Layer 11
     MAE: 46.8932, R²: 0.3170
  ➤ Layer 12
     MAE: 50.1811, R²: 0.2304

Fold 3/5 - Concreteness


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [08:01<00:00, 199.57it/s]
Test Embeddings: 100%|██████████| 30000/30000 [02:31<00:00, 198.33it/s]


  ➤ Layer 0
     MAE: 45.7154, R²: -0.0000
  ➤ Layer 1
     MAE: 49.5775, R²: 0.1781
  ➤ Layer 2
     MAE: 49.5344, R²: 0.1988
  ➤ Layer 3
     MAE: 49.4169, R²: 0.2230
  ➤ Layer 4
     MAE: 49.4427, R²: 0.2459
  ➤ Layer 5
     MAE: 48.0861, R²: 0.2909
  ➤ Layer 6
     MAE: 47.9715, R²: 0.2927
  ➤ Layer 7
     MAE: 47.5627, R²: 0.3016
  ➤ Layer 8
     MAE: 47.2993, R²: 0.3078
  ➤ Layer 9
     MAE: 46.6455, R²: 0.3201
  ➤ Layer 10
     MAE: 46.8747, R²: 0.3147
  ➤ Layer 11
     MAE: 47.4897, R²: 0.3054
  ➤ Layer 12
     MAE: 50.6895, R²: 0.2233

Fold 4/5 - Concreteness


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [08:01<00:00, 199.39it/s]
Test Embeddings: 100%|██████████| 30000/30000 [02:31<00:00, 198.64it/s]


  ➤ Layer 0
     MAE: 44.8070, R²: -0.0000
  ➤ Layer 1
     MAE: 48.7706, R²: 0.1781
  ➤ Layer 2
     MAE: 48.7825, R²: 0.1982
  ➤ Layer 3
     MAE: 48.6742, R²: 0.2283
  ➤ Layer 4
     MAE: 48.2014, R²: 0.2601
  ➤ Layer 5
     MAE: 47.1832, R²: 0.2930
  ➤ Layer 6
     MAE: 47.3622, R²: 0.2905
  ➤ Layer 7
     MAE: 47.0306, R²: 0.3038
  ➤ Layer 8
     MAE: 46.1932, R²: 0.3207
  ➤ Layer 9
     MAE: 45.6500, R²: 0.3305
  ➤ Layer 10
     MAE: 46.3131, R²: 0.3171
  ➤ Layer 11
     MAE: 47.0491, R²: 0.3072
  ➤ Layer 12
     MAE: 50.0741, R²: 0.2219

Fold 5/5 - Concreteness


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [08:01<00:00, 199.57it/s]
Test Embeddings: 100%|██████████| 30000/30000 [02:30<00:00, 199.30it/s]


  ➤ Layer 0
     MAE: 45.5360, R²: -0.0000
  ➤ Layer 1
     MAE: 49.4075, R²: 0.1778
  ➤ Layer 2
     MAE: 49.1445, R²: 0.2053
  ➤ Layer 3
     MAE: 49.1125, R²: 0.2308
  ➤ Layer 4
     MAE: 48.8947, R²: 0.2580
  ➤ Layer 5
     MAE: 47.9237, R²: 0.2955
  ➤ Layer 6
     MAE: 47.7258, R²: 0.3013
  ➤ Layer 7
     MAE: 47.4137, R²: 0.3145
  ➤ Layer 8
     MAE: 46.8062, R²: 0.3288
  ➤ Layer 9
     MAE: 46.5253, R²: 0.3362
  ➤ Layer 10
     MAE: 46.7952, R²: 0.3298
  ➤ Layer 11
     MAE: 47.4056, R²: 0.3195
  ➤ Layer 12
     MAE: 50.5348, R²: 0.2347


In [10]:
# Probe syllable count; the numpy targets are handed over as a plain Python list.
results_nsyl = probing_cv(words, labels=nsyl.tolist(), feature_name="Number of Syllables")


Fold 1/5 - Number of Syllables


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [07:57<00:00, 201.16it/s]
Test Embeddings: 100%|██████████| 30000/30000 [02:29<00:00, 201.23it/s]


  ➤ Layer 0
     MAE: 1.3677, R²: -0.0000
  ➤ Layer 1
     MAE: 0.8720, R²: 0.4794
  ➤ Layer 2
     MAE: 0.8246, R²: 0.5294
  ➤ Layer 3
     MAE: 0.8106, R²: 0.5435
  ➤ Layer 4
     MAE: 0.7624, R²: 0.5888
  ➤ Layer 5
     MAE: 0.7538, R²: 0.5973
  ➤ Layer 6
     MAE: 0.7157, R²: 0.6298
  ➤ Layer 7
     MAE: 0.7085, R²: 0.6376
  ➤ Layer 8
     MAE: 0.7059, R²: 0.6420
  ➤ Layer 9
     MAE: 0.7081, R²: 0.6413
  ➤ Layer 10
     MAE: 0.7146, R²: 0.6343
  ➤ Layer 11
     MAE: 0.7354, R²: 0.6178
  ➤ Layer 12
     MAE: 0.7668, R²: 0.5954

Fold 2/5 - Number of Syllables


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [07:54<00:00, 202.32it/s]
Test Embeddings: 100%|██████████| 30000/30000 [02:29<00:00, 201.08it/s]


  ➤ Layer 0
     MAE: 1.3650, R²: -0.0000
  ➤ Layer 1
     MAE: 0.8758, R²: 0.4744
  ➤ Layer 2
     MAE: 0.8294, R²: 0.5242
  ➤ Layer 3
     MAE: 0.8176, R²: 0.5338
  ➤ Layer 4
     MAE: 0.7651, R²: 0.5841
  ➤ Layer 5
     MAE: 0.7553, R²: 0.5943
  ➤ Layer 6
     MAE: 0.7141, R²: 0.6296
  ➤ Layer 7
     MAE: 0.7095, R²: 0.6353
  ➤ Layer 8
     MAE: 0.7052, R²: 0.6391
  ➤ Layer 9
     MAE: 0.7039, R²: 0.6400
  ➤ Layer 10
     MAE: 0.7119, R²: 0.6329
  ➤ Layer 11
     MAE: 0.7304, R²: 0.6165
  ➤ Layer 12
     MAE: 0.7640, R²: 0.5929

Fold 3/5 - Number of Syllables


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [07:55<00:00, 201.96it/s]
Test Embeddings: 100%|██████████| 30000/30000 [02:29<00:00, 201.24it/s]


  ➤ Layer 0
     MAE: 1.3687, R²: -0.0001
  ➤ Layer 1
     MAE: 0.8830, R²: 0.4738
  ➤ Layer 2
     MAE: 0.8365, R²: 0.5227
  ➤ Layer 3
     MAE: 0.8223, R²: 0.5348
  ➤ Layer 4
     MAE: 0.7684, R²: 0.5834
  ➤ Layer 5
     MAE: 0.7562, R²: 0.5965
  ➤ Layer 6
     MAE: 0.7208, R²: 0.6257
  ➤ Layer 7
     MAE: 0.7119, R²: 0.6347
  ➤ Layer 8
     MAE: 0.7098, R²: 0.6362
  ➤ Layer 9
     MAE: 0.7094, R²: 0.6368
  ➤ Layer 10
     MAE: 0.7157, R²: 0.6315
  ➤ Layer 11
     MAE: 0.7355, R²: 0.6158
  ➤ Layer 12
     MAE: 0.7719, R²: 0.5908

Fold 4/5 - Number of Syllables


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [07:54<00:00, 202.37it/s]
Test Embeddings: 100%|██████████| 30000/30000 [02:29<00:00, 201.23it/s]


  ➤ Layer 0
     MAE: 1.3723, R²: -0.0000
  ➤ Layer 1
     MAE: 0.8762, R²: 0.4765
  ➤ Layer 2
     MAE: 0.8283, R²: 0.5300
  ➤ Layer 3
     MAE: 0.8159, R²: 0.5401
  ➤ Layer 4
     MAE: 0.7644, R²: 0.5879
  ➤ Layer 5
     MAE: 0.7524, R²: 0.6011
  ➤ Layer 6
     MAE: 0.7134, R²: 0.6347
  ➤ Layer 7
     MAE: 0.7052, R²: 0.6429
  ➤ Layer 8
     MAE: 0.7011, R²: 0.6458
  ➤ Layer 9
     MAE: 0.7034, R²: 0.6441
  ➤ Layer 10
     MAE: 0.7085, R²: 0.6380
  ➤ Layer 11
     MAE: 0.7259, R²: 0.6244
  ➤ Layer 12
     MAE: 0.7640, R²: 0.5972

Fold 5/5 - Number of Syllables


  return forward_call(*args, **kwargs)
Train Embeddings: 100%|██████████| 96000/96000 [07:55<00:00, 201.74it/s]
Test Embeddings: 100%|██████████| 30000/30000 [02:28<00:00, 201.57it/s]


  ➤ Layer 0
     MAE: 1.3628, R²: -0.0001
  ➤ Layer 1
     MAE: 0.8667, R²: 0.4837
  ➤ Layer 2
     MAE: 0.8201, R²: 0.5339
  ➤ Layer 3
     MAE: 0.8092, R²: 0.5446
  ➤ Layer 4
     MAE: 0.7574, R²: 0.5934
  ➤ Layer 5
     MAE: 0.7485, R²: 0.6044
  ➤ Layer 6
     MAE: 0.7109, R²: 0.6355
  ➤ Layer 7
     MAE: 0.7048, R²: 0.6430
  ➤ Layer 8
     MAE: 0.7002, R²: 0.6472
  ➤ Layer 9
     MAE: 0.6989, R²: 0.6475
  ➤ Layer 10
     MAE: 0.7050, R²: 0.6422
  ➤ Layer 11
     MAE: 0.7208, R²: 0.6276
  ➤ Layer 12
     MAE: 0.7578, R²: 0.6019


In [11]:
def save_results_to_csv(results, feature_name, output_dir=None):
    """Flatten per-layer, per-fold probing scores into a CSV file.

    Args:
        results: Dict mapping a layer key to ``{'mae': [...], 'r2': [...]}``;
            the two lists are read in parallel, one entry per fold.
        feature_name: Written to the ``feature`` column; its lower-cased form
            is also the file-name prefix.
        output_dir: Directory to write into. Defaults to the notebook-level
            ``EMBEDDING_DIR``. Created if it does not exist.

    The file is ``<output_dir>/<feature_name.lower()>_probing_results.csv`` with
    columns ``feature, layer, fold, mae, r2`` and no index column. The header
    row is written even when ``results`` is empty.
    """
    if output_dir is None:
        output_dir = EMBEDDING_DIR
    # Do not rely on an earlier cell having created the directory.
    os.makedirs(output_dir, exist_ok=True)

    columns = ['feature', 'layer', 'fold', 'mae', 'r2']
    records = [
        {
            'feature': feature_name,
            'layer': layer,
            'fold': fold,
            'mae': mae,
            'r2': scores['r2'][fold],
        }
        for layer, scores in results.items()
        for fold, mae in enumerate(scores['mae'])
    ]
    # Explicit columns keep the header stable even with zero records.
    df = pd.DataFrame(records, columns=columns)
    df.to_csv(
        os.path.join(output_dir, f"{feature_name.lower()}_probing_results.csv"),
        index=False,
    )

# Write one CSV per probed feature, in the same order as the probing runs.
for _results, _feature_name in (
    (results_img, "Imageability"),
    (results_conc, "Concreteness"),
    (results_nsyl, "Number of Syllables"),
):
    save_results_to_csv(_results, _feature_name)