In [2]:
pip install conllu

Note: you may need to restart the kernel to use updated packages.


In [None]:
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizerFast, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

url = "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu"
cols = ["id","form","lemma","upos","xpos","feats","head","deprel","deps","misc"]
conllu = pd.read_csv(url, comment="#", sep="\t", names=cols)

tokens_df = conllu[conllu["form"].notnull() & conllu["upos"].notnull()][["id", "form", "upos"]].reset_index(drop=True)



In [None]:
sentences = []
labels = []
current_sentence = []
current_labels = []

for _, row in tokens_df.iterrows():
    if row["id"] == 1 and current_sentence:
        sentences.append(current_sentence)
        labels.append(current_labels)
        current_sentence = []
        current_labels = []
    current_sentence.append(row["form"])
    current_labels.append(row["upos"])

if current_sentence:
    sentences.append(current_sentence)
    labels.append(current_labels)

unique_upos = sorted(set(tag for seq in labels for tag in seq))
upos_to_id = {tag: idx for idx, tag in enumerate(unique_upos)}
id_to_upos = {idx: tag for tag, idx in upos_to_id.items()}


In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
def extract_embeddings(sentence, upos_tags):
    encoded = tokenizer(
        sentence,
        is_split_into_words=True,
        return_tensors="pt",
        return_attention_mask=True,
        return_offsets_mapping=True,
        padding=True,
        truncation=True
    ).to(device)

    with torch.no_grad():
        input_keys = ['input_ids', 'attention_mask', 'token_type_ids']
        model_inputs = {k: v for k, v in encoded.items() if k in input_keys}
        output = model(**model_inputs)
    hidden_states = output.hidden_states  

    word_ids = encoded.word_ids()
    word_to_token_indices = {}
    for idx, word_id in enumerate(word_ids):
        if word_id is not None:
            word_to_token_indices.setdefault(word_id, []).append(idx)

    all_layer_embs = [[] for _ in range(len(hidden_states))]
    aligned_labels = []

    for word_idx, token_idxs in word_to_token_indices.items():
        if word_idx >= len(upos_tags): continue
        aligned_labels.append(upos_to_id[upos_tags[word_idx]])
        for layer_idx in range(len(hidden_states)):
            vectors = hidden_states[layer_idx][0, token_idxs, :]
            mean_vec = vectors.mean(dim=0).cpu().numpy()
            all_layer_embs[layer_idx].append(mean_vec)

    return all_layer_embs, aligned_labels



In [None]:
all_embeddings_by_layer = [[] for _ in range(model.config.num_hidden_layers + 1)]
all_labels = []

print("Extracting embeddings...")
for sentence, tag_seq in tqdm(zip(sentences, labels), total=len(sentences)):
    embs, labs = extract_embeddings(sentence, tag_seq)
    if len(labs) == 0: continue
    for l in range(len(embs)):
        all_embeddings_by_layer[l].extend(embs[l])
    all_labels.extend(labs)



Extracting embeddings...


100%|██████████| 1/1 [00:05<00:00,  5.64s/it]


In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

print("Starting 5-fold cross-validation...")
for layer_idx, layer_embs in enumerate(all_embeddings_by_layer):
    X = np.array(layer_embs)
    y = np.array(all_labels)

    fold_num = 1
    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        clf = LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='multinomial', n_jobs=-1)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)

        acc = accuracy_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred, average='weighted')

        results.append({
            'layer': layer_idx,
            'fold': fold_num,
            'accuracy': acc,
            'f1_weighted': f1
        })
        print(f"Layer {layer_idx}, Fold {fold_num}: Accuracy={acc:.4f}, F1={f1:.4f}")
        fold_num += 1



Starting 5-fold cross-validation...




Layer 0, Fold 1: Accuracy=0.8222, F1=0.8126




Layer 0, Fold 2: Accuracy=0.8333, F1=0.8216




Layer 0, Fold 3: Accuracy=0.8222, F1=0.7945




Layer 0, Fold 4: Accuracy=0.8667, F1=0.8594




Layer 0, Fold 5: Accuracy=0.7778, F1=0.7533




Layer 1, Fold 1: Accuracy=0.8556, F1=0.8468




Layer 1, Fold 2: Accuracy=0.8778, F1=0.8722




Layer 1, Fold 3: Accuracy=0.8444, F1=0.8276




Layer 1, Fold 4: Accuracy=0.9000, F1=0.8991




Layer 1, Fold 5: Accuracy=0.8333, F1=0.8173




Layer 2, Fold 1: Accuracy=0.8889, F1=0.8883




Layer 2, Fold 2: Accuracy=0.9000, F1=0.8976




Layer 2, Fold 3: Accuracy=0.8889, F1=0.8830




Layer 2, Fold 4: Accuracy=0.9222, F1=0.9220




Layer 2, Fold 5: Accuracy=0.8778, F1=0.8704




Layer 3, Fold 1: Accuracy=0.8889, F1=0.8837




Layer 3, Fold 2: Accuracy=0.9222, F1=0.9185




Layer 3, Fold 3: Accuracy=0.8889, F1=0.8824




Layer 3, Fold 4: Accuracy=0.9111, F1=0.9112




Layer 3, Fold 5: Accuracy=0.9222, F1=0.9133




Layer 4, Fold 1: Accuracy=0.8889, F1=0.8882




Layer 4, Fold 2: Accuracy=0.9333, F1=0.9294




Layer 4, Fold 3: Accuracy=0.8889, F1=0.8752




Layer 4, Fold 4: Accuracy=0.8778, F1=0.8712




Layer 4, Fold 5: Accuracy=0.8889, F1=0.8698




Layer 5, Fold 1: Accuracy=0.9000, F1=0.8951




Layer 5, Fold 2: Accuracy=0.9000, F1=0.8820




Layer 5, Fold 3: Accuracy=0.9000, F1=0.8842




Layer 5, Fold 4: Accuracy=0.8778, F1=0.8711




Layer 5, Fold 5: Accuracy=0.8778, F1=0.8582




Layer 6, Fold 1: Accuracy=0.9000, F1=0.8974




Layer 6, Fold 2: Accuracy=0.9111, F1=0.8970




Layer 6, Fold 3: Accuracy=0.9222, F1=0.9089




Layer 6, Fold 4: Accuracy=0.8778, F1=0.8705




Layer 6, Fold 5: Accuracy=0.8778, F1=0.8553




Layer 7, Fold 1: Accuracy=0.8889, F1=0.8821




Layer 7, Fold 2: Accuracy=0.8889, F1=0.8657




Layer 7, Fold 3: Accuracy=0.8889, F1=0.8713




Layer 7, Fold 4: Accuracy=0.8889, F1=0.8806




Layer 7, Fold 5: Accuracy=0.8889, F1=0.8798




Layer 8, Fold 1: Accuracy=0.8667, F1=0.8527




Layer 8, Fold 2: Accuracy=0.8778, F1=0.8622




Layer 8, Fold 3: Accuracy=0.9000, F1=0.8818




Layer 8, Fold 4: Accuracy=0.8778, F1=0.8647




Layer 8, Fold 5: Accuracy=0.8778, F1=0.8682




Layer 9, Fold 1: Accuracy=0.8667, F1=0.8491




Layer 9, Fold 2: Accuracy=0.8778, F1=0.8650




Layer 9, Fold 3: Accuracy=0.8667, F1=0.8395




Layer 9, Fold 4: Accuracy=0.8667, F1=0.8573




Layer 9, Fold 5: Accuracy=0.9000, F1=0.8898




Layer 10, Fold 1: Accuracy=0.8778, F1=0.8691




Layer 10, Fold 2: Accuracy=0.8556, F1=0.8394




Layer 10, Fold 3: Accuracy=0.8667, F1=0.8361




Layer 10, Fold 4: Accuracy=0.8889, F1=0.8780




Layer 10, Fold 5: Accuracy=0.8778, F1=0.8577




Layer 11, Fold 1: Accuracy=0.8111, F1=0.7977




Layer 11, Fold 2: Accuracy=0.8667, F1=0.8494




Layer 11, Fold 3: Accuracy=0.8333, F1=0.8037




Layer 11, Fold 4: Accuracy=0.8667, F1=0.8587




Layer 11, Fold 5: Accuracy=0.8667, F1=0.8550




Layer 12, Fold 1: Accuracy=0.8778, F1=0.8619




Layer 12, Fold 2: Accuracy=0.8444, F1=0.8231




Layer 12, Fold 3: Accuracy=0.8333, F1=0.7974




Layer 12, Fold 4: Accuracy=0.8444, F1=0.8345




Layer 12, Fold 5: Accuracy=0.8556, F1=0.8494


In [None]:
results_df = pd.DataFrame(results)
results_df.to_csv("bert_pos_probing_5fold_results.csv", index=False)
print("Results saved to 'bert_pos_probing_5fold_results.csv'")


Results saved to 'bert_pos_probing_5fold_results.csv'
