In [3]:
pip install conllu

Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from tqdm.auto import tqdm
import os
import pickle
import pyphen
import conllu 
from transformers import (
    BertTokenizerFast, 
    BertModel
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression

print("Step 1: Data Preparation for Probing")

file_path = "en_ewt-ud-train.conllu"

try:
    print(f"Loading data from local file: {file_path}...")
    with open(file_path, "r", encoding="utf-8") as f:
        parsed_data = conllu.parse(f.read())
    
    words = []
    char_labels_data = []
    syll_labels_data = []

    char_label_map = {0: 'short', 1: 'medium', 2: 'long'}
    syll_label_map = {0: 'short', 1: 'medium', 2: 'long'}
    
    dic = pyphen.Pyphen(lang='en_US')

    def create_labels(token_list, feature_type='chars'):
        labels = []
        for token in token_list:
            if feature_type == 'chars':
                length = len(token)
                if length <= 4: labels.append(0)
                elif length <= 8: labels.append(1)
                else: labels.append(2)
            elif feature_type == 'syllables':
                syllables = dic.inserted(token.lower()).count('-') + 1 if token.strip() else 0
                if syllables <= 1: labels.append(0)
                elif syllables <= 2: labels.append(1)
                else: labels.append(2)
        return labels

    for sentence in tqdm(parsed_data, desc="Processing sentences"):
        tokens = [token['form'] for token in sentence]
        words.append(tokens)
        char_labels_data.append(create_labels(tokens, 'chars'))
        syll_labels_data.append(create_labels(tokens, 'syllables'))

    probing_dataset = Dataset.from_dict({
        'tokens': words,
        'char_labels': char_labels_data,
        'syll_labels': syll_labels_data
    })
    
    SUBSET_SIZE = 2000
    if len(probing_dataset) > SUBSET_SIZE:
        probing_dataset = probing_dataset.shuffle(seed=42).select(range(SUBSET_SIZE))
    
    print(f"Total sentences in probing subset: {len(probing_dataset)}")

except Exception as e:
    print(f"Error preparing dataset: {e}")
    exit()


  from .autonotebook import tqdm as notebook_tqdm


Step 1: Data Preparation for Probing
Loading data from local file: en_ewt-ud-train.conllu...


Processing sentences: 100%|██████████| 12544/12544 [00:01<00:00, 10892.90it/s]


Total sentences in probing subset: 2000


In [5]:
print("\n Step 2: Probing Setup on Pre-Trained Model ")

model_name = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pre_trained_model = BertModel.from_pretrained(model_name, output_hidden_states=True).to(device)
pre_trained_model.eval()

print(f"Using device: {device}")


PROBE_RESULTS_DIR = "/home/sharmajidotdev/manish/probing_bert_surfacelevel_results"
os.makedirs(PROBE_RESULTS_DIR, exist_ok=True)

# probing function
def get_word_level_embeddings_single_sentence(sentence_tokens, labels_for_sentence, tokenizer, model, device):
    encoded_inputs = tokenizer(sentence_tokens, is_split_into_words=True, return_tensors='pt', padding=True, truncation=True, return_offsets_mapping=True, max_length=256)
    input_ids, attention_mask = encoded_inputs['input_ids'].to(device), encoded_inputs['attention_mask'].to(device)
    word_ids_list = encoded_inputs.word_ids()

    if not isinstance(word_ids_list, list) or not word_ids_list or not labels_for_sentence: return None, None
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    hidden_states = outputs.hidden_states
    sentence_hidden_states_all_layers = [hs[0] for hs in hidden_states]
    
    word_to_subword_indices = {}
    for token_idx, word_idx in enumerate(word_ids_list):
        if word_idx is None or token_idx >= attention_mask.shape[1] or attention_mask[0, token_idx].item() == 0 or word_idx < 0: continue
        if word_idx not in word_to_subword_indices: word_to_subword_indices[word_idx] = []
        word_to_subword_indices[word_idx].append(token_idx)
    if not word_to_subword_indices: return None, None
    sorted_word_indices = sorted(word_to_subword_indices.keys())
    if len(sorted_word_indices) != len(labels_for_sentence): return None, None
    
    current_sentence_word_embeddings_by_layer = [[] for _ in range(7)]
    aligned_labels_for_sentence_output = []
    
    for original_word_idx in sorted_word_indices:
        subword_token_indices = word_to_subword_indices[original_word_idx]
        if not subword_token_indices: continue
        for layer_idx in range(7):
            if any(idx >= sentence_hidden_states_all_layers[layer_idx].shape[0] for idx in subword_token_indices): return None, None
            subword_embs = sentence_hidden_states_all_layers[layer_idx][subword_token_indices, :]
            current_sentence_word_embeddings_by_layer[layer_idx].append(subword_embs.mean(dim=0).cpu().numpy())
        aligned_labels_for_sentence_output.append(labels_for_sentence[original_word_idx])
    
    processed_word_embeddings_tensors = []
    for layer_idx in range(7):
        if current_sentence_word_embeddings_by_layer[layer_idx]:
            processed_word_embeddings_tensors.append(torch.from_numpy(np.vstack(current_sentence_word_embeddings_by_layer[layer_idx])).float())
        else: return None, None
    return processed_word_embeddings_tensors, aligned_labels_for_sentence_output



 Step 2: Probing Setup on Pre-Trained Model 
Using device: cuda


In [None]:
print("\n Step 3: Probing Pre-Trained Model Across 5 Folds ")

probe_results_char = {f"layer_{i}": {'accuracy': [], 'f1': []} for i in range(7)}
probe_results_syll = {f"layer_{i}": {'accuracy': [], 'f1': []} for i in range(7)}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold_idx in range(5):
    print(f"\n--- Probing Fold {fold_idx + 1} ---")
    
    # extracting embeddings  
    sentences_to_probe = probing_dataset['tokens']
    labels_nchar_to_probe = probing_dataset['char_labels']
    labels_nsyll_to_probe = probing_dataset['syll_labels']
    
    # probe for NCHAR 
    print("\nExtracting embeddings from pre-trained model and training probes for NCHAR...")
    all_embs_nchar = [[] for _ in range(7)]
    all_labels_nchar = []
    for i in tqdm(range(len(sentences_to_probe)), desc="NCHAR Probing"):
        embs, labels = get_word_level_embeddings_single_sentence(sentences_to_probe[i], labels_nchar_to_probe[i], tokenizer, pre_trained_model, device)
        if embs:
            for l_idx, e_list in enumerate(embs):
                all_embs_nchar[l_idx].append(e_list)
            all_labels_nchar.extend(labels)
    
    concatenated_embs_nchar = [torch.cat(embs, dim=0) for embs in all_embs_nchar]
    y_nchar = np.array(all_labels_nchar)

    for l_idx in range(7):
        X = concatenated_embs_nchar[l_idx].cpu().numpy()
        y = y_nchar
        for train_idx, val_idx in skf.split(X, y):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            
            probe = LogisticRegression(max_iter=1000, multi_class='multinomial', n_jobs=-1).fit(X_train, y_train)
            y_pred = probe.predict(X_val)
            probe_results_char[f"layer_{l_idx}"]['accuracy'].append(accuracy_score(y_val, y_pred))
            probe_results_char[f"layer_{l_idx}"]['f1'].append(f1_score(y_val, y_pred, average='weighted'))

    # probe for NSYLL 
    print("\nExtracting embeddings from pre-trained model and training probes for NSYLL...")
    all_embs_nsyll = [[] for _ in range(7)]
    all_labels_nsyll = []
    for i in tqdm(range(len(sentences_to_probe)), desc="NSYLL Probing"):
        embs, labels = get_word_level_embeddings_single_sentence(sentences_to_probe[i], labels_nsyll_to_probe[i], tokenizer, pre_trained_model, device)
        if embs:
            for l_idx, e_list in enumerate(embs):
                all_embs_nsyll[l_idx].append(e_list)
            all_labels_nsyll.extend(labels)
    
    concatenated_embs_nsyll = [torch.cat(embs, dim=0) for embs in all_embs_nsyll]
    y_nsyll = np.array(all_labels_nsyll)

    for l_idx in range(7):
        X = concatenated_embs_nsyll[l_idx].cpu().numpy()
        y = y_nsyll
        for train_idx, val_idx in skf.split(X, y):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            
            probe = LogisticRegression(max_iter=1000, multi_class='multinomial', n_jobs=-1).fit(X_train, y_train)
            y_pred = probe.predict(X_val)
            probe_results_syll[f"layer_{l_idx}"]['accuracy'].append(accuracy_score(y_val, y_pred))
            probe_results_syll[f"layer_{l_idx}"]['f1'].append(f1_score(y_val, y_pred, average='weighted'))
    
    # saving probing results for this fold
    current_fold_results = {
        'nchar': probe_results_char,
        'nsyll': probe_results_syll
    }
    results_file_path = os.path.join(PROBE_RESULTS_DIR, f"fold_{fold_idx}_probing_results.pkl")
    with open(results_file_path, 'wb') as f:
        pickle.dump(current_fold_results, f)
    print(f"Saved probing results for Fold {fold_idx+1} to {results_file_path}")
    



 Step 3: Probing Pre-Trained Model Across 5 Folds 

--- Probing Fold 1 ---

Extracting embeddings from pre-trained model and training probes for NCHAR...


NCHAR Probing:   0%|          | 0/2000 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)
NCHAR Probing: 100%|██████████| 2000/2000 [00:32<00:00, 60.87it/s]



Extracting embeddings from pre-trained model and training probes for NSYLL...


  return forward_call(*args, **kwargs)
NSYLL Probing: 100%|██████████| 2000/2000 [00:20<00:00, 96.29it/s] 


Saved probing results for Fold 1 to /home/sharmajidotdev/manish/probing_bert_surfacelevel_results/fold_0_probing_results.pkl

--- Probing Fold 2 ---

Extracting embeddings from pre-trained model and training probes for NCHAR...


  return forward_call(*args, **kwargs)
NCHAR Probing: 100%|██████████| 2000/2000 [00:20<00:00, 98.38it/s] 



Extracting embeddings from pre-trained model and training probes for NSYLL...


  return forward_call(*args, **kwargs)
NSYLL Probing: 100%|██████████| 2000/2000 [00:19<00:00, 102.44it/s]


Saved probing results for Fold 2 to /home/sharmajidotdev/manish/probing_bert_surfacelevel_results/fold_1_probing_results.pkl

--- Probing Fold 3 ---

Extracting embeddings from pre-trained model and training probes for NCHAR...


  return forward_call(*args, **kwargs)
NCHAR Probing: 100%|██████████| 2000/2000 [00:20<00:00, 99.70it/s] 



Extracting embeddings from pre-trained model and training probes for NSYLL...


  return forward_call(*args, **kwargs)
NSYLL Probing: 100%|██████████| 2000/2000 [00:20<00:00, 95.83it/s] 


Saved probing results for Fold 3 to /home/sharmajidotdev/manish/probing_bert_surfacelevel_results/fold_2_probing_results.pkl

--- Probing Fold 4 ---

Extracting embeddings from pre-trained model and training probes for NCHAR...


  return forward_call(*args, **kwargs)
NCHAR Probing: 100%|██████████| 2000/2000 [00:20<00:00, 98.74it/s] 



Extracting embeddings from pre-trained model and training probes for NSYLL...


  return forward_call(*args, **kwargs)
NSYLL Probing: 100%|██████████| 2000/2000 [00:19<00:00, 100.18it/s]


Saved probing results for Fold 4 to /home/sharmajidotdev/manish/probing_bert_surfacelevel_results/fold_3_probing_results.pkl

--- Probing Fold 5 ---

Extracting embeddings from pre-trained model and training probes for NCHAR...


  return forward_call(*args, **kwargs)
NCHAR Probing: 100%|██████████| 2000/2000 [00:19<00:00, 100.05it/s]



Extracting embeddings from pre-trained model and training probes for NSYLL...


  return forward_call(*args, **kwargs)
NSYLL Probing: 100%|██████████| 2000/2000 [00:20<00:00, 98.19it/s] 


Saved probing results for Fold 5 to /home/sharmajidotdev/manish/probing_bert_surfacelevel_results/fold_4_probing_results.pkl


In [9]:
import pickle
import os

file_path = os.path.join("probing_bert_surfacelevel_results/fold_0_probing_results.pkl") 

try:
    with open(file_path, 'rb') as f:
        loaded_results = pickle.load(f)
    print("Successfully loaded results from the pickle file.")
    print(loaded_results)

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred while loading the pickle file: {e}")

Successfully loaded results from the pickle file.
{'nchar': {'layer_0': {'accuracy': [0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282], 'f1': [0.9381815377855119, 0.9338270902352749, 0.9411605335048934, 0.9364734457851955, 0.9447595189924847]}, 'layer_1': {'accuracy': [0.9319284414796847, 0.9255609460278957, 0.9311704063068527, 0.9313220133414191, 0.9322316555488175], 'f1': [0.931081400117691, 0.924928006405919, 0.9309868335000552, 0.9307680376444735, 0.9316976403425598]}, 'layer_2': {'accuracy': [0.9223771983020013, 0.9176773802304427, 0.9246513038204973, 0.9228320194057005, 0.9217707701637355], 'f1': [0.9216513382221326, 0.9166263097848905, 0.9247425918774347, 0.9224441378932933, 0.9209903296418331]}, 'layer_3': {'accuracy': [0.9161613098847786, 0.9104002425712553, 0.9185870224378411, 0.9179805942995755, 0.9173741661613098], 'f1': [0.9156310598321905, 0.9099974634583802, 0.918470316630831, 0.917113149942464, 0.9164234880406473]}, 'la

In [10]:
import pickle
import os

file_path = os.path.join("probing_bert_surfacelevel_results/fold_1_probing_results.pkl") 

try:
    with open(file_path, 'rb') as f:
        loaded_results = pickle.load(f)
    print("Successfully loaded results from the pickle file.")
    print(loaded_results)

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred while loading the pickle file: {e}")

Successfully loaded results from the pickle file.
{'nchar': {'layer_0': {'accuracy': [0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282, 0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282], 'f1': [0.9381815377855119, 0.9338270902352749, 0.9411605335048934, 0.9364734457851955, 0.9447595189924847, 0.9381815377855119, 0.9338270902352749, 0.9411605335048934, 0.9364734457851955, 0.9447595189924847]}, 'layer_1': {'accuracy': [0.9319284414796847, 0.9255609460278957, 0.9311704063068527, 0.9313220133414191, 0.9322316555488175, 0.9319284414796847, 0.9255609460278957, 0.9311704063068527, 0.9313220133414191, 0.9322316555488175], 'f1': [0.931081400117691, 0.924928006405919, 0.9309868335000552, 0.9307680376444735, 0.9316976403425598, 0.931081400117691, 0.924928006405919, 0.9309868335000552, 0.9307680376444735, 0.9316976403425598]}, 'layer_2': {'accuracy': [0.9223771983020013, 0.9176773802304427, 0.924651

In [11]:
import pickle
import os

file_path = os.path.join("probing_bert_surfacelevel_results/fold_2_probing_results.pkl") 

try:
    with open(file_path, 'rb') as f:
        loaded_results = pickle.load(f)
    print("Successfully loaded results from the pickle file.")
    print(loaded_results)

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred while loading the pickle file: {e}")

Successfully loaded results from the pickle file.
{'nchar': {'layer_0': {'accuracy': [0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282, 0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282, 0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282], 'f1': [0.9381815377855119, 0.9338270902352749, 0.9411605335048934, 0.9364734457851955, 0.9447595189924847, 0.9381815377855119, 0.9338270902352749, 0.9411605335048934, 0.9364734457851955, 0.9447595189924847, 0.9381815377855119, 0.9338270902352749, 0.9411605335048934, 0.9364734457851955, 0.9447595189924847]}, 'layer_1': {'accuracy': [0.9319284414796847, 0.9255609460278957, 0.9311704063068527, 0.9313220133414191, 0.9322316555488175, 0.9319284414796847, 0.9255609460278957, 0.9311704063068527, 0.9313220133414191, 0.9322316555488175, 0.9319284414796847, 0.9255609460278957, 0.9311704063068527, 0.9313220133414191,

In [12]:
import pickle
import os

file_path = os.path.join("probing_bert_surfacelevel_results/fold_3_probing_results.pkl") 

try:
    with open(file_path, 'rb') as f:
        loaded_results = pickle.load(f)
    print("Successfully loaded results from the pickle file.")
    print(loaded_results)

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred while loading the pickle file: {e}")

Successfully loaded results from the pickle file.
{'nchar': {'layer_0': {'accuracy': [0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282, 0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282, 0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282, 0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282], 'f1': [0.9381815377855119, 0.9338270902352749, 0.9411605335048934, 0.9364734457851955, 0.9447595189924847, 0.9381815377855119, 0.9338270902352749, 0.9411605335048934, 0.9364734457851955, 0.9447595189924847, 0.9381815377855119, 0.9338270902352749, 0.9411605335048934, 0.9364734457851955, 0.9447595189924847, 0.9381815377855119, 0.9338270902352749, 0.9411605335048934, 0.9364734457851955, 0.9447595189924847]}, 'layer_1': {'accuracy': [0.9319284414796847, 0.9255609460278957, 0.9311704063068527, 0.9313220133414191,

In [13]:
import pickle
import os

file_path = os.path.join("probing_bert_surfacelevel_results/fold_4_probing_results.pkl") 

try:
    with open(file_path, 'rb') as f:
        loaded_results = pickle.load(f)
    print("Successfully loaded results from the pickle file.")
    print(loaded_results)

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred while loading the pickle file: {e}")

Successfully loaded results from the pickle file.
{'nchar': {'layer_0': {'accuracy': [0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282, 0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282, 0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282, 0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282, 0.9387507580351728, 0.9348089751364463, 0.9417828987265009, 0.9372346876895088, 0.9452698605215282], 'f1': [0.9381815377855119, 0.9338270902352749, 0.9411605335048934, 0.9364734457851955, 0.9447595189924847, 0.9381815377855119, 0.9338270902352749, 0.9411605335048934, 0.9364734457851955, 0.9447595189924847, 0.9381815377855119, 0.9338270902352749, 0.9411605335048934, 0.9364734457851955, 0.9447595189924847, 0.9381815377855119, 0.9338270902352749, 0.9411605335048934, 0.9364734457851955, 0.9447595189924847, 0.9381