In [None]:
import sys
import os
import pandas as pd
import math

from tqdm import trange


In [None]:
# Add the directory containing lingpred to sys.path.
# NOTE: this is an absolute, machine-specific path; edit it (or switch to the
# commented-out alternative below) when running the notebook on another machine.
sys.path.append(os.path.abspath("/Users/xiaosuhu/Python_Proj/Pilot_Analysis_3_Way_Split/lingpred/lingpred"))
# Alternative location on a Windows drive:
# sys.path.append(os.path.abspath("F:\Matlab_Project\PROJECT_HH_ENGLISH\Pilot_Analysis_3_Way_Split\lingpred\lingpred"))

# Imported only after the sys.path change above — presumably the import relies
# on it; verify before moving this line to the top-level import cell.
from lingpred.lingpred.pipeline import LingPred

In [None]:
# Load the CSV file into `df`; a later cell reads its "Text" column.
# The path starts with "./", so it is resolved against the current working directory.
df = pd.read_csv("./DesignMatrix/HH_time_matrix.csv")  # Replace with the actual filename

In [None]:

# Pull the 'Text' column out of the loaded table
text_column = df["Text"]

# Cast every entry to str, strip surrounding whitespace, and join the
# entries with single spaces into one long string
cleaned_text = " ".join(text_column.astype(str).str.strip())

print(cleaned_text)

In [None]:
# This section is used for testing the limit on how many words can be used at once.
# Split the cleaned text into words
words = cleaned_text.split()

# Take the first 201 words. NOTE: the variable name says "100", but the slice
# below keeps 201 words.
first_100_words = " ".join(words[:201])

print(first_100_words)
# The full text of about 2000 words cannot be passed in at once because it exceeds the limit
lp = LingPred()
lp.preprocess(first_100_words) # Load text
lp.fit() # Run GPT2-Small FFN
lp.get_lexical_info() # Get lexical surprisals
lp.get_POS_info() # Get POS labels and surprisals
lp.get_phonemic_info() # Get phonemic labels and phonemic surprisals

In [None]:
def _safe_last(arr):
    """Return last element of a list-like or None."""
    try:
        return arr[-1] if arr is not None and len(arr) > 0 else None
    except Exception:
        return None

def _safe_float(x):
    """Convert numpy/Python numbers to float, map inf/-inf to None."""
    try:
        if x is None:
            return None
        xf = float(x)
        if math.isfinite(xf):
            return xf
        return None
    except Exception:
        return None

def compute_suprisal_over_story(
    cleaned_text: str,
    window_words: int = 200,
    save_path: str = None,
    verbose_every: int = 200,
):
    """
    Rolling-context surprisal over an entire story.

    For each word i, LingPred is run on the preceding `window_words` words
    (fewer near the start of the story) PLUS the target word, and the final
    entry of each `lp.out_dict` array is recorded as the value for word i.

    Parameters
    ----------
    cleaned_text : str
        Story text; it is split on whitespace to obtain the word sequence.
    window_words : int, default 200
        Maximum number of context words placed before each target word.
        Must be >= 0.
    save_path : str, optional
        If truthy, the result table is also written there as CSV (no index).
    verbose_every : int, default 200
        A failure at word index i is printed when i is a multiple of this
        value. Independently of it, the first failure is always printed and
        a summary with the total failure count is printed at the end.
        Zero, negative or None only disables the periodic messages.

    Returns
    -------
    pandas.DataFrame
        One row per word, in story order, with the columns `orig_index`,
        `word`, `model_word`, `lex_surprisal`, `pos_surprisal`,
        `is_content_word`, `is_function_word`, `phoneme_initial`,
        `phoneme_surprisal`, `window_start_index`, `window_len_words`.
        Rows whose pipeline run raised keep None in every model-derived
        column so the table stays aligned with the word sequence.

    Raises
    ------
    ValueError
        If `window_words` is negative (every chunk would be empty).
    """
    if window_words < 0:
        raise ValueError(f"window_words must be >= 0, got {window_words}")

    # Fixed column order; passing it to DataFrame keeps the schema stable
    # even when the text contains no words at all.
    columns = [
        "orig_index",
        "word",
        "model_word",
        "lex_surprisal",
        "pos_surprisal",
        "is_content_word",
        "is_function_word",
        "phoneme_initial",
        "phoneme_surprisal",
        "window_start_index",
        "window_len_words",
    ]

    words_all = cleaned_text.split()
    n = len(words_all)
    rows = []
    n_failed = 0
    lp = LingPred()

    for i in trange(n, desc="Computing surprisals"):
        start = max(0, i - window_words)
        chunk_words = words_all[start : i + 1]       # include target
        text_chunk = " ".join(chunk_words)

        # Start from an all-None row; it is only filled in when the whole
        # pipeline succeeds, so a failure leaves an aligned placeholder.
        row = {
            "orig_index": i,                         # index in full story
            "word": words_all[i],                    # original token
            "model_word": None,
            "lex_surprisal": None,
            "pos_surprisal": None,
            "is_content_word": None,
            "is_function_word": None,
            "phoneme_initial": None,
            "phoneme_surprisal": None,
            "window_start_index": start,
            "window_len_words": len(chunk_words),
        }

        try:
            # Run the pipeline for this chunk
            lp.preprocess(text_chunk)
            lp.fit()
            lp.get_lexical_info()
            lp.get_POS_info()
            lp.get_phonemic_info()

            # The target is the *last* element of each output array
            # (defensive .get in case keys vary).
            out = lp.out_dict
            is_content = _safe_last(out.get("content_words"))
            is_function = _safe_last(out.get("function_words"))

            # The dict literal is fully evaluated before update(), so an
            # error in any field leaves the placeholder row untouched.
            row.update({
                # last word seen by the model in this chunk
                "model_word": _safe_last(getattr(lp, "words", chunk_words)) or words_all[i],
                "lex_surprisal": _safe_float(_safe_last(out.get("lexical_surprise"))),
                "pos_surprisal": _safe_float(_safe_last(out.get("POS_surprise"))),
                "is_content_word": bool(is_content) if is_content is not None else None,
                "is_function_word": bool(is_function) if is_function is not None else None,
                "phoneme_initial": _safe_last(out.get("phoneme")),
                "phoneme_surprisal": _safe_float(_safe_last(out.get("phonemic_surprise"))),
            })

        except Exception as e:
            # Keep going on errors; the None placeholder preserves alignment.
            n_failed += 1
            periodic = bool(verbose_every) and verbose_every > 0 and (i % verbose_every) == 0
            # Always surface the first failure so a systematic problem
            # (e.g. a broken model setup) is never completely silent.
            if n_failed == 1 or periodic:
                print(f"[warn] i={i}, error: {e}")

        rows.append(row)

    if n_failed:
        print(f"[warn] {n_failed} of {n} words failed; their rows contain None values")

    results = pd.DataFrame(rows, columns=columns)

    if save_path:
        results.to_csv(save_path, index=False)
        print(f"Saved results to {save_path}")

    return results


In [None]:

# Example use: run the rolling-window computation over the whole story.
# The function itself writes a first CSV copy of the table to `save_path`.
df_surprisal = compute_suprisal_over_story(
    cleaned_text=cleaned_text,
    window_words=200,
    save_path="story_surprisal_2.csv",
)

# Write the same table a second time under another file name, then preview it.
df_surprisal.to_csv("HH_surprisal_3_way.csv", index=False)
df_surprisal.head()
