<a href="https://colab.research.google.com/github/Rumata-arc/Probabilities_Surprisal/blob/main/Token-word_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# БЛОК 1: Excel -> единицы-ячейки + определение пунктуации
import re
import openpyxl

# Characters that count as punctuation when classifying a spreadsheet cell:
# sentence marks, dashes/hyphen, brackets, quotes (straight, curly, guillemets),
# slashes and the pipe.
PUNCT_CHARS = set(".,!?;:…—–-()[]{}«»\"'`“”„”“/\\|")


def is_punct_only_cell(s: str) -> bool:
    """Return True when a cell consists solely of punctuation characters.

    The value is converted with ``str()`` and stripped of surrounding
    whitespace before the check. ``None`` and empty / whitespace-only values
    yield False, as does any text containing a character outside
    ``PUNCT_CHARS`` (including inner whitespace).
    """
    if s is None:
        return False
    stripped = str(s).strip()
    # Guard the empty string explicitly: a superset test on "" would be True.
    return bool(stripped) and PUNCT_CHARS.issuperset(stripped)

def extract_cell_units_from_sheet(ws, start_row=2, start_col=1):
    """Collect the filled cells of a worksheet as an ordered list of units.

    Columns are visited left to right from ``start_col`` up to
    ``ws.max_column``; a column whose row-1 header cell is ``None`` is
    skipped. Inside a column, rows are read top to bottom from ``start_row``
    up to ``ws.max_row``, and reading of that column stops at the first cell
    that is ``None`` or blank after stripping.

    Returns:
        A list of dicts, one per cell, with the keys ``unit_index`` (running
        0-based counter across the whole sheet), ``text`` (stripped cell
        text), ``is_punct`` (result of ``is_punct_only_cell``), ``row`` and
        ``col`` (the cell coordinates passed to ``ws.cell``).
    """
    collected = []
    for col_idx in range(start_col, ws.max_column + 1):
        if ws.cell(row=1, column=col_idx).value is None:
            # No header: this column is not read at all.
            continue
        for row_idx in range(start_row, ws.max_row + 1):
            raw = ws.cell(row=row_idx, column=col_idx).value
            text = "" if raw is None else str(raw).strip()
            if not text:
                # First empty cell ends the column.
                break
            collected.append({
                # The running index is simply the number of units so far.
                "unit_index": len(collected),
                "text": text,
                "is_punct": is_punct_only_cell(text),
                "row": row_idx,
                "col": col_idx,
            })
    return collected

# Cell-level marker printed once the Block 1 definitions above have run.
print("✅ Block 1 OK: functions loaded")



✅ Block 1 OK: functions loaded


In [13]:
# БЛОК 2: alignment токенов CSV к ячейкам-единицам
from transformers import AutoTokenizer
import pandas as pd
import math

# Identifier (hub name or path) handed to AutoTokenizer.from_pretrained.
# NOTE(review): presumably this must be the same tokenizer that produced the
# token_id column of the probability CSVs — confirm against the CSV producer.
TOKENIZER_NAME_OR_PATH = "Qwen/Qwen1.5-0.5B"
# Module-level tokenizer; encode_unit_candidates reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME_OR_PATH, use_fast=True)

def encode_unit_candidates(unit_text: str, is_first: bool, is_punct: bool):
    """Return the distinct token-id encodings of a unit under several prefixes.

    The text is encoded with the module-level ``tokenizer`` (without special
    tokens) once per prefix, in this order of preference:

    * first unit of the text: empty prefix only;
    * punctuation-only cell: empty prefix, then a space, then a newline
      (punctuation usually attaches to the preceding word without a space);
    * any other cell: a space, then the empty prefix, then a newline.

    Empty encodings are dropped and duplicate encodings are removed while
    keeping the first occurrence, so the result preserves the prefix order.

    Returns:
        A list of token-id lists.
    """
    text = str(unit_text)

    if is_first:
        prefix_order = ("",)
    elif is_punct:
        prefix_order = ("", " ", "\n")
    else:
        prefix_order = (" ", "", "\n")

    # Dicts keep insertion order, and setdefault keeps the first encoding seen
    # for each id sequence, which gives order-preserving de-duplication.
    unique_by_key = {}
    for prefix in prefix_order:
        token_ids = tokenizer.encode(prefix + text, add_special_tokens=False)
        if token_ids:
            unique_by_key.setdefault(tuple(token_ids), token_ids)
    return list(unique_by_key.values())

def best_match(token_ids_stream, pos, unit_text, is_first, is_punct):
    """Find the candidate encoding of a unit that matches the stream at ``pos``.

    Candidates come from ``encode_unit_candidates`` and are tried in the order
    it returns them; the first one equal to
    ``token_ids_stream[pos:pos + len(candidate)]`` wins.

    Returns:
        ``(ids, length, "EXACT")`` for the first matching candidate, or
        ``(None, 0, "NO_MATCH")`` when no candidate matches.
    """
    candidates = encode_unit_candidates(unit_text, is_first=is_first, is_punct=is_punct)
    matched = next(
        (cand for cand in candidates if token_ids_stream[pos:pos + len(cand)] == cand),
        None,
    )
    if matched is None:
        return None, 0, "NO_MATCH"
    return matched, len(matched), "EXACT"

def align_units_to_tokens(token_df: pd.DataFrame, units: list):
    """Align spreadsheet cell units to consecutive runs of tokens.

    Units are consumed in order. For each unit ``best_match`` is asked how
    many tokens, starting at the current stream position, spell that unit; the
    unit's surprisal is the mean of ``surprisal_ln`` over those tokens. The
    walk stops at the first unit that cannot be matched: guessing an offset
    there would shift every later unit, so later units are not reported.

    Args:
        token_df: Per-token table (a ``Text_X_probabilities.csv``); must
            contain the columns ``token_id`` and ``surprisal_ln``.
        units: Unit dicts as produced by ``extract_cell_units_from_sheet``;
            the keys ``text``, ``is_punct``, ``row`` and ``col`` are read.

    Returns:
        ``(unit_df, diag)``. ``unit_df`` has one row per processed unit with
        the columns ``unit_index``, ``cell_text``, ``is_punct_cell``, ``row``,
        ``col``, ``tokens_in_unit``, ``unit_surprisal_ln_mean`` and ``status``
        (``"OK"`` or ``"NO_MATCH"``). The columns are present even when no
        unit was processed, so a CSV written from the frame always has a
        header. ``diag`` is a dict with ``units_total``, ``units_aligned_ok``,
        ``tokens_total``, ``tokens_consumed``, ``tokens_leftover`` and
        ``first_mismatch`` (``None``, or a dict describing the failing unit
        and the next ten token ids).

    Raises:
        ValueError: If ``token_df`` lacks ``token_id`` or ``surprisal_ln``.
    """
    if "token_id" not in token_df.columns or "surprisal_ln" not in token_df.columns:
        raise ValueError("В token_df нужны столбцы token_id и surprisal_ln")

    # Fixed output schema: keeps the frame's columns stable for empty input.
    out_columns = [
        "unit_index",
        "cell_text",
        "is_punct_cell",
        "row",
        "col",
        "tokens_in_unit",
        "unit_surprisal_ln_mean",
        "status",
    ]

    token_ids = token_df["token_id"].tolist()
    surprisal = token_df["surprisal_ln"]
    pos = 0

    rows = []
    first_mismatch = None

    for ui, unit in enumerate(units):
        # Only the matched length is needed; the ids and the status label
        # returned by best_match carry no extra information here.
        _, n_tokens, _ = best_match(
            token_ids_stream=token_ids,
            pos=pos,
            unit_text=unit["text"],
            is_first=(ui == 0),
            is_punct=unit["is_punct"],
        )

        row = {
            "unit_index": ui,
            "cell_text": unit["text"],
            "is_punct_cell": unit["is_punct"],
            "row": unit["row"],
            "col": unit["col"],
        }

        if n_tokens == 0:
            first_mismatch = {
                "unit_index": ui,
                "unit_text": unit["text"],
                "row": unit["row"],
                "col": unit["col"],
                "token_pos": pos,
                "next_token_ids": token_ids[pos:pos + 10],
            }
            rows.append({
                **row,
                "tokens_in_unit": None,
                "unit_surprisal_ln_mean": None,
                "status": "NO_MATCH",
            })
            # Do not guess an offset after a mismatch: stop here.
            break

        mean_s = float(surprisal.iloc[pos:pos + n_tokens].mean())
        pos += n_tokens

        rows.append({
            **row,
            "tokens_in_unit": int(n_tokens),
            "unit_surprisal_ln_mean": mean_s,
            "status": "OK",
        })

    diag = {
        "units_total": len(units),
        "units_aligned_ok": sum(1 for r in rows if r["status"] == "OK"),
        "tokens_total": len(token_df),
        "tokens_consumed": pos,
        "tokens_leftover": len(token_df) - pos,
        "first_mismatch": first_mismatch,
    }

    return pd.DataFrame(rows, columns=out_columns), diag
# Smoke check: (re)load the configured tokenizer and report its vocabulary size.
# TOKENIZER_NAME_OR_PATH is used instead of a second hard-coded model name so
# this check cannot drift from the tokenizer the alignment functions use.
try:
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME_OR_PATH, use_fast=True)
    print("Tokenizer vocab size:", tokenizer.vocab_size)
except Exception as e:
    # Best-effort diagnostic for a notebook cell: report the failure, don't abort.
    print("ERROR:", e)




Tokenizer vocab size: 151643


In [None]:
# БЛОК 3: обработать тексты порциями + сохранить CSV + печать диагностики
import os, zipfile

# Archive with the per-text token tables; process_texts_cells reads the members
# "Qwen1.5_ALL-RESULTS/Text_<i>_probabilities.csv" from it.
ZIP_PATH   = "/mnt/data/Qwen1.5_ALL-RESULTS.zip"
# Workbook whose sheets are looked up by the name "Text_<i>".
EXCEL_PATH = "/mnt/data/data_exp.xlsx"
# Destination directory for the per-text "<sheet>_cell_units_wordlevel.csv" files.
OUT_DIR    = "/mnt/data/wordlevel_out_cells"
# Create the output directory up front; an existing directory is not an error.
os.makedirs(OUT_DIR, exist_ok=True)

def process_texts_cells(text_numbers):
    """Align and export the cell units of the requested texts.

    For every number ``i`` in ``text_numbers`` the sheet ``Text_<i>`` of the
    workbook at ``EXCEL_PATH`` is paired with the archive member
    ``Qwen1.5_ALL-RESULTS/Text_<i>_probabilities.csv`` inside ``ZIP_PATH``.
    Texts missing either the sheet or the CSV are skipped with a message.
    For the others the units are extracted, aligned to the tokens, written to
    ``OUT_DIR/Text_<i>_cell_units_wordlevel.csv`` (UTF-8 with BOM), and a
    short diagnostic is printed, including details of the first mismatch when
    the alignment did not consume every unit and every token.

    Args:
        text_numbers: Iterable of text numbers to process.

    Returns:
        None. Results are written to disk and reported on stdout.
    """
    wb = openpyxl.load_workbook(EXCEL_PATH, data_only=True)
    # Loop-invariant lookups: build the name sets once instead of rebuilding
    # and linearly scanning the lists for every text.
    sheet_names = set(wb.sheetnames)

    with zipfile.ZipFile(ZIP_PATH, "r") as zf:
        archive_members = set(zf.namelist())

        for i in text_numbers:
            sheet = f"Text_{i}"
            csv_in_zip = f"Qwen1.5_ALL-RESULTS/Text_{i}_probabilities.csv"

            if sheet not in sheet_names:
                print("[skip] no sheet:", sheet)
                continue
            if csv_in_zip not in archive_members:
                print("[skip] no csv:", csv_in_zip)
                continue

            units = extract_cell_units_from_sheet(wb[sheet])

            with zf.open(csv_in_zip) as f:
                token_df = pd.read_csv(f)

            unit_df, diag = align_units_to_tokens(token_df, units)

            out_csv = os.path.join(OUT_DIR, f"{sheet}_cell_units_wordlevel.csv")
            unit_df.to_csv(out_csv, index=False, encoding="utf-8-sig")

            print(f"\n[{sheet}] saved -> {out_csv}")
            print("  units_total:", diag["units_total"])
            print("  units_aligned_ok:", diag["units_aligned_ok"])
            print("  tokens_leftover:", diag["tokens_leftover"])

            # "Everything lined up" means every unit matched and no tokens remain.
            fully_aligned = (
                diag["units_aligned_ok"] == diag["units_total"]
                and diag["tokens_leftover"] == 0
            )
            if fully_aligned:
                print("  ✅ СОШЛОСЬ: все ячейки сопоставлены и токенов не осталось")
                continue

            print("  ⚠️ НЕ СОШЛОСЬ (alignment issue)")
            fm = diag["first_mismatch"]
            if fm is not None:
                print("  first mismatch at unit:", fm["unit_index"], "text:", repr(fm["unit_text"]))
                print("  cell:", (fm["row"], fm["col"]), "token_pos:", fm["token_pos"])
                print("  next token_ids (10):", fm["next_token_ids"])

# Example: process the texts in small batches (here, two texts).
process_texts_cells([1, 2])


In [None]:
# БЛОК 4: собрать все Text_i_cell_units_wordlevel.csv в один Excel и скачать (Colab)
import glob
import pandas as pd
import os

# Gather every per-text CSV produced by Block 3 into one workbook, one sheet
# per text, then offer it for download when running inside Colab.
xlsx_path = os.path.join(OUT_DIR, "Qwen_cell_units_wordlevel.xlsx")
files = sorted(glob.glob(os.path.join(OUT_DIR, "Text_*_cell_units_wordlevel.csv")))

if not files:
    # Closing an ExcelWriter that never received a sheet fails inside openpyxl
    # with an opaque error, so report the real cause instead.
    print("No Text_*_cell_units_wordlevel.csv files found in:", OUT_DIR)
else:
    with pd.ExcelWriter(xlsx_path, engine="openpyxl") as writer:
        for fp in files:
            # Sheet name is the file name without the common suffix, e.g. "Text_1".
            sheet = os.path.basename(fp).replace("_cell_units_wordlevel.csv", "")
            df = pd.read_csv(fp)
            # Excel limits sheet names to 31 characters.
            df.to_excel(writer, sheet_name=sheet[:31], index=False)

    print("Excel saved:", xlsx_path)

    try:
        from google.colab import files as colab_files
        colab_files.download(xlsx_path)
    except Exception:
        # Best effort: outside Colab (or if the download fails) just point the
        # user at the file on disk.
        print("Если не Colab — файл лежит тут:", xlsx_path)
