In [1]:
# SINGLE-CELL, HIGH-LEVEL DEMO

# --- 1) Imports and Installs (if needed) ---
# Comment these out if packages are already installed
# !pip install transformers torch hebrew-tokenizer scikit-learn

import unicodedata
import re
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# For Hebrew BERT
import torch
from transformers import AutoTokenizer, AutoModel

# --- 2) Define a small Hebrew text inline or load your own ---
lines = [
    "בראשית ברא אלהים את השמים ואת הארץ",
    "ויאמר אלהים יהי אור ויהי אור",
    "וירא אלהים את האור כי טוב"
]

# --- 3) Basic Cleaning and Normalization ---
def clean_hebrew_text(lines):
    cleaned_lines = []
    for line in lines:
        norm_line = unicodedata.normalize('NFKC', line)
        # Remove niqqud/diacritics
        norm_line = re.sub(r'[\u0591-\u05C7]', '', norm_line)
        cleaned_lines.append(norm_line.strip())
    return cleaned_lines

cleaned = clean_hebrew_text(lines)

# --- 4) Gematria Logic ---
GEMATRIA_MAP = {
    'א': 1, 'ב': 2, 'ג': 3, 'ד': 4, 'ה': 5, 'ו': 6, 'ז': 7, 'ח': 8, 'ט': 9,
    'י': 10, 'כ': 20, 'ל': 30, 'מ': 40, 'נ': 50, 'ס': 60, 'ע': 70, 'פ': 80, 'צ': 90,
    'ק': 100, 'ר': 200, 'ש': 300, 'ת': 400
}

def calculate_gematria(word):
    return sum(GEMATRIA_MAP.get(ch, 0) for ch in word)

def simple_tokenize_hebrew(line):
    return line.split()

def create_token_dataframe(cleaned_lines):
    rows = []
    for line in cleaned_lines:
        tokens = simple_tokenize_hebrew(line)
        for t in tokens:
            g_val = calculate_gematria(t)
            rows.append({"token": t, "gematria": g_val})
    return pd.DataFrame(rows)

df_tokens = create_token_dataframe(cleaned)

print("DataFrame head:")
print(df_tokens.head(), "\n")

# --- 5) (Optional) Load Hebrew BERT and Generate Embeddings ---
def load_hebrew_transformer(model_name="onlplab/alephbert-base"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()
    return tokenizer, model

def generate_semantic_embeddings(df_tokens, tokenizer, model, batch_size=16):
    all_embeddings = []
    tokens_list = df_tokens['token'].tolist()
    for i in range(0, len(tokens_list), batch_size):
        batch_tokens = tokens_list[i:i+batch_size]
        encoded = tokenizer(batch_tokens, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**encoded)
        # Take the [CLS] embedding
        hidden_states = outputs.last_hidden_state
        cls_tokens = hidden_states[:, 0, :]
        for row in cls_tokens:
            all_embeddings.append(row.tolist())
    return np.array(all_embeddings)

tokenizer, model = load_hebrew_transformer()
embeddings = generate_semantic_embeddings(df_tokens, tokenizer, model)

# Combine gematria as an extra dimension in the embedding
combined_vecs = []
for i, row in df_tokens.iterrows():
    gem = float(row['gematria'])
    emb = embeddings[i]
    # Append gematria to the embedding
    combined_vecs.append(list(emb) + [gem])
combined_vecs = np.array(combined_vecs)

# --- 6) Demo: Find tokens with specific gematria ---
def find_tokens_by_gematria(df_tokens, value):
    return df_tokens[df_tokens["gematria"] == value]

target_val = 207  # e.g., often corresponds to "אור"
matches = find_tokens_by_gematria(df_tokens, target_val)
print(f"Tokens with gematria={target_val}:\n", matches, "\n")

# --- 7) Demo: Nearest Neighbors Based on Embeddings ---
def find_nearest_neighbors(token_idx, embeddings, top_k=5):
    query_vec = embeddings[token_idx].reshape(1, -1)
    sims = cosine_similarity(query_vec, embeddings)[0]
    nn_indices = np.argsort(sims)[::-1]  # descending
    return nn_indices[:top_k], sims[nn_indices[:top_k]]

def print_similar_tokens(df_tokens, embeddings, token_idx, top_k=5):
    nn_indices, scores = find_nearest_neighbors(token_idx, embeddings, top_k)
    query_token = df_tokens.iloc[token_idx]["token"]
    print(f"Query Token: '{query_token}' (Index={token_idx})")
    print("--------------------------------------------------")
    for rank, (idx, score) in enumerate(zip(nn_indices, scores), start=1):
        t = df_tokens.iloc[idx]['token']
        gem = df_tokens.iloc[idx]['gematria']
        print(f"{rank}. '{t}' | Gematria={gem} | Similarity={score:.3f}")

if len(df_tokens) > 0:
    print("Nearest neighbors for token at index 0:")
    print_similar_tokens(df_tokens, combined_vecs, token_idx=0, top_k=5)


DataFrame head:
    token  gematria
0  בראשית       913
1     ברא       203
2   אלהים        46
3      את       401
4   השמים       355 



Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokens with gematria=207:
    token  gematria
10   אור       207
12   אור       207 

Nearest neighbors for token at index 0:
Query Token: 'בראשית' (Index=0)
--------------------------------------------------
1. 'בראשית' | Gematria=913 | Similarity=1.000
2. 'ואת' | Gematria=407 | Similarity=0.999
3. 'את' | Gematria=401 | Similarity=0.999
4. 'את' | Gematria=401 | Similarity=0.999
5. 'השמים' | Gematria=355 | Similarity=0.999
