# Methedologi test

### Setting up the environment

In [1]:
import os
from itertools import chain
import fitz  # PyMuPDF for PDF ingestion
import pandas as pd

### Ingesting the models

In [2]:
from gensim.models import Word2Vec

## Importing documents

#### setting up folder structure

In [3]:
# Path to the reports folder
reports_folder = "./reports"
# List all PDF files in the folder
report_files = [f for f in os.listdir(reports_folder) if f.endswith(".pdf")]
# Pick one example file
random_report = report_files[0]
# Build the full path
doc_path = os.path.join(reports_folder, random_report)
print("Found report:", doc_path)

Found report: ./reports/2022_DeutscheBank_group.pdf


#### Ingestion of documents


In [4]:
from itertools import chain


from itertools import chain

page_ranges = {
    # 2024 files
    '2024_Danske_group.pdf': range(208, 240),  # First 20 pages
    '2024_UBS_group.pdf': range(88, 136),  # First 3 and last page
    '2024_DeutscheBank_group.pdf': range(91, 208),
    '2024_ING_group.pdf': range(158, 222),  # Pages 10-29

    # 2023 files
    '2023_Danske_group.pdf': range(175, 213),  # First 20 pages
    '2023_UBS_group.pdf': range(97, 153),  # First 3 and last page
    '2023_DeutscheBank_group.pdf': range(91, 208),
    '2023_ING_group.pdf': range(131, 204),  # Pages 10-29

    # 2022 files
    '2022_Danske_group.pdf': range(169, 208),  # First 20 pages
    '2022_UBS_group.pdf': range(83, 134),  # First 3 and last page
    '2022_DeutscheBank_group.pdf': range(90, 213),
    '2022_ING_group.pdf': range(103, 185),  # Pages 10-29

    # 2021 files
    '2021_Danske_group.pdf': range(159, 194),  # First 20 pages
    '2021_UBS_group.pdf': range(98, 150),  # First 3 and last page
    '2021_DeutscheBank_group.pdf': range(84, 201),
    '2021_ING_group.pdf': range(45, 150),  # Pages 10-29
}

# Note that the reason I added +1 to the end value is because Python’s range() is end-exclusive:

# Default pages to process if no specific range is defined for a file
# Options:
# - range(0, 10) for first 10 pages
# - [0, 1, 2, -1] for first 3 and last page (use negative for counting from end)
# - None to process all pages
default_pages = range(0, 10)  # First 10 pages by default

files_walk = os.walk(reports_folder)  # Fixed variable name
report_paragraphs = []
report_paragraphs_source = []
report_pages_source = []

for path, dirs, files in files_walk:
    pdfs = [file for file in files if file.endswith('.pdf')]
    for _file in pdfs:
        print(f"Processing {_file}...")

        # Determine which pages to process for this file
        if _file in page_ranges:
            pages_to_process = page_ranges[_file]
        else:
            pages_to_process = default_pages

        with fitz.open(os.path.join(path, _file)) as doc:
            total_pages = len(doc)

            # If pages_to_process is None, process all pages
            if pages_to_process is None:
                pages_to_process = range(total_pages)

            # Handle negative page numbers (count from end)
            actual_pages = []
            for page_num in pages_to_process:
                if isinstance(page_num, int):
                    if page_num < 0:
                        actual_page = total_pages + page_num  # Convert negative to positive
                    else:
                        actual_page = page_num

                    # Only include valid page numbers
                    if 0 <= actual_page < total_pages:
                        actual_pages.append(actual_page)


            # Process only the specified pages
            for page_num in actual_pages:
                page = doc[page_num]
                blocks = [x[4] for x in page.get_text("blocks")]
                # get rid of empty blocks
                blocks = [block.strip() for block in blocks if block.strip()]

                if blocks:  # Only add if there are non-empty blocks
                    report_paragraphs.extend(blocks)
                    report_pages_source.extend([page_num] * len(blocks))
                    report_paragraphs_source.extend([_file] * len(blocks))


Processing 2022_DeutscheBank_group.pdf...
Processing 2021_DeutscheBank_group.pdf...
Processing 2022_UBS_group.pdf...
Processing 2023_UBS_group.pdf.pdf...
Processing 2023_DeutscheBank_group.pdf...
Processing 2024_UBS_group.pdf.pdf...
Processing 2021_Danske_group.pdf.pdf...
Processing 2024_DeutscheBank_group.pdf...
Processing 2021_UBS_group.pdf.pdf...
Processing 2024_ING_group.pdf.pdf...
Processing 2022_Danske_group.pdf.pdf...
Processing 2021_ING_group.pdf.pdf...
Processing 2024_Danske_group.pdf...
Processing 2023_ING_group.pdf.pdf...
Processing 2023_Danske_group.pdf.pdf...
Processing 2022_ING_group.pdf.pdf...


In [5]:
report_paragraphs[10]


'61\t\nIntroduction'

In [6]:
change_indices = [i for i in range(1, len(report_paragraphs_source)) if report_paragraphs_source[i] != report_paragraphs_source[i-1]]
for index in change_indices:
    print(report_paragraphs_source[index-1])
    print(report_paragraphs_source[index])
    print(report_pages_source[index-1])
    print(report_pages_source[index])
    print(report_paragraphs[index-15:index])

2022_DeutscheBank_group.pdf
2021_DeutscheBank_group.pdf
212
84
['but no', 'more', 'than \n9 months', 'Over \n9 months', 'but no', 'more', 'than \n1 year', 'Over \n1 year \nbut no', 'more', 'than \n2 years', 'Over \n2 years', 'but no', 'more', 'than \n5 years', 'Over \n5 years \nTotal \nDeposits \n378,174 \n34,971 \n97,284 \n55,043 \n16,398 \n14,629 \n7,638 \n7,975 \n9,344 \n621,456 \nDue to banks \n41,570 \n1,052 \n9,089 \n8,984 \n6,248 \n1,592 \n2,965 \n5,699 \n7,853 \n85,053 \nDue to customers \n336,605 \n33,919 \n88,196 \n46,059 \n10,150 \n13,038 \n4,673 \n2,276 \n1,491 \n536,404 \nRetail \n155,180 \n5,491 \n58,382 \n28,637 \n1,334 \n1,273 \n943 \n579 \n84 \n251,903 \nCorporates and other \ncustomers \n181,425 \n28,428 \n29,813 \n17,422 \n8,816 \n11,764 \n3,730 \n1,697 \n1,407 \n284,500 \nTrading liabilities \n332,969 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n332,969 \nTrading securities \n49,860 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n49,860 \nOther trading liabilities \n756 \n0 \n0 \n0 \n0 \n0 

In [7]:
print(len(report_paragraphs_source))
print(len(report_pages_source))
print(len(report_paragraphs))

12443
12443
12443


### Cleaning ingested data


In [8]:
from collections import defaultdict

grouped_text = defaultdict(list)
for text, source in zip(report_paragraphs, report_paragraphs_source):
    year = source.split('_')[0]      # e.g., '2024'
    bank = source.split('_')[1]      # e.g., 'Danske'
    grouped_text[(year, bank)].append(text)


# Group indices by document
doc_indices = defaultdict(list)
for idx, doc in enumerate(report_paragraphs_source):
    doc_indices[doc].append(idx)

# Indices to keep
indices_to_keep = set()

for doc, indices in doc_indices.items():
    seen = []
    for idx in indices:
        para = report_paragraphs[idx]
        # Check if this paragraph is very similar to any already seen (diff < 5 chars)
        if not any(abs(len(para) - len(other)) < 5 and sum(a != b for a, b in zip(para, other)) < 5 for other in seen):
            indices_to_keep.add(idx)
            seen.append(para)

# Sort indices to keep
indices_to_keep = sorted(indices_to_keep)

# Filter all lists
report_paragraphs = [report_paragraphs[i] for i in indices_to_keep]
report_paragraphs_source = [report_paragraphs_source[i] for i in indices_to_keep]
report_pages_source = [report_pages_source[i] for i in indices_to_keep]

In [9]:
print(len(report_paragraphs_source))
print(len(report_pages_source))
print(len(report_paragraphs))

8360
8360
8360


In [10]:
change_indices = [i for i in range(1, len(report_paragraphs_source)) if
                  report_paragraphs_source[i] != report_paragraphs_source[i - 1]]
for index in change_indices:
    print(report_paragraphs_source[index - 1])
    print(report_paragraphs_source[index])
    print(report_pages_source[index - 1])
    print(report_pages_source[index])
    print(report_paragraphs[index - 15:index])

2022_DeutscheBank_group.pdf
2021_DeutscheBank_group.pdf
212
84
['On \ndemand', '(incl. \nOvernight', 'and \none day', 'notice)', 'one \nmonth', 'to no \nmore', 'Over \n1 year \nbut no', 'than \n2 years', 'Over \n5 years \nTotal \nCash and central bank \nbalances¹ \n164,090 \n13,138 \n1,639 \n0 \n0 \n29 \n0 \n0 \n0 \n178,897 \nInterbank balances \n(w/o central banks)¹ \n6,315 \n265 \n181 \n83 \n166 \n181 \n0 \n0 \n6 \n7,195 \nCentral bank funds sold \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \nSecurities purchased under \nresale agreements \n9 \n2,646 \n3,990 \n356 \n519 \n895 \n1,721 \n1,342 \n0 \n11,478 \nWith banks \n3 \n305 \n869 \n22 \n5 \n600 \n1,626 \n1,322 \n0 \n4,750 \nWith customers \n6 \n2,342 \n3,121 \n334 \n514 \n295 \n95 \n21 \n0 \n6,728 \nSecurities borrowed \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \nWith banks \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \nWith customers \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \nFinancial assets at fair value \nthrough profit or loss \n410,982 \n

## Stage 1: Word Embedding Through Semantic Vector Representation

### Stage 1 (i): Using neural networks (e.g., Word2Vec) to capture the semantic meaning of words

In [11]:
# We import Word2Vec

import re

from gensim.models import Word2Vec

from gensim.parsing.preprocessing import remove_stopwords   # <-- added for stopword removal


def simple_tokenize(text):

# Makes the text lowercase

    text = text.lower()

# Remove stopwords using gensim (no downloads needed)

    text = remove_stopwords(text)

# This replaces anything that is not a-z or 0-9 with a space. Removed punctuation(.,;!?). Keeps only letters and digits.

    text = re.sub(r"[^a-z0-9]+", " ", text)

# Splits on whitespace

    tokens = text.split()

# Removes tokens that are pure number, not connected to a letter

    tokens = [t for t in tokens if not t.isdigit()]

# Removes characters tokens. (e.g "a", "x")

    tokens = [t for t in tokens if len(t) > 1]

    return tokens


# This does a for loop over every paragrps and keeps only elements that are actually strincs, and not empty or just whitespace.

tokenized_docs = [

    simple_tokenize(p)

    for p in report_paragraphs

    if isinstance(p, str) and p.strip()

]

# This removes them

tokenized_docs = [t for t in tokenized_docs if t]

# This show how many documents we train on and example of what the tokens used in word 2 vec is.

print("Documents:", len(tokenized_docs))

print("Example tokens:", tokenized_docs[0][:20])

# This is the actual Word2Vec model

w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=80,        # more expressive than 150
    window=5,               # more local → less over-grouping
    min_count=5,            # keeps more medium-frequency words
    workers=4,
    sg=1,                   # skip-gram for semantics
    negative=3,            # stronger contrast
    sample=1e-5,            # better handling of common words
    epochs=5               # train longer → cleaner space
)


# This is the object that we use for the learned vector for each word.

word_vectors = w2v_model.wv

# Get a list of all words in the vocabulary, in the order gensim stores them. This order will define the row index of each word in the embedding matrix.

vocab = list(word_vectors.key_to_index.keys())

# This should be 300 (same as vector_size),

embedding_dim = word_vectors.vector_size

print("Vocabulary size:", len(vocab))

print("Embedding dimension:", embedding_dim)


Documents: 8230
Example tokens: ['deutsche', 'bank', 'risks', 'opportunities', 'annual', 'report', 'opportunities']
Vocabulary size: 3492
Embedding dimension: 80


In [12]:
# ============================================
# BUILD EMBEDDING MATRIX X   (REQUIRED BEFORE CLUSTERING)
# ============================================

# vocab and word_vectors must already exist from your Word2Vec model
vocab = list(word_vectors.key_to_index.keys())
embedding_dim = word_vectors.vector_size

import numpy as np

# Map each word to a row index
word_to_index = {w: i for i, w in enumerate(vocab)}

# Create the embedding matrix
X = np.zeros((len(vocab), embedding_dim), dtype=float)

for w, i in word_to_index.items():
    X[i] = word_vectors[w]

# Normalize for cosine similarity
X = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-9)

print("Embedding matrix shape:", X.shape)  # should be (V, d)


Embedding matrix shape: (3492, 80)


### Step 1 (ii): Cluster words based on their vector embedding using a large scale nearest-neighbor search LSH

### Algoritm 1

#### while numClusters > K do
    1. Run lsh-cand-pairs on all current clusters;
       (Optionally) calculate cosine similarity over all candidate pairs
       to pick the most similar candidate pair;

    2. Pick one (best) candidate pair to merge, and combine the
       corresponding two clusters into one;

    3. Run cluster-roid to find the center of the new cluster
       then set numClusters = numClusters – 1;
#### end

#### Algorithm 1: Hierarchical Word Clustering based on LSH

#### Initialization

In [13]:
# If we get a missing key, it returns it missing
from collections import defaultdict
# Generates pairs (i,j)
from itertools import combinations
import numpy as np  # already imported, but harmless to repeat


def hierarchical_lsh_clustering(
    X, K,
    n_planes=8, n_tables=12,
    max_cluster_size=3000,
    alpha_size_penalty=0,
    random_state=42
):
    """
    X : numpy array of shape (V, d)
        Row i = embedding for word i (already normalized).
    K : int
        Desired number of word clusters (themes).
    n_planes : int
        Number of random hyperplanes per LSH table (length of hash).
    n_tables : int
        Number of independent LSH tables.
    max_cluster_size : int
        Hard cap on cluster size (we never merge beyond this).
    alpha_size_penalty : float
        How strongly we penalise merges that create large clusters.
        Higher value → fewer clusters right at the cap.
    """
    # These can be changed for optimization. Se article below

    # ---------------------------------------------------------------------------------------------------------------------------------------------------
    # This is a random number generator with a fixed seed so that results are reproducible. V = vocabulary size (#words), d = embedding dimension (300)
    rng = np.random.RandomState(random_state)
    V, d = X.shape

    # Inizilization -------------------------------------------------------------------------------------------------------------------------------------

    # Start with V clusters, one per word.
    # This is standard bottom-up hierarchical clustering:
    #   - Each word begins as its own cluster
    #   - Then we iteratively merge the most similar pair
    # Indices: list of which word indices belong to this cluster
    # Center:  the semantic embedding representing the cluster.

    clusters = [
        {"indices": [i], "center": X[i].copy()}
        for i in range(V)
    ]
    # Should be the same as the total word cap
    print(f"Initial number of clusters: {len(clusters)}")

    # This single line creates the entire hashing mechanism used to find "sufficiently similar" words of clusters.
    #  This LSH scheme is designed to approximate cosine similarity between vectors. This LSH scheme is designed to approximate cosine similarity between vectors.
    hyperplanes = rng.randn(n_tables, n_planes, d)

    # Inizilization -------------------------------------------------------------------------------------------------------------------------------------
    # As long as there are more than K clusters, we keep merging.
    while len(clusters) > K:

        # Build LSH tables. ---------------------------------------------------------------------------------------------------------------------------------
        # We create a list of hash tables, one per LSH table.
        tables = [defaultdict(list) for _ in range(n_tables)]
        # We index the number of clusers
        for idx, cl in enumerate(clusters):
            v = cl["center"]
            # This is the lsh-cand pairs.clusters with similar directions (high cosine) are likely to get the same bit pattern and thus end up in the same bucket.
            for t in range(n_tables):
                hp = hyperplanes[t]
                bits = (hp @ v >= 0).astype(int)
                key = tuple(bits.tolist())
                tables[t][key].append(idx)

        # Extract candidate similar pairs from LSH buckets. -------------------------------------------------------------------------------------------------
        cand_pairs = set()

        for t in range(n_tables):
            for bucket_indices in tables[t].values():
                if len(bucket_indices) < 2:
                    continue
                for i, j in combinations(bucket_indices, 2):
                    cand_pairs.add(tuple(sorted((i, j))))

        # if cand_pairs is empty, then gives all possible pairs, so we always have a least something to merge.
        if not cand_pairs:
            cand_pairs = set(combinations(range(len(clusters)), 2))

        # Select the "most similar" pair among candidates. --------------------------------------------------------------------------------------------------
        # We initialize best_sim with -1 (lowest possible cosine)
        # After the loop, best_pair is the pair of clusters with the highest cosine similarity, matching Algorithm 1:
        best_score = -1e9
        best_pair = None

        for i, j in cand_pairs:
            ci = clusters[i]
            cj = clusters[j]

            # ---------- NEW: size-aware merging ----------
            size_i = len(ci["indices"])
            size_j = len(cj["indices"])
            new_size = size_i + size_j

            # Hard cap: never create clusters above max_cluster_size
            if new_size > max_cluster_size:
                continue

            vi = ci["center"]
            vj = cj["center"]

            sim = float(np.dot(vi, vj))

            # Soft penalty: we down-weight merges that create very large clusters.
            # If new_size == max_cluster_size → penalty = alpha_size_penalty
            # If new_size is small → penalty is close to 0.
            size_penalty = alpha_size_penalty * (new_size / max_cluster_size)
            score = sim - size_penalty
            # ---------------------------------------------

            if score > best_score:
                best_score = score
                best_pair = (i, j)

        # Safety: in theory best_pair should never be None here, because we always
        # have at least one candidate pair. This is just a guard against bugs.
        if best_pair is None:
            print("No more merges possible (no candidate pairs found under constraints).")
            break

        # Merge the most similar pair. ----------------------------------------------------------------------------------------------------------------------

        # ---------------------------------------------------------------------
        # MERGE THE MOST SIMILAR PAIR (Algorithm 1: Step 2 + Step 3)
        # ---------------------------------------------------------------------
        # At this point we have identified `best_pair`, which contains the
        # indices (i, j) of the two clusters that are *most* semantically
        # similar according to cosine similarity.
        #
        # The idea is:
        #     - these two clusters are close in meaning
        #     - therefore they should be merged into a single, larger cluster
        #
        # This is the fundamental operation in hierarchical clustering:
        # repeatedly merging the two closest groups.
        #
        # After merging, the number of clusters decreases by 1.
        # This continues until we reach the target number of K clusters.
        # ---------------------------------------------------------------------

        i, j = best_pair

        # Ensure i < j so that deletion in the list below does not mess up indexing.
        # If we remove the higher index first, the lower index remains valid.
        if i > j:
            i, j = j, i

        # Retrieve the two clusters that we are going to merge.
        # ci and cj each contain:
        #   - a list of word indices ("indices")
        #   - a center vector ("center")
        ci = clusters[i]
        cj = clusters[j]

        # ---------------------------------------------------------------------
        # COMBINE WORD MEMBERSHIP LISTS
        # ---------------------------------------------------------------------
        # new_indices is simply the union of the two clusters' word lists.
        # If ci contains words [3, 15, 20] and cj contains [7, 8],
        # then new_indices will be [3, 15, 20, 7, 8].
        #
        # This is a simple concatenation because:
        #   - each word belongs to exactly ONE cluster at any time
        #   - merging should preserve *all* previous members
        #
        # There is no deduplication needed because the algorithm never assigns
        # a word to more than one cluster.
        new_indices = ci["indices"] + cj["indices"]

        # ---------------------------------------------------------------------
        # COMPUTE NEW CLUSTER CENTER ("cluster-roid")
        # ---------------------------------------------------------------------
        # According to Algorithm 1, once two clusters are merged we must find
        # a new representative vector (center) for the merged cluster.
        #
        # The most standard and natural choice → the mean of all member vectors.
        #
        # If the cluster contains N words with embeddings x₁, x₂, ... , x_N,
        # the new center is:
        #       center = (1/N) * Σ x_k
        #
        # This averaging step creates a semantic "topic direction" that tries
        # to represent the entire cluster.
        #
        # IMPORTANT:
        #   After computing the mean, we re-normalize the vector so that it is
        #   still on the unit sphere. This keeps cosine similarity consistent
        #   across all iterations.
        emb = X[new_indices].mean(axis=0)
        emb = emb / (np.linalg.norm(emb) + 1e-9)

        # The new merged cluster is now fully defined:
        #   - its members are all words from ci and cj
        #   - its center is the averaged embedding from above.
        new_cluster = {"indices": new_indices, "center": emb}

        # ---------------------------------------------------------------------
        # UPDATE CLUSTER LIST
        # ---------------------------------------------------------------------
        # We now remove the two old clusters (i and j) and insert the new one.
        #
        # Why rebuild the list?
        #    - If we try to overwrite clusters[i] and clusters[j] directly,
        #      indexing becomes complicated because lists shift when items
        #      are removed.
        #
        # Wi rebuild a clean list to avoid logical errors and keep the cluster
        # structure easy to reason about.
        new_clusters = []
        for k, cl in enumerate(clusters):
            if k in (i, j):   # skip clusters we just merged
                continue
            new_clusters.append(cl)

        # Add the merged cluster at the end
        new_clusters.append(new_cluster)

        # Replace the old list with the updated one
        clusters = new_clusters

        # Print progress occasionally so the user knows clustering is ongoing
        if len(clusters) % 50 == 0:
            print(f"Current number of clusters: {len(clusters)}")

    # -----------------------------------------------------------------
    # When the while-loop finishes, we have K clusters left (unless the
    # safety break above triggered, which should not normally happen).
    # Return them so we can inspect them outside the function.
    # -----------------------------------------------------------------
    print(f"Final number of clusters: {len(clusters)}")
    return clusters


In [14]:
# RUN THE CLUSTERING ----------------------------------------------------------------------------------------------------------------------

K = 100  # Number of semantic clusters you want to extract
clusters = hierarchical_lsh_clustering(X, K)

# Quick inspection of the first cluster
example_indices = clusters[0]["indices"][:10]
example_words = [vocab[i] for i in example_indices]
print("Example words in cluster 0:", example_words)


Initial number of clusters: 3492


KeyboardInterrupt: 

In [70]:
# ========================================================================
#  CLUSTERING VERIFICATION CHECKS
# ========================================================================

print("\n==================== CHECK 1: SAMPLE WORDS ====================")
num_to_show = 200
for c in range(min(num_to_show, len(clusters))):
    word_ids = clusters[c]["indices"][:10]
    words = [vocab[i] for i in word_ids]
    print(f"\nCluster {c} (size={len(clusters[c]['indices'])}):")
    print(words)

print("\n==================== CHECK 2: CLUSTER SIZE STATS ====================")
sizes = [len(cl["indices"]) for cl in clusters]
print("Min size:", min(sizes))
print("Max size:", max(sizes))
print("Average size:", sum(sizes)/len(sizes))

print("\n==================== CHECK 3: CENTER VECTOR NORMS ====================")
norms = [np.linalg.norm(cl["center"]) for cl in clusters]
print("Min norm:", min(norms))
print("Max norm:", max(norms))

print("\n==================== CHECK 4: WORD INDEX COVERAGE ====================")
all_idx = []
for cl in clusters:
    all_idx.extend(cl["indices"])

if len(all_idx) == len(set(all_idx)) == len(vocab):
    print("✔ All words appear exactly once across all clusters")
else:
    print(" Error: Duplicate or missing word assignments detected")

print("Total indices collected:", len(all_idx))
print("Unique indices:", len(set(all_idx)))
print("Vocabulary size:", len(vocab))




Cluster 0 (size=1):
['credit']

Cluster 1 (size=1):
['capital']

Cluster 2 (size=1):
['financial']

Cluster 3 (size=1):
['non']

Cluster 4 (size=1):
['loss']

Cluster 5 (size=1):
['related']

Cluster 6 (size=1):
['collateral']

Cluster 7 (size=1):
['equity']

Cluster 8 (size=1):
['includes']

Cluster 9 (size=1):
['information']

Cluster 10 (size=1):
['customers']

Cluster 11 (size=1):
['rwa']

Cluster 12 (size=1):
['counterparty']

Cluster 13 (size=1):
['annual']

Cluster 14 (size=1):
['security']

Cluster 15 (size=1):
['country']

Cluster 16 (size=1):
['high']

Cluster 17 (size=1):
['usd']

Cluster 18 (size=1):
['use']

Cluster 19 (size=1):
['change']

Cluster 20 (size=1):
['recovery']

Cluster 21 (size=1):
['applicable']

Cluster 22 (size=1):
['entities']

Cluster 23 (size=1):
['senior']

Cluster 24 (size=1):
['which']

Cluster 25 (size=1):
['sector']

Cluster 26 (size=1):
['applies']

Cluster 27 (size=1):
['thresholds']

Cluster 28 (size=1):
['the']

Cluster 29 (size=1):
['areas']

In [71]:
# -------------------------------------------------------------
# Stage 1 (iii): Extract Textual Factors (TFs) using SVD
# -------------------------------------------------------------

import numpy as np
from numpy.linalg import svd

all_factors = []        # here we store all TF vectors (in embedding space)
factor_meta = []        # optional: store which cluster + top words each TF comes from

# 1. Choose candidate clusters (avoid huge or tiny clusters)
min_size, max_size = 5, 80

candidate_clusters = [
    (cid, cl)
    for cid, cl in enumerate(clusters)
    if min_size <= len(cl["indices"]) <= max_size
]

print("Number of candidate clusters:", len(candidate_clusters))

# If nothing is in the range → use the largest cluster so we still get at least one TF
if not candidate_clusters:
    print("⚠️ No clusters in size range, using largest cluster instead.")
    cid_big, cl_big = max(
        enumerate(clusters),
        key=lambda x: len(x[1]["indices"])
    )
    candidate_clusters = [(cid_big, cl_big)]

# 2. Loop over all candidate clusters → build local matrix + SVD
for cid, cl in candidate_clusters:
    word_idx = cl["indices"]                 # indices into vocab / X
    X_sub = X[word_idx, :]                   # shape: (cluster_size, embedding_dim)

    # SVD on local embedding matrix
    # X_sub = U Σ V^T
    # first right-singular vector V[0] is the main direction in embedding space
    U, S, Vt = svd(X_sub, full_matrices=False)
    factor_vec = Vt[0, :]                    # shape: (embedding_dim,)

    # normalize factor direction
    factor_vec = factor_vec / (np.linalg.norm(factor_vec) + 1e-9)

    all_factors.append(factor_vec)

    # optional: store metadata with top words in this factor
    # projection of words in this cluster on the factor
    scores = X_sub @ factor_vec              # (cluster_size,)
    order = np.argsort(-np.abs(scores))      # sort by absolute loading

    top_k = 15
    top_indices = [word_idx[i] for i in order[:top_k]]
    top_words = [vocab[i] for i in top_indices]

    factor_meta.append({
        "cluster_id": cid,
        "cluster_size": len(word_idx),
        "top_words": top_words
    })

# 3. Stack all TF vectors into a single matrix (F x d)
TF_matrix = np.vstack(all_factors)           # shape: (num_factors, embedding_dim)

print("Number of textual factors (TFs):", TF_matrix.shape[0])
print("TF embedding dimension:", TF_matrix.shape[1])

# Quick check of the first few factors and their top words
for f_id, meta in list(enumerate(factor_meta[:80])):
    print(f"\nTF {f_id} (from cluster {meta['cluster_id']}, size={meta['cluster_size']}):")
    print("Top words:", meta["top_words"])


Number of candidate clusters: 16
Number of textual factors (TFs): 16
TF embedding dimension: 200

TF 0 (from cluster 61, size=9):
Top words: ['zip', 'pdf', 'discrepancies', 'esef', 'en', 'danskebank', 'xhtml', 'version', 'file']

TF 1 (from cluster 63, size=39):
Top words: ['north', 'belgium', 'south', 'canada', 'america', 'islands', 'ireland', 'france', 'australia', 'poland', 'netherlands', 'cayman', 'india', 'china', 'kingdom']

TF 2 (from cluster 67, size=5):
Top words: ['dcemvguvkpi', 'tgxgpwgu', 'vtcfkpi', 'cpf', '8c4']

TF 3 (from cluster 68, size=17):
Top words: ['wall', 'extensive', 'street', 'title', 'submit', 'comptroller', 'platform', 'farm', 'fdic', 'recently', 'reform', 'housing', 'regulators', 'facing', 'frb']

TF 4 (from cluster 69, size=24):
Top words: ['recreation', 'arts', 'entertainment', 'employers', 'undifferentiated', 'education', 'compulsory', 'health', 'goods', 'service', 'human', 'administration', 'producing', 'work', 'households']

TF 5 (from cluster 72, size=

In [72]:
# -------------------------------------------------------------
# Stage 2: Project documents onto Textual Factors (TFs)
# -------------------------------------------------------------

import numpy as np

# 1. Build a document embedding for each paragraph
#    (simple average of word embeddings in the paragraph)

doc_embeddings = []
doc_lengths = []

for tokens in tokenized_docs:
    idxs = [word_to_index[w] for w in tokens if w in word_to_index]

    if not idxs:
        # no known words → zero vector
        doc_vec = np.zeros(X.shape[1], dtype=float)
        doc_len = 0
    else:
        emb_mat = X[idxs, :]                  # (n_words_in_doc, d)
        doc_vec = emb_mat.mean(axis=0)        # average
        doc_len = len(idxs)

    # normalize to unit length (like Cong et al. to keep cosine geometry)
    norm = np.linalg.norm(doc_vec)
    if norm > 0:
        doc_vec = doc_vec / norm

    doc_embeddings.append(doc_vec)
    doc_lengths.append(doc_len)

doc_embeddings = np.vstack(doc_embeddings)    # shape: (N_docs, d)
print("Document embedding matrix shape:", doc_embeddings.shape)

# 2. Project documents onto TFs
#    TF_matrix: (F, d) from your SVD step

TF_matrix_norm = TF_matrix / (np.linalg.norm(TF_matrix, axis=1, keepdims=True) + 1e-9)
doc_TF_scores = doc_embeddings @ TF_matrix_norm.T   # shape: (N_docs, F)

print("Doc–TF score matrix shape:", doc_TF_scores.shape)

# Optional: quick peek at first doc and first few factors
print("First doc TF scores (first 10 factors):")
print(doc_TF_scores[0, :10])


Document embedding matrix shape: (8230, 200)
Doc–TF score matrix shape: (8230, 16)
First doc TF scores (first 10 factors):
[-0.86005097 -0.51077645 -0.76823011 -0.85807682 -0.62659098 -0.7496583
 -0.68755115 -0.62950143 -0.91252929 -0.51734781]


In [73]:
# How many docs and TFs do we have?
n_docs, n_factors = doc_TF_scores.shape
print("Docs:", n_docs, "Factors:", n_factors)

# Show top-k TFs for the first few documents
k = 5      # top TFs per doc
n_show = 10  # how many docs to print

for d in range(min(n_show, n_docs)):
    scores = doc_TF_scores[d]
    top_idx = np.argsort(scores)[-k:][::-1]  # indices of top-k factors
    top_scores = scores[top_idx]
    print(f"\nDoc {d}:")
    for tf_idx, sc in zip(top_idx, top_scores):
        print(f"  TF {tf_idx}: {sc:.3f}")


Docs: 8230 Factors: 16

Doc 0:
  TF 1: -0.511
  TF 9: -0.517
  TF 10: -0.600
  TF 4: -0.627
  TF 7: -0.630

Doc 1:
  TF 10: -0.423
  TF 11: -0.493
  TF 1: -0.493
  TF 5: -0.515
  TF 12: -0.521

Doc 2:
  TF 10: -0.637
  TF 1: -0.641
  TF 9: -0.649
  TF 5: -0.681
  TF 11: -0.690

Doc 3:
  TF 1: -0.589
  TF 9: -0.607
  TF 10: -0.619
  TF 11: -0.676
  TF 12: -0.689

Doc 4:
  TF 1: -0.630
  TF 9: -0.647
  TF 10: -0.668
  TF 4: -0.676
  TF 11: -0.705

Doc 5:
  TF 1: -0.624
  TF 9: -0.645
  TF 4: -0.682
  TF 10: -0.684
  TF 11: -0.737

Doc 6:
  TF 10: -0.569
  TF 1: -0.596
  TF 9: -0.611
  TF 11: -0.648
  TF 5: -0.655

Doc 7:
  TF 1: -0.608
  TF 10: -0.617
  TF 9: -0.641
  TF 11: -0.652
  TF 5: -0.667

Doc 8:
  TF 1: -0.378
  TF 9: -0.380
  TF 4: -0.515
  TF 6: -0.520
  TF 10: -0.566

Doc 9:
  TF 4: -0.538
  TF 1: -0.603
  TF 9: -0.643
  TF 15: -0.695
  TF 6: -0.704


In [74]:
# Show top-k documents for the first few TFs
k = 5         # top docs per TF
n_tf_show = 10  # how many TFs to print

for f in range(min(n_tf_show, n_factors)):
    scores = doc_TF_scores[:, f]
    top_docs = np.argsort(scores)[-k:][::-1]
    top_vals = scores[top_docs]
    print(f"\nTF {f}: top {k} docs")
    for doc_id, sc in zip(top_docs, top_vals):
        print(f"  doc {doc_id}: {sc:.3f}")



TF 0: top 5 docs
  doc 6453: 0.000
  doc 3594: 0.000
  doc 7121: 0.000
  doc 5355: 0.000
  doc 7375: 0.000

TF 1: top 5 docs
  doc 3430: 0.000
  doc 7375: 0.000
  doc 3436: 0.000
  doc 7404: 0.000
  doc 7400: 0.000

TF 2: top 5 docs
  doc 7375: 0.000
  doc 2774: 0.000
  doc 7477: 0.000
  doc 7404: 0.000
  doc 7400: 0.000

TF 3: top 5 docs
  doc 3804: 0.066
  doc 5146: 0.066
  doc 6877: 0.066
  doc 3436: 0.000
  doc 3429: 0.000

TF 4: top 5 docs
  doc 6877: 0.028
  doc 3804: 0.028
  doc 5146: 0.028
  doc 7404: 0.000
  doc 6453: 0.000

TF 5: top 5 docs
  doc 5146: 0.002
  doc 3804: 0.002
  doc 6877: 0.002
  doc 3436: 0.000
  doc 3834: 0.000

TF 6: top 5 docs
  doc 5355: 0.000
  doc 2774: 0.000
  doc 3429: 0.000
  doc 3430: 0.000
  doc 3436: 0.000

TF 7: top 5 docs
  doc 1144: 0.062
  doc 4915: 0.062
  doc 6348: 0.062
  doc 2451: 0.062
  doc 7375: 0.000

TF 8: top 5 docs
  doc 5146: 0.073
  doc 6877: 0.073
  doc 3804: 0.073
  doc 7492: 0.000
  doc 5355: 0.000

TF 9: top 5 docs
  doc 3429

#### Also possible to pick the words that we want it to look at. Like in the marcin code

In [75]:
To many stop words. Improve stop words
FalseAnd LSH is build for 50,000 vocabularies. Is it something wrong with my code? Improve minimum count?
    How much minimum count?
How much semilar are they?
I think it is much smaller than Cong´s 10-k corpus
ptionally extend vocabulary with bigrams/phrases later (“credit_risk”, “capital_ratio”), which also increases effective vocabulary size without adding junk.

SyntaxError: invalid character '´' (U+00B4) (1997296567.py, line 5)

In [None]:
God analyse:

200 K. Min 5 count

Hvorfor er der forskel på antal ord på mac og på computer


In [55]:
import numpy as np
from collections import defaultdict

# ---------------------------------------------------------
# 1. Build reverse index: from word index → word string
# ---------------------------------------------------------
rev_index = {i: w for w, i in word_to_index.items()}

# ---------------------------------------------------------
# 2. Compute similarity of each word to its cluster center
# ---------------------------------------------------------
cluster_sizes = []
cluster_stats = []
cluster_wordlists = []

print("\n==================== CLUSTER DIAGNOSTICS ====================")

for cid, cl in enumerate(clusters):

    indices = cl["indices"]
    center = cl["center"]  # already normalized
    cluster_sizes.append(len(indices))

    # Extract all embeddings for this cluster
    M = X[indices]  # shape: (cluster_size, embedding_dim)

    # Cosine similarity = dot product (since everything normalized)
    sims = M @ center

    avg_sim = float(np.mean(sims))
    med_sim = float(np.median(sims))
    min_sim = float(np.min(sims))
    max_sim = float(np.max(sims))

    # Sort words inside cluster by similarity (descending)
    sorted_pairs = sorted(
        zip(indices, sims),
        key=lambda x: x[1],
        reverse=True
    )

    words_sorted = [(rev_index[i], float(sim)) for i, sim in sorted_pairs]
    cluster_wordlists.append(words_sorted)

    cluster_stats.append({
        "cid": cid,
        "size": len(indices),
        "avg_sim": avg_sim,
        "med_sim": med_sim,
        "min_sim": min_sim,
        "max_sim": max_sim
    })

    print(f"\n--- Cluster {cid} ---")
    print(f"Size: {len(indices)}")
    print(f"Avg similarity: {avg_sim:.4f} | Median: {med_sim:.4f}")
    print(f"Min similarity: {min_sim:.4f} | Max: {max_sim:.4f}")

    print("Top 10 words by similarity:")
    for w, sim in words_sorted[:20]:
        print(f"   {w:20s}  sim={sim:.4f}")

# ---------------------------------------------------------
# 3. Summary of cluster size distribution
# ---------------------------------------------------------
sizes = np.array(cluster_sizes)
print("\n==================== SIZE SUMMARY ====================")
print(f"Number of clusters: {len(clusters)}")
print(f"Min size:  {sizes.min()}")
print(f"Max size:  {sizes.max()}")
print(f"Mean size: {sizes.mean():.2f}")
print(f"Median:    {np.median(sizes)}")

# ---------------------------------------------------------
# 4. Optional: flag low-quality clusters (low cohesion)
# ---------------------------------------------------------
low_quality = [s for s in cluster_stats if s["avg_sim"] < 0.15]

print("\n==================== LOW-COHESION CLUSTERS ====================")
if low_quality:
    for s in low_quality:
        print(f"Cluster {s['cid']} | size={s['size']} | avg_sim={s['avg_sim']:.4f}")
else:
    print("✔ All clusters show reasonable cohesion.")




--- Cluster 0 ---
Size: 1
Avg similarity: 0.9797 | Median: 0.9797
Min similarity: 0.9797 | Max: 0.9797
Top 10 words by similarity:
   management            sim=0.9797

--- Cluster 1 ---
Size: 1
Avg similarity: 0.9747 | Median: 0.9747
Min similarity: 0.9747 | Max: 0.9747
Top 10 words by similarity:
   financial             sim=0.9747

--- Cluster 2 ---
Size: 1
Avg similarity: 0.9881 | Median: 0.9881
Min similarity: 0.9881 | Max: 0.9881
Top 10 words by similarity:
   total                 sim=0.9881

--- Cluster 3 ---
Size: 1
Avg similarity: 0.9780 | Median: 0.9780
Min similarity: 0.9780 | Max: 0.9780
Top 10 words by similarity:
   portfolio             sim=0.9780

--- Cluster 4 ---
Size: 1
Avg similarity: 0.9888 | Median: 0.9888
Min similarity: 0.9888 | Max: 0.9888
Top 10 words by similarity:
   equity                sim=0.9888

--- Cluster 5 ---
Size: 1
Avg similarity: 0.9941 | Median: 0.9941
Min similarity: 0.9941 | Max: 0.9941
Top 10 words by similarity:
   counterparty          si