# Methedologi test

### Setting up the environment

In [5]:
import os
from itertools import chain
import fitz  # PyMuPDF for PDF ingestion
import pandas as pd

### Ingesting the models

In [6]:
from gensim.models import Word2Vec

## Importing documents

#### setting up folder structure

In [7]:
# Path to the reports folder
reports_folder = "./reports"
# List all PDF files in the folder
report_files = [f for f in os.listdir(reports_folder) if f.endswith(".pdf")]
# Pick one example file
random_report = report_files[0]
# Build the full path
doc_path = os.path.join(reports_folder, random_report)
print("Found report:", doc_path)

Found report: ./reports/2022_DeutscheBank_group.pdf


#### Ingestion of documents


In [8]:
from itertools import chain


from itertools import chain

page_ranges = {
    # 2024 files
    '2024_Danske_group.pdf': range(208, 240),  # First 20 pages
    '2024_UBS_group.pdf': range(88, 136),  # First 3 and last page
    '2024_DeutscheBank_group.pdf': range(91, 208),
    '2024_ING_group.pdf': range(158, 222),  # Pages 10-29

    # 2023 files
    '2023_Danske_group.pdf': range(175, 213),  # First 20 pages
    '2023_UBS_group.pdf': range(97, 153),  # First 3 and last page
    '2023_DeutscheBank_group.pdf': range(91, 208),
    '2023_ING_group.pdf': range(131, 204),  # Pages 10-29

    # 2022 files
    '2022_Danske_group.pdf': range(169, 208),  # First 20 pages
    '2022_UBS_group.pdf': range(83, 134),  # First 3 and last page
    '2022_DeutscheBank_group.pdf': range(90, 213),
    '2022_ING_group.pdf': range(103, 185),  # Pages 10-29

    # 2021 files
    '2021_Danske_group.pdf': range(159, 194),  # First 20 pages
    '2021_UBS_group.pdf': range(98, 150),  # First 3 and last page
    '2021_DeutscheBank_group.pdf': range(84, 201),
    '2021_ING_group.pdf': range(45, 150),  # Pages 10-29
}

# Note that the reason I added +1 to the end value is because Python’s range() is end-exclusive:

# Default pages to process if no specific range is defined for a file
# Options:
# - range(0, 10) for first 10 pages
# - [0, 1, 2, -1] for first 3 and last page (use negative for counting from end)
# - None to process all pages
default_pages = range(0, 10)  # First 10 pages by default

files_walk = os.walk(reports_folder)  # Fixed variable name
report_paragraphs = []
report_paragraphs_source = []
report_pages_source = []

for path, dirs, files in files_walk:
    pdfs = [file for file in files if file.endswith('.pdf')]
    for _file in pdfs:
        print(f"Processing {_file}...")

        # Determine which pages to process for this file
        if _file in page_ranges:
            pages_to_process = page_ranges[_file]
        else:
            pages_to_process = default_pages

        with fitz.open(os.path.join(path, _file)) as doc:
            total_pages = len(doc)

            # If pages_to_process is None, process all pages
            if pages_to_process is None:
                pages_to_process = range(total_pages)

            # Handle negative page numbers (count from end)
            actual_pages = []
            for page_num in pages_to_process:
                if isinstance(page_num, int):
                    if page_num < 0:
                        actual_page = total_pages + page_num  # Convert negative to positive
                    else:
                        actual_page = page_num

                    # Only include valid page numbers
                    if 0 <= actual_page < total_pages:
                        actual_pages.append(actual_page)


            # Process only the specified pages
            for page_num in actual_pages:
                page = doc[page_num]
                blocks = [x[4] for x in page.get_text("blocks")]
                # get rid of empty blocks
                blocks = [block.strip() for block in blocks if block.strip()]

                if blocks:  # Only add if there are non-empty blocks
                    report_paragraphs.extend(blocks)
                    report_pages_source.extend([page_num] * len(blocks))
                    report_paragraphs_source.extend([_file] * len(blocks))


Processing 2022_DeutscheBank_group.pdf...
Processing 2021_DeutscheBank_group.pdf...
Processing 2022_UBS_group.pdf...
Processing 2023_UBS_group.pdf.pdf...
Processing 2023_DeutscheBank_group.pdf...
Processing 2024_UBS_group.pdf.pdf...
Processing 2021_Danske_group.pdf.pdf...
Processing 2024_DeutscheBank_group.pdf...
Processing 2021_UBS_group.pdf.pdf...
Processing 2024_ING_group.pdf.pdf...
Processing 2022_Danske_group.pdf.pdf...
Processing 2021_ING_group.pdf.pdf...
Processing 2024_Danske_group.pdf...
Processing 2023_ING_group.pdf.pdf...
Processing 2023_Danske_group.pdf.pdf...
Processing 2022_ING_group.pdf.pdf...


In [9]:
report_paragraphs[10]


'61\t\nIntroduction'

In [10]:
change_indices = [i for i in range(1, len(report_paragraphs_source)) if report_paragraphs_source[i] != report_paragraphs_source[i-1]]
for index in change_indices:
    print(report_paragraphs_source[index-1])
    print(report_paragraphs_source[index])
    print(report_pages_source[index-1])
    print(report_pages_source[index])
    print(report_paragraphs[index-15:index])

2022_DeutscheBank_group.pdf
2021_DeutscheBank_group.pdf
212
84
['but no', 'more', 'than \n9 months', 'Over \n9 months', 'but no', 'more', 'than \n1 year', 'Over \n1 year \nbut no', 'more', 'than \n2 years', 'Over \n2 years', 'but no', 'more', 'than \n5 years', 'Over \n5 years \nTotal \nDeposits \n378,174 \n34,971 \n97,284 \n55,043 \n16,398 \n14,629 \n7,638 \n7,975 \n9,344 \n621,456 \nDue to banks \n41,570 \n1,052 \n9,089 \n8,984 \n6,248 \n1,592 \n2,965 \n5,699 \n7,853 \n85,053 \nDue to customers \n336,605 \n33,919 \n88,196 \n46,059 \n10,150 \n13,038 \n4,673 \n2,276 \n1,491 \n536,404 \nRetail \n155,180 \n5,491 \n58,382 \n28,637 \n1,334 \n1,273 \n943 \n579 \n84 \n251,903 \nCorporates and other \ncustomers \n181,425 \n28,428 \n29,813 \n17,422 \n8,816 \n11,764 \n3,730 \n1,697 \n1,407 \n284,500 \nTrading liabilities \n332,969 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n332,969 \nTrading securities \n49,860 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n49,860 \nOther trading liabilities \n756 \n0 \n0 \n0 \n0 \n0 

In [11]:
print(len(report_paragraphs_source))
print(len(report_pages_source))
print(len(report_paragraphs))

12443
12443
12443


### Cleaning ingested data


In [12]:
from collections import defaultdict

grouped_text = defaultdict(list)
for text, source in zip(report_paragraphs, report_paragraphs_source):
    year = source.split('_')[0]      # e.g., '2024'
    bank = source.split('_')[1]      # e.g., 'Danske'
    grouped_text[(year, bank)].append(text)


# Group indices by document
doc_indices = defaultdict(list)
for idx, doc in enumerate(report_paragraphs_source):
    doc_indices[doc].append(idx)

# Indices to keep
indices_to_keep = set()

for doc, indices in doc_indices.items():
    seen = []
    for idx in indices:
        para = report_paragraphs[idx]
        # Check if this paragraph is very similar to any already seen (diff < 5 chars)
        if not any(abs(len(para) - len(other)) < 5 and sum(a != b for a, b in zip(para, other)) < 5 for other in seen):
            indices_to_keep.add(idx)
            seen.append(para)

# Sort indices to keep
indices_to_keep = sorted(indices_to_keep)

# Filter all lists
report_paragraphs = [report_paragraphs[i] for i in indices_to_keep]
report_paragraphs_source = [report_paragraphs_source[i] for i in indices_to_keep]
report_pages_source = [report_pages_source[i] for i in indices_to_keep]

In [13]:
print(len(report_paragraphs_source))
print(len(report_pages_source))
print(len(report_paragraphs))

8360
8360
8360


In [14]:
change_indices = [i for i in range(1, len(report_paragraphs_source)) if
                  report_paragraphs_source[i] != report_paragraphs_source[i - 1]]
for index in change_indices:
    print(report_paragraphs_source[index - 1])
    print(report_paragraphs_source[index])
    print(report_pages_source[index - 1])
    print(report_pages_source[index])
    print(report_paragraphs[index - 15:index])

2022_DeutscheBank_group.pdf
2021_DeutscheBank_group.pdf
212
84
['On \ndemand', '(incl. \nOvernight', 'and \none day', 'notice)', 'one \nmonth', 'to no \nmore', 'Over \n1 year \nbut no', 'than \n2 years', 'Over \n5 years \nTotal \nCash and central bank \nbalances¹ \n164,090 \n13,138 \n1,639 \n0 \n0 \n29 \n0 \n0 \n0 \n178,897 \nInterbank balances \n(w/o central banks)¹ \n6,315 \n265 \n181 \n83 \n166 \n181 \n0 \n0 \n6 \n7,195 \nCentral bank funds sold \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \nSecurities purchased under \nresale agreements \n9 \n2,646 \n3,990 \n356 \n519 \n895 \n1,721 \n1,342 \n0 \n11,478 \nWith banks \n3 \n305 \n869 \n22 \n5 \n600 \n1,626 \n1,322 \n0 \n4,750 \nWith customers \n6 \n2,342 \n3,121 \n334 \n514 \n295 \n95 \n21 \n0 \n6,728 \nSecurities borrowed \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \nWith banks \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \nWith customers \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \n0 \nFinancial assets at fair value \nthrough profit or loss \n410,982 \n

Imports from their modules

In [15]:
# === Cong et al. replication helpers ===
from engine import (
    calculate_word_frequencies,
    clean_and_normalize_text,
)

from TextualFactors import (
    NeighborFinder,
    EmbeddingCluster,
    TextualFactors,
    transfer_document_topics,
    transfer_sigular_values,
    transfer_topic_importances,
)


In [16]:
import pandas as pd

# Each paragraph = one "document" in Cong’s pipeline
df_docs = pd.DataFrame({
    "document": range(len(report_paragraphs)),
    "content": report_paragraphs,
    "source": report_paragraphs_source,
    "page": report_pages_source,
})

df_docs.head()


Unnamed: 0,document,content,source,page
0,0,59,2022_DeutscheBank_group.pdf,90
1,1,Deutsche Bank \nRisks and Opportunities \nAnn...,2022_DeutscheBank_group.pdf,90
2,2,"Technology, Data and Innovation",2022_DeutscheBank_group.pdf,90
3,3,Digital Innovation offers various revenue oppo...,2022_DeutscheBank_group.pdf,90
4,4,"To drive change, accelerate the adoption of te...",2022_DeutscheBank_group.pdf,90


In [17]:
# 3.1 Clean and normalize text using their function
df_docs = clean_and_normalize_text(df_docs, column_name="content")

# 3.2 Compute word frequencies per document using their function
df_docs = calculate_word_frequencies(df_docs, text_column="content")

# Now df_docs has:
# - 'tokens'    : list of tokens per document
# - 'word_freq' : Counter / dict {word: count} per document
df_docs[["document", "content", "tokens", "word_freq"]].head()


LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/sebastianwiegandmoller/nltk_data'
    - '/Users/sebastianwiegandmoller/PycharmProjects/Emerging-Credit-Risk/.venv/nltk_data'
    - '/Users/sebastianwiegandmoller/PycharmProjects/Emerging-Credit-Risk/.venv/share/nltk_data'
    - '/Users/sebastianwiegandmoller/PycharmProjects/Emerging-Credit-Risk/.venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
