# Importing Libraries

!pip install pdfplumber \
!pip install openai-whisper \
!sudo apt update && sudo apt install ffmpeg \
!pip install pdfplumber openai pandas tiktoken 

!pip install spacy \
!pip install bertopic 

!pip install openpyxl \
!pip install transformers \
!pip install tqdm \
pip install accelerate



In [1]:
# --- Standard Library ---
import os
import re
import shutil
import sys
from collections import Counter
from glob import glob

# --- Third-Party Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.io as pio
import pdfplumber
import torch
import requests
from tqdm import tqdm

# --- NLP Libraries ---
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from spacy.pipeline import EntityRuler

# --- Hugging Face Transformers ---
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BertTokenizer,
    BertForSequenceClassification,
    pipeline
)

# --- Sentence Transformers & BERTopic ---
# from sentence_transformers import SentenceTransformer, util
# from bertopic import BERTopic

# --- Clustering & Dimensionality Reduction ---
# from umap import UMAP
from hdbscan import HDBSCAN

# --- Google Colab / Google Drive ---
# from google.colab import drive
import gdown

# --- Setup ---
pio.renderers.default = "notebook"

# --- NLTK Downloads ---
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

# --- spaCy Model Loading ---
try:
    nlp = spacy.load("en_core_web_sm")
except:
    os.system("python -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# 🏢 HSBC - Presentation and Q&A Extraction
This section processes and extracts Presentation and Q&A data using BERT.


## Data Processing

In [None]:
# Internal titles pattern (dynamic, fully general)
internal_title_pattern = re.compile(
    r"(Group|Chief|Officer|CEO|CFO|Treasurer|Head|Director|Vice President|Finance|Investor Relations|Chairman|Chair|IR)",
    re.IGNORECASE
)

# Fully improved internal speaker detection:
def is_internal_speaker(institution):
    if not institution or institution.strip() == "":
        return True
    if re.search(r"hsbc", institution, re.IGNORECASE):
        return True
    return bool(internal_title_pattern.search(institution))

def is_external_speaker(institution):
    return not is_internal_speaker(institution)

def extract_qa_from_pdf(pdf_path, file_label=None):
    qa_data = []
    current_speaker = ""
    current_institution = ""
    current_text = []

    speaker_institutions = {}
    question_number = -1
    current_question_owner = None
    in_qa_section = False

    try:
        with pdfplumber.open(pdf_path) as pdf:
            first_page_text = pdf.pages[0].extract_text()

            # YEAR DETECTION
            year = None
            year_match = re.search(r"\b(20[1-3][0-9])\b", first_page_text)
            if year_match:
                year = int(year_match.group(1))

            # QUARTER DETECTION (fully robust)
            quarter = "Unknown"
            quarter_text = first_page_text.lower()

            post_results_match = re.search(r"post-results\s+([A-Za-z\s\-]+)", first_page_text, re.IGNORECASE)
            if post_results_match:
                quarter = "Post-Results " + post_results_match.group(1).strip()
            elif re.search(r"\bq\s*4\b", quarter_text):
                quarter = 4
            elif re.search(r"\bq\s*3\b", quarter_text):
                quarter = 3
            elif re.search(r"\bq\s*2\b", quarter_text):
                quarter = 2
            elif re.search(r"\bq\s*1\b", quarter_text):
                quarter = 1
            elif re.search(r"interim", quarter_text) and re.search(r"\bh[\s\-]*1\b", quarter_text):
                quarter = "Interim/H1"
            elif re.search(r"interim", quarter_text):
                quarter = "Interim"
            elif re.search(r"\bh[\s\-]*1\b", quarter_text):
                quarter = "H1"
            elif re.search(r"\bannual\b|\bfull\s*year\b", quarter_text):
                quarter = "Annual"

            for page in pdf.pages:
                text = page.extract_text()
                if not text:
                    continue
                lines = text.split("\n")

                for line in lines:
                    # Q&A section detection
                    line_clean = re.sub(r"[\W_]+", "", line.lower())
                    if not in_qa_section and re.search(r"(qa|questionsandanswers|questions)", line_clean):
                        in_qa_section = True
                        continue

                    # Flexible speaker matching
                    match_full = re.match(r"^([A-Z\s\.]+),\s*([A-Za-z\s&]+):\s*(.*)", line)
                    if match_full:
                        speaker = match_full.group(1).title()
                        institution = match_full.group(2).title()
                        remainder = match_full.group(3).strip()
                    else:
                        match_short = re.match(r"^([A-Z\s\.]+):\s*(.*)", line)
                        if match_short:
                            speaker = match_short.group(1).title()
                            institution = speaker_institutions.get(speaker, "")  # Use previous institution if known
                            remainder = match_short.group(2).strip()
                        else:
                            if line.strip():
                                current_text.append(line.strip())
                            continue

                    # Save previous block
                    if current_text:
                        flag_question = (
                            in_qa_section and current_speaker == current_question_owner
                        )
                        presentation = 1 if question_number in [None, -1] else 0

                        qa_data.append({
                            "File": file_label or os.path.basename(pdf_path),
                            "Bank Name": "HSBC",
                            "Year": year,
                            "Quarter": quarter,
                            "Speaker name": current_speaker,
                            "Institution": current_institution,
                            "Speaker text": " ".join(current_text),
                            "flag_question": flag_question,
                            "Question No": question_number if in_qa_section else None,
                            "presentation": presentation
                        })
                        current_text = []

                    # Update current speaker
                    current_speaker = speaker
                    current_institution = institution
                    speaker_institutions[current_speaker] = current_institution

                    # Dynamically switch to Q&A when first external detected
                    if not in_qa_section and is_external_speaker(current_institution):
                        in_qa_section = True

                    # FULLY FIXED QUESTION NUMBERING LOGIC 
                    if in_qa_section:
                        if is_external_speaker(current_institution):
                            if current_question_owner != current_speaker:
                                question_number += 1
                                current_question_owner = current_speaker

                    if remainder:
                        current_text.append(remainder)

            # Save final block
            if current_text:
                flag_question = (
                    in_qa_section and current_speaker == current_question_owner
                )
                presentation = 1 if question_number in [None, -1] else 0

                qa_data.append({
                    "File": file_label or os.path.basename(pdf_path),
                    "Bank Name": "HSBC",
                    "Year": year,
                    "Quarter": quarter,
                    "Speaker name": current_speaker,
                    "Institution": current_institution,
                    "Speaker text": " ".join(current_text),
                    "flag_question": flag_question,
                    "Question No": question_number if in_qa_section else None,
                    "presentation": presentation
                })

    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")

    return qa_data

def process_all_pdfs(root_dir):
    all_results = []

    for root, _, files in os.walk(root_dir):
        for file in files:
            if file.lower().endswith(".pdf"):
                full_path = os.path.join(root, file)
                rel_path = os.path.relpath(full_path, root_dir)
                print(f"Processing: {rel_path}")
                qa_rows = extract_qa_from_pdf(full_path, file_label=rel_path)
                all_results.extend(qa_rows)

    df_all = pd.DataFrame(all_results)
    df_all = df_all[df_all['Speaker name'].str.strip() != ""]
    df_all.to_csv("combined_all_qas.csv", index=False)
    return df_all

# # USAGE
# df = process_all_pdfs(r"C:\Users\User\Desktop\CAM_BoE_GitHub\team-42\HSBC\HSBC Presentation texts")

#reading files from drive

In [3]:
# Step 1: Download the entire folder recursively using gdown
folder_url = "https://drive.google.com/drive/folders/1K-LnQdiKCxkzjQOjXcfhEjCM0McaDBuT"
local_dir = "./hsbc_pdfs"

# Only download if not already done
if not os.path.exists(local_dir):
    gdown.download_folder(url=folder_url, output=local_dir, quiet=False, use_cookies=False)

# Step 2: Your existing PDF processing logic
def is_internal_speaker(institution):
    if not institution or institution.strip() == "":
        return True
    if re.search(r"hsbc", institution, re.IGNORECASE):
        return True
    return bool(re.search(r"(Group|Chief|Officer|CEO|CFO|Treasurer|Head|Director|Vice President|Finance|Investor Relations|Chairman|Chair|IR)", institution, re.IGNORECASE))

def is_external_speaker(institution):
    return not is_internal_speaker(institution)

# (Paste your full extract_qa_from_pdf function here unchanged)

# Step 3: Walk through downloaded folder recursively
def process_all_pdfs(root_dir):
    all_results = []

    for root, _, files in os.walk(root_dir):
        for file in files:
            if file.lower().endswith(".pdf"):
                full_path = os.path.join(root, file)
                rel_path = os.path.relpath(full_path, root_dir)
                print(f"Processing: {rel_path}")
                qa_rows = extract_qa_from_pdf(full_path, file_label=rel_path)
                all_results.extend(qa_rows)

    df_all = pd.DataFrame(all_results)
    df_all = df_all[df_all['Speaker name'].str.strip() != ""]
    df_all.to_csv("combined_all_qas.csv", index=False)
    return df_all

# Run the whole pipeline
df = process_all_pdfs(local_dir)


Processing: Presentation Transcripts\2021\210427-1q-2021-presentation-to-investors-and-analysts-transcript.pdf
Processing: Presentation Transcripts\2021\210803-interim-results-2021-presentation-to-investors-and-analysts-transcript.pdf
Processing: Presentation Transcripts\2021\210810-fixed-income-call-transcript interim.pdf
Processing: Presentation Transcripts\2021\211025-3q-2021-presentation-to-investors-and-analysts-transcript.pdf
Processing: Presentation Transcripts\2021\220224-hsbc-fixed-income-call-2021-annual-results-22-02-20-final.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Processing: Presentation Transcripts\2021\220228-fy-2021-equity-analyst-meeting-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Processing: Presentation Transcripts\2022\220223-annual-results-2022-fixed-income-investor-presentation-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Processing: Presentation Transcripts\2022\220802-interim-results-2022-presentation-to-investors-and-analysts-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Processing: Presentation Transcripts\2022\220808-interim-results-2022-presentation-to-fixed-income-investors-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Processing: Presentation Transcripts\2022\220902-h1-2022-equity-analyst-meeting-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Processing: Presentation Transcripts\2022\221026-3q-2022-presentation-to-investors-and-analysts-transcript.pdf
Processing: Presentation Transcripts\2022\230221-annual-results-2022-presentation-to-investors-and-analysts-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Processing: Presentation Transcripts\2022\230228-annual-results-2022-fy-2022-equity-analysts-meeting-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Processing: Presentation Transcripts\2023\230502-1q-2023-hsbc-presentation-to-investors-and-analysts-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Processing: Presentation Transcripts\2023\230802-interim-results-2023-presentation-to-investors-and-analysts-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Processing: Presentation Transcripts\2023\230908-interim-results-2023-fixed-income-investors-conference-call-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Processing: Presentation Transcripts\2023\231030-3q-2023-presentation-to-investors-and-analysts-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Processing: Presentation Transcripts\2023\240222-annual-results-2023-presentation-to-investors-and-analysts-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Processing: Presentation Transcripts\2023\240229-fy-2023-equity-analysts-meeting-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Processing: Presentation Transcripts\2024\240502-1q-2024-hsbc-presentation-to-investors-and-analysts-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Processing: Presentation Transcripts\2024\240801-interim-results-2024-presentation-to-fixed-income-investors-transcript.pdf
Processing: Presentation Transcripts\2024\240801-interim-results-2024-presentation-to-investors-and-analysts-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Processing: Presentation Transcripts\2024\240806-interim-results-2024-equity-analysts-meeting-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Processing: Presentation Transcripts\2024\241029-q3-results-to-investors-and-analysts-transcript.pdf
Processing: Presentation Transcripts\2024\250220-annual-results-to-investors-and-analysts-transcript.pdf
Processing: Presentation Transcripts\2024\250221-fy-2024-fixed-income-investor-call-transcript.pdf
Processing: Presentation Transcripts\2024\250228-fy-2024-equity-analysts-meeting-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Processing: Presentation Transcripts\2025\250429-1q-2025-earnings-release-investors-and-analysts-call-transcript.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


In [4]:
df.head()

Unnamed: 0,File,Bank Name,Year,Quarter,Speaker name,Institution,Speaker text,flag_question,Question No,presentation
1,Presentation Transcripts\2021\210427-1q-2021-p...,HSBC,2021,1,Noel Quinn,Group Chief Executive,Good morning in London and good afternoon in H...,False,,1
2,Presentation Transcripts\2021\210427-1q-2021-p...,HSBC,2021,1,Ewen Stevenson,Group Chief Financial Officer,"Thanks, Noel, and good morning or afternoon, a...",False,-1.0,1
3,Presentation Transcripts\2021\210427-1q-2021-p...,HSBC,2021,1,Ed Firth,Kbw,"Good morning, everybody. I just had two questi...",True,0.0,0
4,Presentation Transcripts\2021\210427-1q-2021-p...,HSBC,2021,1,Ewen Stevenson,Group Chief Financial Officer,"On restructuring charges, we’re not changing o...",False,0.0,0
5,Presentation Transcripts\2021\210427-1q-2021-p...,HSBC,2021,1,Ed Firth,Kbw,"Based on what we can see, the bulk of those so...",True,0.0,0


## BERT and FinBert Topics

In [6]:
# --- Q&A filler phrases ---
qa_filler_phrases = [
    "thanks", "thank you", "appreciate", "question", "questions", "ask", "asking", "follow-up", 
    "couple of questions", "good morning", "good afternoon", "hello", "thanks for taking my question",
    "thanks for the call", "thank you for your time", "joining us", "line is open", "line", 
    "hand over", "presentation", "prepared remarks", "pick up with ir team", "management remarks", 
    "closing remarks", "just wondering", "can you talk a bit more about", "could you elaborate", 
    "as you mentioned", "as you said", "i guess", "i wonder if", "may i ask", "i was going to ask",
    "one more question", "a quick clarification", "sorry to go on", "very helpful", "hope that makes sense",
    "buyback", "payout ratio", "dividend policy", "progressive dividend", "distribution policy", 
    "capital management", "common equity tier one", "cet1", "surplus capital", "capital efficiency",
    "loan growth", "lending volume", "mortgage growth", "unsecured lending", "credit card balances",
    "loan demand", "pipeline growth", "mortgage book", "consumer lending", "personal banking",
    "variable pay", "compensation pool", "performance-related pay", "transformation cost savings",
    "management focus on costs", "broadly flat cost", "cost inflation", "hiring plans", "competitive pressures",
    "t&e", "travel expense", "operating expenses", "uk", "hong kong", "asia", "us", "europe", 
    "mainland china", "greater bay area"
]

# --- Hedge words ---
hedge_verbs = [
    "think", "know", "see", "believe", "wonder", "expect", "assume", "guess", 
    "probably", "maybe", "feel", "suppose", "suggest", "consider", "estimate", 
    "intend", "anticipate", "imagine", "hope", "trying", "might", "would", "could", "should"
]

# --- Financial stopwords ---
financial_stopwords = [
    "basis", "points", "million", "billion", "percent", "percentage", "cost", "costs", "capital", 
    "equity", "return", "dividend", "buyback", "allocation", "guidance", "expenditure", "earnings", 
    "profit", "margin", "revenue", "interest", "hedging", "hedge", "swap", "currency", "fx", 
    "sensitivity", "structural", "stack", "tier", "rwas", "provisions", "charges", "impairment", 
    "grandfathering", "issuance", "liquidity", "loan", "loans", "mortgage", "deposit", "assets", 
    "lending", "borrowers", "defaults", "exposure", "collateral", "refinancing", "provisioning", 
    "counterparty", "wealth", "insurance", "private", "trade", "transaction", "client", "portfolio", 
    "invested", "funding", "treasury", "surplus", "liabilities", "compliance", "legacy", 
    "volatility", "ratios", "metrics", "leverage", "valuation", "scenario", "assumptions", 
    "calculations", "underlying", "sovereign", "macroeconomic"
]

# --- Function words ---
function_stopwords = [
    "we", "have", "has", "had", "were", "was", "are", "is", "it's", "its", "being", "been", 
    "do", "does", "doing", "did", "can", "may", "might", "come", "comes", "going"
]

# ==========================
# BUILD FINAL STOPWORDS LIST
# ==========================

lemmatizer = WordNetLemmatizer()
base_stopwords = set(stopwords.words("english"))
domain_stopwords = set(qa_filler_phrases + hedge_verbs + financial_stopwords + function_stopwords)
all_stopwords = base_stopwords.union(domain_stopwords)

# Load speaker names dynamically and add
speaker_names = df['Speaker name'].dropna().unique().tolist()
speaker_parts = set()
for name in speaker_names:
    for part in name.split():
        speaker_parts.add(part.lower())

all_stopwords = all_stopwords.union(speaker_parts)

# ==========================
# CLEANING FUNCTION
# ==========================

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in all_stopwords and len(w) > 2]
    return ' '.join(tokens)

# ==========================
# FILTER Q&A (presentation == 0)
# ==========================

df_qa = df[df['presentation'] == 0].copy()
df_qa['Speaker text cleaned'] = df_qa['Speaker text'].astype(str).apply(clean_text)
df_qa = df_qa[df_qa['Speaker text cleaned'].str.split().apply(len) >= 5]

# ==========================
# EMBEDDING & TOPIC MODELING
# ==========================

device = "cuda" if torch.cuda.is_available() else "cpu"
device_index = 0 if device == "cuda" else -1

embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)
embeddings = embedding_model.encode(df_qa['Speaker text cleaned'].tolist(), show_progress_bar=True, batch_size=32)

umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.1, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    top_n_words=15,
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(df_qa['Speaker text cleaned'].tolist(), embeddings=embeddings)
df_qa['Topic'] = [0 if t == -1 else t for t in topics]
df_qa['Top_15_Words'] = [", ".join([w for w, _ in topic_model.get_topic(topic)[:15]]) for topic in topics]

# ==========================
# VISUALIZATION
# ==========================

fig_bar = topic_model.visualize_barchart(top_n_topics=10)
fig_bar.show()

try:
    fig_topics = topic_model.visualize_topics()
    fig_topics.show()
except Exception as e:
    print(f"BERTopic topic map error: {e}")

# ==========================
# FINBERT SENTIMENT
# ==========================

finbert_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
finbert_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert").to(device)
finbert_pipeline = pipeline("sentiment-analysis", model=finbert_model, tokenizer=finbert_tokenizer, device=device_index)

sentiments = []
BATCH_SIZE = 32
for i in tqdm(range(0, len(df_qa), BATCH_SIZE)):
    batch = df_qa['Speaker text cleaned'].iloc[i:i+BATCH_SIZE].tolist()
    batch_sentiments = finbert_pipeline(batch, truncation=True, max_length=512, padding=True)
    sentiments.extend(batch_sentiments)

df_qa['FinBERT_Sentiment'] = [x['label'] for x in sentiments]
df_qa['FinBERT_Score'] = [x['score'] for x in sentiments]

# ==========================
# FINAL OUTPUT
# ==========================

pd.set_option('display.max_columns', None)
print(df_qa[['Speaker text', 'Speaker text cleaned', 'Topic', 'Top_15_Words', 'FinBERT_Sentiment', 'FinBERT_Score']].head())


NameError: name 'SentenceTransformer' is not defined

In [None]:
df_qa.FinBERT_Sentiment.value_counts()/df_qa.shape[0]

neutral     0.774468
positive    0.175532
negative    0.050000
Name: FinBERT_Sentiment, dtype: float64

In [None]:
# --- Fully unstack-free, pivot-free, bulletproof aggregation ---

# Build the counts
sentiment_counts = df_qa.groupby(['Topic', 'FinBERT_Sentiment']).size().reset_index(name='count')

# Initialize full dataframe of all Topics
all_topics = pd.DataFrame({'Topic': df_qa['Topic'].unique()}).sort_values('Topic')

# Merge manually per sentiment
for sentiment in ['positive', 'neutral', 'negative']:
    temp = sentiment_counts[sentiment_counts['FinBERT_Sentiment'] == sentiment][['Topic', 'count']].rename(columns={'count': sentiment})
    all_topics = all_topics.merge(temp, on='Topic', how='left')

# Replace any missing counts with 0
all_topics = all_topics.fillna(0).astype({'positive': 'int', 'neutral': 'int', 'negative': 'int'})

# Totals for safe division
total_pos = all_topics['positive'].sum()
total_neu = all_topics['neutral'].sum()
total_neg = all_topics['negative'].sum()

# Calculate percentages
all_topics['%_positive'] = (all_topics['positive'] / total_pos * 100).round(2) if total_pos else 0
all_topics['%_neutral']  = (all_topics['neutral']  / total_neu * 100).round(2) if total_neu else 0
all_topics['%_negative'] = (all_topics['negative'] / total_neg * 100).round(2) if total_neg else 0

# Attach BERTopic words
all_topics['Top_15_Words'] = df_qa.groupby('Topic')['Top_15_Words'].first().reindex(all_topics['Topic'].values).values

# Reorder
cols = [
    'Top_15_Words',
    'positive', '%_positive',
    'neutral', '%_neutral',
    'negative', '%_negative'
]
final_table = all_topics[cols].reset_index(drop=True)

# Display
pd.set_option('display.float_format', '{:,.2f}'.format)

final_table.to_csv("hsbc_final_sentiment_analysis.csv", index=False)
final_table


NameError: name 'df_qa' is not defined

In [None]:
# df.to_csv("hsbc_qa_analysis.csv", index=False)

## Phi-4

In [5]:
import warnings
warnings.filterwarnings("ignore")

import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from tqdm import tqdm
import gc
import re

# Setup
torch.backends.cudnn.benchmark = True
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct", trust_remote_code=True)
tokenizer.padding_side = 'left'
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

if torch.cuda.is_available():
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-4-mini-instruct",
        torch_dtype=torch.float16,
        trust_remote_code=True,
        device_map="cuda",
        low_cpu_mem_usage=True
    )
else:
    print("WARNING: No GPU detected, using CPU (slow)")
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-4-mini-instruct",
        torch_dtype=torch.float32,
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

generation_args = {
    "max_new_tokens": 120,  # Increased to allow complete sentences
    "return_full_text": False,
    "do_sample": True,
    "temperature": 0.3,
    "pad_token_id": tokenizer.eos_token_id,
    "use_cache": True
}

batch_size = 16 if torch.cuda.is_available() else 2

# Load your DataFrame
# df = pd.read_csv("your_file.csv")

# FIXED: Proper Q&A Pair Extraction based on Year, Quarter, Question No, and presentation==0
print("Extracting Q&A pairs based on Year, Quarter, Question No grouping (Q&A session only)...")

# First, clean and prepare the data
df['Question No'] = pd.to_numeric(df['Question No'], errors='coerce')

# Filter for Q&A session only (presentation==0) and valid Question No
qa_session_data = df[(df['presentation'] == 0) & (df['Question No'].notna())]

# Group by Year, Quarter, Question No to get proper Q&A pairs
qa_pairs = []

# Get unique question groups from Q&A session only
question_groups = qa_session_data.groupby(['Year', 'Quarter', 'Question No'])

for (year, quarter, q_no), group in question_groups:
    # Sort by index to maintain order within each question group
    group_sorted = group.sort_index()
    
    # Separate questions and answers
    questions = group_sorted[group_sorted['flag_question'] == True]
    answers = group_sorted[group_sorted['flag_question'] == False]
    
    # Skip if no questions or answers
    if len(questions) == 0 or len(answers) == 0:
        continue
    
    # Combine all questions in this group
    combined_questions = " ".join(questions['Speaker text'].astype(str))
    question_indices = questions.index.tolist()
    
    # Combine all answers in this group  
    combined_answers = " ".join(answers['Speaker text'].astype(str))
    answer_indices = answers.index.tolist()
    
    # Create Q&A pair
    qa_pairs.append({
        "questions": combined_questions,
        "answer": combined_answers,
        "question_indices": question_indices,
        "answer_indices": answer_indices,
        "year": year,
        "quarter": quarter,
        "question_no": q_no
    })

print(f"Found {len(qa_pairs)} Q&A pairs from Q&A session only (presentation==0)")

# Debug: Show some examples of the grouping
if len(qa_pairs) > 0:
    print(f"\nFiltered to Q&A session: {len(qa_session_data)} rows from {len(df)} total rows")
    print("First few Q&A pairs:")
    for i, pair in enumerate(qa_pairs[:3]):
        print(f"  Pair {i+1}: Year={pair['year']}, Quarter={pair['quarter']}, Q_No={pair['question_no']}")
        print(f"    Questions: {len(pair['question_indices'])} rows, Answers: {len(pair['answer_indices'])} rows")
        print(f"    Question preview: {pair['questions'][:100]}...")
        print(f"    Answer preview: {pair['answer'][:100]}...")
        print()
else:
    print("No Q&A pairs found! Check if presentation==0 filter is correct.")
    print(f"Presentation values in data: {sorted(df['presentation'].unique())}")
    print(f"Rows with presentation==0: {len(df[df['presentation'] == 0])}")
    print(f"Rows with valid Question No: {len(df[df['Question No'].notna()])}")
    print(f"Rows with both conditions: {len(qa_session_data)}")

# FIXED: Better Prompt Design for Complete Sentences
def create_prompt(question_text, answer_text):
    # Truncate to reasonable lengths
    q_truncated = question_text[:400] if len(question_text) > 400 else question_text
    a_truncated = answer_text[:500] if len(answer_text) > 500 else answer_text
    
    prompt = f"""Analyze this Q&A exchange and provide exactly the following format:

INSIGHT: [summarizing key insight from the answer - what was revealed, decided, or explained]. [keep short and complete sentence].
RISK: [Yes/No/Unclear]  
ANSWERED: [Complete/Partial/None]

Question: {q_truncated}

Answer: {a_truncated}

Analysis:"""
    
    return prompt

# Build prompts
prompts = []
for pair in qa_pairs:
    prompt = create_prompt(pair["questions"], pair["answer"])
    prompts.append(prompt)

print(f"Created {len(prompts)} prompts for processing")

# FIXED: Better batch processing with error handling
print("Processing batches...")
outputs_all = []

with torch.no_grad():
    for i in tqdm(range(0, len(prompts), batch_size), desc="Processing batches"):
        batch = prompts[i:i + batch_size]
        
        try:
            outputs = pipe(batch, **generation_args)
            batch_outputs = []
            
            for out in outputs:
                if isinstance(out, list) and len(out) > 0:
                    batch_outputs.append(out[0]['generated_text'].strip())
                else:
                    batch_outputs.append("Processing failed")
                    
            outputs_all.extend(batch_outputs)
            
        except Exception as e:
            print(f"Batch {i//batch_size} failed: {e}")
            # Process individually on failure
            for prompt in batch:
                try:
                    output = pipe(prompt, **generation_args)
                    if isinstance(output, list) and len(output) > 0:
                        outputs_all.append(output[0]['generated_text'].strip())
                    else:
                        outputs_all.append("Processing failed")
                except Exception as e2:
                    print(f"Individual prompt failed: {e2}")
                    outputs_all.append("Processing failed")

        # Memory cleanup
        if i % (batch_size * 4) == 0 and torch.cuda.is_available():
            torch.cuda.empty_cache()

# FIXED: Better parsing functions
def extract_summary(text):
    """Extract key insight from LLM output"""
    # Look for INSIGHT: pattern
    patterns = [
        r"INSIGHT:\s*([^\n]+)",
        r"Insight:\s*([^\n]+)", 
        r"insight:\s*([^\n]+)",
        r"SUMMARY:\s*([^\n]+)",
        r"Summary:\s*([^\n]+)"
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            insight = match.group(1).strip()
            # Remove any trailing punctuation artifacts and clean up
            insight = re.sub(r'\s+', ' ', insight)  # Clean multiple spaces
            insight = insight.rstrip('.,;:')  # Remove trailing punctuation
            
            # Ensure it's a reasonable length (not too long, not too short)
            words = insight.split()
            if len(words) > 20:  # If too long, truncate but keep complete thought
                insight = " ".join(words[:20])
            elif len(words) < 3:  # If too short, try fallback
                break
            
            return insight.strip()
    
    # Fallback: extract first meaningful line that forms a complete thought
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    for line in lines:
        if not any(keyword in line.lower() for keyword in ['risk', 'answer', 'complete', 'partial', 'analysis']):
            # Clean the line
            line = re.sub(r'\s+', ' ', line)
            words = line.split()
            if len(words) >= 5:  # Ensure meaningful length
                # Take up to 20 words but try to end at sentence boundary
                if len(words) <= 20:
                    return line.rstrip('.,;:')
                else:
                    truncated = " ".join(words[:20])
                    # Try to end at a natural break
                    if '.' in truncated:
                        return truncated.split('.')[0] + '.'
                    return truncated
    
    return "No clear insight extracted"

def extract_risk(text):
    """Extract risk assessment from LLM output only"""
    # Look for RISK: pattern from LLM response
    patterns = [
        r"RISK:\s*(Yes|No|Unclear)",
        r"Risk:\s*(Yes|No|Unclear)",
        r"risk:\s*(yes|no|unclear)"
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1).capitalize()
    
    # If no clear pattern found, return what the LLM likely intended
    # Look for Yes/No in the text without keyword assumptions
    if re.search(r'\byes\b', text, re.IGNORECASE):
        return "Yes"
    elif re.search(r'\bno\b', text, re.IGNORECASE):
        return "No"
    
    return "Unclear"

def extract_coverage(text):
    """Extract answer completeness from LLM output only"""
    # Look for ANSWERED: pattern from LLM response
    patterns = [
        r"ANSWERED:\s*(Complete|Partial|None)",
        r"Answered:\s*(Complete|Partial|None)",
        r"answered:\s*(complete|partial|none)"
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            result = match.group(1).capitalize()
            return "Fully answered" if result == "Complete" else f"{result}ly answered" if result == "Partial" else "Not answered"
    
    # If no clear pattern found, look for the exact terms the LLM might use
    if re.search(r'\bcomplete\b', text, re.IGNORECASE):
        return "Fully answered"
    elif re.search(r'\bpartial\b', text, re.IGNORECASE):
        return "Partially answered"
    elif re.search(r'\bnone\b', text, re.IGNORECASE):
        return "Not answered"
    
    return "Unclear"

# Initialize all columns with empty values - let LLM decide everything
print("Initializing result columns...")
df["key_findings"] = ""
df["risk_or_distress"] = ""
df["answer_coverage"] = ""

# OPTIMIZED: Single loop for mapping and counting
print("Mapping results to dataframe...")
risk_detected_pairs = 0
fully_answered_pairs = 0
valid_insight_pairs = 0

for i, (pair, llm_output) in enumerate(zip(qa_pairs, outputs_all)):
    if llm_output == "Processing failed":
        continue
        
    # Extract insights ONCE
    summary = extract_summary(llm_output)
    risk = extract_risk(llm_output)
    coverage = extract_coverage(llm_output)
    
    # Count pair statistics
    if risk == "Yes":
        risk_detected_pairs += 1
    if coverage == "Fully answered":
        fully_answered_pairs += 1
    if len(summary.split()) >= 3 and summary not in ["", "No clear insight extracted", "Processing failed"]:
        valid_insight_pairs += 1
    
    # Apply to ALL question rows in this group
    for q_idx in pair["question_indices"]:
        if q_idx < len(df):
            df.loc[q_idx, "key_findings"] = summary
            df.loc[q_idx, "risk_or_distress"] = risk
            df.loc[q_idx, "answer_coverage"] = coverage
    
    # Apply to ALL answer rows in this group
    for a_idx in pair["answer_indices"]:
        if a_idx < len(df):
            df.loc[a_idx, "key_findings"] = summary
            df.loc[a_idx, "risk_or_distress"] = risk
            df.loc[a_idx, "answer_coverage"] = coverage

# Add validity flag
df["valid_summary"] = df["key_findings"].apply(
    lambda x: len(str(x).split()) >= 3 and 
             str(x) not in ["", "No clear insight extracted", "Processing failed"] and
             not str(x).lower().startswith(('analysis', 'the ', 'this '))
)

# Summary statistics
print("\n=== PROCESSING SUMMARY ===")
print(f"Total Q&A pairs processed: {len(qa_pairs)}")
successful_analyses = len([o for o in outputs_all if o != 'Processing failed'])
print(f"Successful analyses: {successful_analyses}")
print(f"Pairs with valid insights: {valid_insight_pairs}")
print(f"Risk detected in: {risk_detected_pairs} pairs")
print(f"Fully answered: {fully_answered_pairs} pairs")

# Cleanup model to free memory before JSON processing
del model, pipe
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# JSON OUTPUT CREATION (AFTER model cleanup to avoid memory issues)
print("\nCreating JSON output...")

import json
from datetime import datetime

def safe_convert(obj):
    """Convert all data types to JSON-safe types"""
    if pd.isna(obj) or obj is None:
        return None
    elif isinstance(obj, (int, float, str, bool)):
        return obj
    elif hasattr(obj, 'item'):  # numpy types
        return obj.item()
    elif hasattr(obj, 'tolist'):  # numpy arrays
        return obj.tolist()
    else:
        return str(obj)

# Create JSON structure
json_output = {
    "metadata": {
        "processing_date": datetime.now().isoformat(),
        "model_used": "microsoft/Phi-4-mini-instruct",
        "total_qa_pairs": len(qa_pairs),
        "successful_analyses": successful_analyses,
        "processing_summary": {
            "pairs_with_valid_insights": valid_insight_pairs,
            "risk_detected_pairs": risk_detected_pairs,
            "fully_answered_pairs": fully_answered_pairs
        }
    },
    "qa_analyses": []
}

# Add each Q&A pair analysis
for i, (pair, llm_output) in enumerate(zip(qa_pairs, outputs_all)):
    qa_analysis = {
        "qa_pair_id": i + 1,
        "grouping_info": {
            "year": safe_convert(pair["year"]),
            "quarter": safe_convert(pair["quarter"]),
            "question_no": safe_convert(pair["question_no"])
        },
        "question_text": str(pair["questions"]),
        "answer_text": str(pair["answer"]),
        "raw_llm_output": str(llm_output),
        "extracted_results": {
            "key_findings": str(extract_summary(llm_output) if llm_output != "Processing failed" else ""),
            "risk_or_distress": str(extract_risk(llm_output) if llm_output != "Processing failed" else ""),
            "answer_coverage": str(extract_coverage(llm_output) if llm_output != "Processing failed" else "")
        },
        "processing_status": "success" if llm_output != "Processing failed" else "failed",
        "question_indices": [safe_convert(idx) for idx in pair["question_indices"]],
        "answer_indices": [safe_convert(idx) for idx in pair["answer_indices"]]
    }
    json_output["qa_analyses"].append(qa_analysis)

# Save JSON file
json_filename = f"qa_analysis_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

with open(json_filename, 'w', encoding='utf-8') as f:
    json.dump(json_output, f, indent=2, ensure_ascii=False, default=safe_convert)

print(f"JSON output saved to: {json_filename}")

print("\nProcessing complete. Files created:")
print("1. Updated CSV with columns: key_findings, risk_or_distress, answer_coverage, valid_summary")
print(f"2. JSON analysis file: {json_filename}")

# Optional: Save updated CSV
# df.to_csv("improved_qa_analysis.csv", index=False)

Loading model...


Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.60s/it]
Device set to use cuda


Extracting Q&A pairs based on Year, Quarter, Question No grouping (Q&A session only)...
Found 219 Q&A pairs from Q&A session only (presentation==0)

Filtered to Q&A session: 1065 rows from 1141 total rows
First few Q&A pairs:
  Pair 1: Year=2021, Quarter=1, Q_No=0.0
    Questions: 2 rows, Answers: 2 rows
    Question preview: Good morning, everybody. I just had two questions. The first was on capital. I was surprised that th...
    Answer preview: On restructuring charges, we’re not changing our full-year guidance we gave at full-year results. Yo...

  Pair 2: Year=2021, Quarter=1, Q_No=1.0
    Questions: 1 rows, Answers: 1 rows
    Question preview: Morning. Just a couple of questions. The first one is on margins. You gave colour on this, Ewen, dur...
    Answer preview: On NIM, it was, I think, almost exclusively driven by the shift in yield curves. We’ve broadly repri...

  Pair 3: Year=2021, Quarter=1, Q_No=2.0
    Questions: 3 rows, Answers: 4 rows
    Question preview: Good morni

Processing batches:  21%|██▏       | 3/14 [11:16<41:19, 225.42s/it]


KeyboardInterrupt: 

In [74]:
df.to_csv("hsbc_final_qa_analysis.csv", index=False)

In [77]:
import torch
import gc

# Clear GPU cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    torch.cuda.ipc_collect()

# Force garbage collection
gc.collect()

0

In [68]:
df.to_csv("hsbc_final_qa_analysis_.csv", index=False)

# 🏦 Deutsche Bank - Data Processing
This section focuses on Deutsche Bank's data processing pipeline.


## Downloading PDFs

In [None]:
# URL to your Drive folder
folder_url = "https://drive.google.com/drive/folders/1rqP0g6AFk0Uu6gprstKnxXLo6XKXnIUM"

# 1. Download all files from the folder URL
# gdown.download_folder with just the URL downloads files to a temp dir and returns a list of file paths.
downloaded_files = gdown.download_folder(url=folder_url, quiet=False, use_cookies=False)

# 2. Move the downloaded files from their temporary location to the current working directory
# downloaded_files is a list of paths to the downloaded files in a temporary directory.
if downloaded_files:
    # Assuming all files are downloaded to the same parent temporary directory
    # We can get the parent directory from the first downloaded file path
    temp_output_folder = os.path.dirname(downloaded_files[0]) if downloaded_files else None

    if temp_output_folder and os.path.exists(temp_output_folder):
        for filename in os.listdir(temp_output_folder):
            src = os.path.join(temp_output_folder, filename)
            dst = os.path.join(".", filename)
            if os.path.isfile(src):
                shutil.move(src, dst)
        # Clean up the temporary directory after moving files
        try:
            os.rmdir(temp_output_folder)
            print(f"Removed temporary directory: {temp_output_folder}")
        except OSError as e:
            print(f"Could not remove temporary directory {temp_output_folder}: {e}")
    elif not downloaded_files:
        print("No files were downloaded from the folder.")
    else:
        print(f"Temporary download directory not found: {temp_output_folder}")


In [None]:
# Constant config
bank_name = "Deutsche Bank"
output_excel_presentations = "presentation_slides_with_titles.xlsx"
output_excel_qa = "qa_speaker_blocks_final.xlsx"

In [None]:
deutsche_bank_speakers = {"Christian Sewing", "Ioana Patriniche", "James von Moltke", "Silke-Nicole Szypa"}

In [None]:
# Step 1: Extract text from PDF, skipping first page
def extract_text(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            if i == 0:
                continue
            page_text = page.extract_text(layout=True)
            if page_text:
                text += page_text + "\n"
    return text

In [None]:
# Step 2: Cut before Q&A
def extract_presentation_section(text):
    split = re.split(r"Questions? & Answers?", text, flags=re.IGNORECASE)
    return split[0].strip() if split else ""

In [None]:
# Step 3: Parse slides with titles and create data frame for presentation
def extract_slides_with_titles(text):
    slide_pattern = re.compile(r"(Slide\s\d+\s[–-]\s.+)")
    slides = []
    parts = slide_pattern.split(text)
    for i in range(1, len(parts), 2):
        title = parts[i].strip()
        body = parts[i + 1].strip() if i + 1 < len(parts) else ""

        # Remove empty lines and page numbers
        lines = body.splitlines()
        cleaned = [line.strip() for line in lines if line.strip() and not re.match(r"^\d{1,3}$", line.strip())]

        # Combine to single string for bullet logic
        combined = " ".join(cleaned)

        # Insert line breaks before each bullet-style phrase: "- Capital..."
        # But ensure we keep the very first `-` if it's already there
        combined = re.sub(r"\s*-\s+(?=[A-Z])", r"\n- ", combined)

        # Remove accidental double bullets
        combined = re.sub(r"^- -", r"-", combined, flags=re.MULTILINE)

        slides.append((title, combined.strip()))

    return slides

In [None]:
# Step 4: Build DataFrame
def build_slide_df(slides, year, quarter, filename):
    return pd.DataFrame([{
        "File": filename,
        "Bank Name": bank_name,
        "Year": year,
        "Quarter": quarter,
        "Slide_title": title,
        "Text": text
    } for title, text in slides])

In [None]:
# Step 5: Extract Year/Quarter from filename
def extract_year_quarter(filename):
    match = re.search(r"(Q[1-4])[-_](\d{4})", filename)
    if match:
        quarter_str, year = match.group(1), int(match.group(2))
        quarter = int(quarter_str[1])  # Convert 'Q1' → 1
        return year, quarter
    return None, None

In [None]:
# Step 6: Isolate Q&A section
def extract_qa_section(text):
    match = re.search(r"(Questions? & Answers?.*)", text, re.IGNORECASE | re.DOTALL)
    return match.group(1) if match else ""

In [None]:
# Step 7: Create the Q&A part
# 7.1. Parse speaker blocks (speaker name + speaker text)
def parse_speaker_blocks(text):
    lines = text.splitlines()
    speaker_blocks = []
    current_speaker = None
    current_text = ""

    speaker_line_pattern = re.compile(r"^\s{2,}([A-Z][a-z]+(?: [A-Z][a-z]+)+)\s{2,}(.+)?$")

    for line in lines:
        line = line.rstrip()
        if not line.strip():
            continue

        match = speaker_line_pattern.match(line)
        if match:
            if current_speaker and current_text:
                speaker_blocks.append((current_speaker.strip(), current_text.strip()))
                current_text = ""
            current_speaker = match.group(1)
            first_line = match.group(2) or ""
            current_text = first_line.strip()
        else:
            current_text += " " + line.strip()

    if current_speaker and current_text:
        speaker_blocks.append((current_speaker.strip(), current_text.strip()))

    return speaker_blocks



In [None]:
############################################
import re

def parse_speaker_blocks(text):
    lines = text.splitlines()
    speaker_blocks = []
    current_speaker = None
    current_text = []

    # This pattern matches lines like "Chris Hallam (Goldman Sachs)"
    speaker_line_pattern = re.compile(r"^([A-Z][a-z]+(?: [A-Z][a-z]+)+(?: \([^)]+\))?)\s+(.*)?$")

    for line in lines:
        line = line.strip()
        if not line:
            continue

        match = speaker_line_pattern.match(line)
        if match:
            # Save the current speaker block before starting a new one
            if current_speaker and current_text:
                speaker_blocks.append((current_speaker, " ".join(current_text).strip()))
            current_speaker = match.group(1)
            first_line = match.group(2) or ""
            current_text = [first_line.strip()]
        else:
            if current_text is not None:
                current_text.append(line)

    # Append the last speaker block
    if current_speaker and current_text:
        speaker_blocks.append((current_speaker, " ".join(current_text).strip()))

    return speaker_blocks


In [None]:
import re

def parse_speaker_blocks(text):
    # Normalize whitespace
    text = re.sub(r"\n+", "\n", text)

    # Define a regex to match speaker labels (e.g., "Christian Sewing", "James von Moltke")
    speaker_pattern = re.compile(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b\s*:")

    # Split text on speaker labels
    speaker_blocks = []
    current_speaker = None
    current_text = []

    for line in text.split("\n"):
        line = line.strip()
        if not line:
            continue

        match = speaker_pattern.match(line)
        if match:
            # Store previous block
            if current_speaker and current_text:
                speaker_blocks.append((current_speaker, " ".join(current_text)))
            current_speaker = match.group(1)
            current_text = [line[len(match.group(0)):].strip()]
        else:
            if current_speaker:
                current_text.append(line)

    # Append last speaker block
    if current_speaker and current_text:
        speaker_blocks.append((current_speaker, " ".join(current_text)))

    return speaker_blocks


In [None]:
# 7.2 First pass: collect institution map from known mentions
def build_institution_map(blocks):
    institution_map = {}
    for speaker, text in blocks:
        match = re.search(r"\(([^)]+)\)", text)
        if match:
            institution_map[speaker] = match.group(1)
    return institution_map



In [None]:
def build_speaker_df(blocks, institution_map, year, quarter, filename):
    rows = []
    question_counter = 1

    for speaker, text in blocks:
        match = re.search(r"\(([^)]+)\)", text)
        if match:
            institution = match.group(1)
            text_clean = re.sub(r"\([^)]+\)", "", text).strip()
            institution_map[speaker] = institution
        else:
            institution = institution_map.get(speaker)
            if not institution and speaker in deutsche_bank_speakers:
                institution = "Deutsche Bank"
            text_clean = text.strip()

        has_question = "?" in text_clean
        question_number = question_counter if has_question else None
        if has_question:
            question_counter += 1

        rows.append({
            "File": filename,
            "Bank Name": bank_name,
            "Year": year,
            "Quarter": quarter,
            "Speaker name": speaker,
            "Institution": institution,
            "Question Number": question_number,
            "Text": text_clean,
            "flag_question": int(has_question)
        })

    df = pd.DataFrame(rows)
    df["Question Number"] = df["Question Number"].ffill().astype("Int64")  # fill down
    return df


In [None]:
# === MAIN LOOP ===
all_files = glob("Q*-Analyst-Call-Transcript*.pdf")
all_present_dfs = []

all_qa_dfs = []

for pdf_path in sorted(all_files, reverse=True):  # optional: newest first
    year, quarter = extract_year_quarter(pdf_path)
    if not year:
        print(f"Skipping {pdf_path} — quarter/year not found.")
        continue

    print(f"Processing {pdf_path} ({quarter} {year})")
    try:
        text = extract_text(pdf_path)
        presentation_text = extract_presentation_section(text)
        slides = extract_slides_with_titles(presentation_text)
        df = build_slide_df(slides, year, quarter, os.path.basename(pdf_path))
        all_present_dfs.append(df)

        qa_text = extract_qa_section(text)
        speaker_blocks = parse_speaker_blocks(qa_text)
        institution_map = build_institution_map(speaker_blocks)
        df_qa = build_speaker_df(speaker_blocks, institution_map, year, quarter, os.path.basename(pdf_path))
        all_qa_dfs.append(df_qa)
    except Exception as e:
        print(f"Failed to process {pdf_path}: {e}")

# Combine and export
if all_present_dfs:
    final_df = pd.concat(all_present_dfs, ignore_index=True)
    final_df.to_excel(output_excel_presentations, index=False)
    json_output_presentations = output_excel_presentations.replace(".xlsx", ".json")
    final_df.to_json(json_output_presentations, orient="records", indent=2, force_ascii=False)

    print(f"\n Saved to {output_excel_presentations} with {len(final_df)} slides.")
else:
    print(" No slides extracted.")

if all_qa_dfs:
    final_df_qa = pd.concat(all_qa_dfs, ignore_index=True)
    final_df_qa.to_excel(output_excel_qa, index=False)
    print(f"\n Saved to {output_excel_qa} with {len(final_df_qa)} slides.")
    json_output_qa = output_excel_qa.replace(".xlsx", ".json")
    final_df_qa.to_json(json_output_qa, orient="records", indent=2, force_ascii=False)
else:
    print(" No QA extracted.")



Processing Q4-2024-Analyst-Call-Transcript.pdf (4 2024)
Failed to process Q4-2024-Analyst-Call-Transcript.pdf: 'Question Number'
Processing Q4-2023-Analyst-Call-Transcript.pdf (4 2023)
Failed to process Q4-2023-Analyst-Call-Transcript.pdf: 'Question Number'
Processing Q3-2024-Analyst-Call-Transcript.pdf (3 2024)
Failed to process Q3-2024-Analyst-Call-Transcript.pdf: 'Question Number'
Processing Q3-2023-Analyst-Call-Transcript.pdf (3 2023)
Failed to process Q3-2023-Analyst-Call-Transcript.pdf: 'Question Number'
Processing Q2-2024-Analyst-Call-Transcript.pdf (2 2024)
Failed to process Q2-2024-Analyst-Call-Transcript.pdf: 'Question Number'
Processing Q2-2023-Analyst-Call-Transcript.pdf (2 2023)
Failed to process Q2-2023-Analyst-Call-Transcript.pdf: 'Question Number'
Processing Q1-2025-Analyst-Call-Transcript.pdf (1 2025)
Failed to process Q1-2025-Analyst-Call-Transcript.pdf: 'Question Number'
Processing Q1-2024-Analyst-Call-Transcript.pdf (1 2024)
Failed to process Q1-2024-Analyst-Call-Tr

## FINBERT

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import spacy
import re

In [None]:
# Load FinBERT for sentiment analysis
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

Device set to use cpu


In [None]:
# Load spaCy for topic extraction
import nltk
nltk.download('punkt')
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Helper: Extract top noun chunks or named entities
def extract_topics(text):
    doc = nlp(text)
    topics = set()

    for chunk in doc.noun_chunks:
        if len(chunk.text) > 3:
            topics.add(chunk.text.strip())

    for ent in doc.ents:
        if ent.label_ in {"ORG", "MONEY", "GPE", "PRODUCT", "EVENT"}:
            topics.add(ent.text.strip())

    return ", ".join(topics)

In [None]:
import re

removal_phrases = ["Q&A", "thank you", "thanks", "good morning", "hello",
        "hi", "good afternoon", "good evening","i appreciate it",
        "thank you very much", "thank you for taking my question",
        "cheers","many thanks","which", "sorry"]

def clean_text(text):
    text = text.replace("\n", " ").strip()

    # Remove predefined phrases (case-insensitive)
    for phrase in removal_phrases:
        text = re.sub(rf"\b{re.escape(phrase)}\b[,.\s]*", "", text, flags=re.IGNORECASE)

    # Remove phrases like "Thank you," "Thanks," "Good morning," at the start
    text = re.sub(r"^(thank you|thanks|good morning|hello|hi)[,.\s]*", "", text, flags=re.IGNORECASE)

    # Remove question-related filler phrases
    text = re.sub(r"\b(the question|first question|second question|two questions|a quick question|just one question)\b[,.\s]*", "", text, flags=re.IGNORECASE)

    # Remove name after greeting (e.g., "James von Moltke,")
    text = re.sub(r"^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*[,.\s]+", "", text)

    return text.strip()

In [None]:
def add_sentiment_and_topic(df):
    sentiments = []
    topics = []
    scores = []

    for text in df["Text"]:
        cleaned = clean_text(text)
        if len(cleaned.split()) < 10:
            sentiments.append(None)
            topics.append(None)
            scores.append(None)
        else:
            result = sentiment_pipeline(cleaned[:512])[0]
            sentiments.append(result["label"].lower())
            topics.append(extract_topics(cleaned))
            scores.append(round(result["score"], 4))

    # Add to DataFrame
    df["Sentiment"] = sentiments
    df["Score"] = scores
    df["Topics"] = topics
    return df


In [None]:
add_sentiment_and_topic(final_df_qa)

Unnamed: 0,File,Bank Name,Year,Quarter,Speaker name,Institution,Question Number,Text,flag_question,Sentiment,Score,Topics
0,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Nicolas Payen,Kepler Cheuvreux,1,"Good morning. I have two questions, please. Th...",1,neutral,0.9994,"the year, a share buyback annual growth, € 750..."
1,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Christian Sewing,EU,2,"Thank you, Nicolas, it's Christian. Let me let...",1,positive,1.0000,"US, a chance, those years, the discussions, ou..."
2,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Anke Reingen,RBC,3,Thank you very much for taking my question. I ...,1,neutral,0.9983,"FX, technology, the divisional level, a great ..."
3,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Christian Sewing,EU,3,"Anke, I 100% support what James is saying. Jus...",0,neutral,0.9887,"our prepared remarks, that, Rebecca, the outco..."
4,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Kian Abouhoussein,JP Morgan,4,Thanks for taking my questions. I just wanted ...,1,neutral,0.9986,"the expense side, the opportunity, FX, risk-we..."
...,...,...,...,...,...,...,...,...,...,...,...,...
229,Q1-2023-Analyst-Call-Transcript.pdf,Deutsche Bank,2023,1,Andrew Lim,Societe Generale,17,Hi. Thanks for taking my question. I just have...,1,neutral,0.9945,"Group NII, the coming quarters, a 40 to 60 bas..."
230,Q1-2023-Analyst-Call-Transcript.pdf,Deutsche Bank,2023,1,Andrew Lim,Societe Generale,17,"It's the total Group deposit base, but I can c...",0,neutral,1.0000,"that, the total Group deposit base, that ratio..."
231,Q1-2023-Analyst-Call-Transcript.pdf,Deutsche Bank,2023,1,Andrew Lim,Societe Generale,17,"Great, thanks. James von Moltke Thanks, Andrew.",0,,,
232,Q1-2023-Analyst-Call-Transcript.pdf,Deutsche Bank,2023,1,Andrew Lim,Societe Generale,17,"Sorry, lastly, on the impacts due to internal ...",0,neutral,0.9950,"LGDs, the Investment Bank, either the Investme..."


In [None]:
add_sentiment_and_topic(final_df)

Unnamed: 0,File,Bank Name,Year,Quarter,Slide_title,Text,Sentiment,Score,Topics
0,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Slide 2 – Actions taken in 2024 position Deuts...,"target in 2025 and beyond\n- Thank you Ioana, ...",negative,0.9975,"2 billion euros year, capital returns, a total..."
1,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Slide 3 – Resilient full-year results reflecti...,performance\n- We increased 2024 pre-provision...,positive,0.9999,"our efficiency program, FX, the Postbank takeo..."
2,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Slide 4 – Clear traction across divisions set ...,higher profitability\n- At our investor day in...,positive,1.0000,"March, the commercial focus, substantially bet..."
3,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Slide 5 – Strong execution and positioning und...,"trajectory\n- Since 2021, we have delivered a ...",positive,1.0000,"US, these trends, FX, this substantial growth,..."
4,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Slide 6 – Significantly lower expenses in 2025...,"execution of efficiency measures\n- In 2025, o...",positive,0.8964,our initially-planned mandatory and strategic ...
...,...,...,...,...,...,...,...,...,...
161,Q1-2023-Analyst-Call-Transcript.pdf,Deutsche Bank,2023,1,Slide 20 – Investment Bank,- Revenues for the first quarter were 19% lowe...,positive,1.0000,"improvements, A slight increase, March - Rates..."
162,Q1-2023-Analyst-Call-Transcript.pdf,Deutsche Bank,2023,1,Slide 21 – Private Bank,- Private Bank revenues were 2.4 billion euros...,positive,1.0000,"Wealth Management, inflation impacts, the sale..."
163,Q1-2023-Analyst-Call-Transcript.pdf,Deutsche Bank,2023,1,Slide 22 – Asset Management,- Let me continue with Asset Management on sli...,negative,0.9996,"115 million euros, Passive, management fees, 1..."
164,Q1-2023-Analyst-Call-Transcript.pdf,Deutsche Bank,2023,1,Slide 23 – Corporate & Other,- A reminder that Corporate & Other now includ...,positive,1.0000,"a significant improvement, timing, bank levies..."


In [None]:
# Define financial metrics with possible aliases
FINANCIAL_METRICS = {
    "revenue": ["revenue", "revenue target"],
    "expense": ["expense"],
    "income": ["income", "net income", "operating income", "earnings"],
    "margin": ["margin"],
    "profit": ["profit", "net profit"],
    "loss": ["loss", "LGD", "loss given default"],
    "ebitda": ["ebitda"],
    "ebit": ["ebit"],
    "share buyback": ["share buyback"],
    "distribution": ["distribution"],
    "capital ratio": ["capital ratio", "tier 1 ratio", "capital", "CET1", "CET2"],
    "expenses": ["expenses", "opex"],
    "capex": ["capex"],
    "ROE": ["ROE", "return on equity", "return on tangible equity"],
    "cost-income ratio": ["cost-income ratio", "cost/income ratio"],
    "dividend": ["dividend"],
    "aum": ["assets under management", "aum"],
    "net interest income": ["NII", "net interest income","interest income"],
    "market share": ["market share"],
    "cash flow": ["cash flow"],
    "net margin": ["net margin"],
    "EPS": ["EPS", "earnings per share"],
    "RWA": ["RWA", "risk-weighted assets", "risk weighted assets"]
}

# Build a reverse mapping from alias to canonical metric name
alias_to_canonical = {}
all_metric_aliases = []
for canonical, aliases in FINANCIAL_METRICS.items():
    for alias in aliases:
        alias_lower = alias.lower()
        alias_to_canonical[alias_lower] = canonical
        all_metric_aliases.append(alias_lower)

In [None]:
def build_metric_pattern(aliases):
    # Create regex pattern that matches any of the aliases as whole words (with spaces accounted for)
    return "|".join(
        [r"\b" + re.sub(r"\s+", r"\\s+", re.escape(alias)) + r"\b" for alias in aliases]
    )


In [None]:
def extract_financial_metrics(text):
    text = text.replace("\n", " ").strip()
    results = []

    metric_pattern = build_metric_pattern(all_metric_aliases)
    value_pattern = r"(€\s?\d+(?:\.\d+)?\s*(?:million|billion|thousand|bn|m|k)?)"
    percent_pattern = r"(\d{1,3}(?:\.\d+)?\s*%)"

    # Pattern 1: currency value followed by metric
    value_first = re.findall(
        rf"{value_pattern}\s+(?:of\s+)?({metric_pattern})",
        text,
        flags=re.IGNORECASE
    )

    # Pattern 2: metric followed by currency value (up to 50 chars away)
    metric_first = re.findall(
        rf"({metric_pattern})[^\.]{{0,50}}?{value_pattern}",
        text,
        flags=re.IGNORECASE
    )

    # Pattern 3: metric followed by percentage (up to 50 chars away)
    percent_matches = re.findall(
        rf"({metric_pattern})[^\.]{{0,50}}?{percent_pattern}",
        text,
        flags=re.IGNORECASE
    )

    # Normalize and collect results
    for value, metric in value_first:
        canonical = alias_to_canonical.get(metric.lower().strip(), metric.lower().strip())
        results.append({'metric': canonical, 'value': value.strip()})

    for metric, value in metric_first:
        canonical = alias_to_canonical.get(metric.lower().strip(), metric.lower().strip())
        results.append({'metric': canonical, 'value': value.strip()})

    for metric, value in percent_matches:
        canonical = alias_to_canonical.get(metric.lower().strip(), metric.lower().strip())
        results.append({'metric': canonical, 'value': value.strip()})

    return results


In [None]:
def extract_to_columns(row):
    metrics = extract_financial_metrics(row["Text"])
    # Return a Series of dictionaries or empty strings
    return pd.Series({f"metric{i+1}": metric for i, metric in enumerate(metrics)})

In [None]:
# Apply extraction and expand into new columns
metrics_df_qa = final_df_qa.apply(extract_to_columns, axis=1)

In [None]:
# Concatenate with the original DataFrame
cols_to_fill_str = metrics_df_qa.columns.tolist()
final_df_qa_expanded = pd.concat([final_df_qa, metrics_df_qa], axis=1)
final_df_qa_expanded[cols_to_fill_str] = final_df_qa_expanded[cols_to_fill_str].fillna("")

In [None]:
final_df_qa_expanded.head()

Unnamed: 0,File,Bank Name,Year,Quarter,Speaker name,Institution,Question Number,Text,flag_question,Sentiment,Score,Topics,metric1,metric2,metric3
0,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Nicolas Payen,Kepler Cheuvreux,1,"Good morning. I have two questions, please. Th...",1,neutral,0.9994,"the year, a share buyback annual growth, € 750...","{'metric': 'revenue', 'value': '€ 32 billion'}",,
1,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Christian Sewing,EU,2,"Thank you, Nicolas, it's Christian. Let me let...",1,positive,1.0,"US, a chance, those years, the discussions, ou...","{'metric': 'net interest income', 'value': '€ ...","{'metric': 'distribution', 'value': '€ 2.1 bil...",
2,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Anke Reingen,RBC,3,Thank you very much for taking my question. I ...,1,neutral,0.9983,"FX, technology, the divisional level, a great ...","{'metric': 'capital ratio', 'value': '62.5%'}",,
3,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Christian Sewing,EU,3,"Anke, I 100% support what James is saying. Jus...",0,neutral,0.9887,"our prepared remarks, that, Rebecca, the outco...",,,
4,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Kian Abouhoussein,JP Morgan,4,Thanks for taking my questions. I just wanted ...,1,neutral,0.9986,"the expense side, the opportunity, FX, risk-we...","{'metric': 'revenue', 'value': '€ 32 billion'}","{'metric': 'expense', 'value': '€ 20.8 billion'}",


In [None]:
output_excel_qa_sentiment = "qa_speaker_blocks_with_sentiments.xlsx"
final_df_qa_expanded.to_excel(output_excel_qa_sentiment, index=False)

In [None]:
# Apply extraction and expand into new columns for presentation part
metrics_df = final_df.apply(extract_to_columns, axis=1)

# Concatenate with the original DataFrame
cols_to_fill_str = metrics_df.columns.tolist()
final_df_expanded = pd.concat([final_df, metrics_df], axis=1)
final_df_expanded[cols_to_fill_str] = final_df_expanded[cols_to_fill_str].fillna("")

In [None]:
final_df_expanded.head()

Unnamed: 0,File,Bank Name,Year,Quarter,Slide_title,Text,Sentiment,Score,Topics,metric1,metric2,metric3,metric4,metric5
0,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Slide 2 – Actions taken in 2024 position Deuts...,"target in 2025 and beyond\n- Thank you Ioana, ...",negative,0.9975,"2 billion euros year, capital returns, a total...","{'metric': 'profit', 'value': '19%'}","{'metric': 'income', 'value': '65%'}","{'metric': 'capital ratio', 'value': '13.8%'}",,
1,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Slide 3 – Resilient full-year results reflecti...,performance\n- We increased 2024 pre-provision...,positive,0.9999,"our efficiency program, FX, the Postbank takeo...","{'metric': 'profit', 'value': '19%'}","{'metric': 'revenue', 'value': '4%'}","{'metric': 'income', 'value': '13%'}",,
2,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Slide 4 – Clear traction across divisions set ...,higher profitability\n- At our investor day in...,positive,1.0,"March, the commercial focus, substantially bet...","{'metric': 'revenue', 'value': '9%'}","{'metric': 'revenue', 'value': '5%'}",,,
3,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Slide 5 – Strong execution and positioning und...,"trajectory\n- Since 2021, we have delivered a ...",positive,1.0,"US, these trends, FX, this substantial growth,...","{'metric': 'revenue', 'value': '5.9%'}","{'metric': 'revenue', 'value': '4%'}",,,
4,Q4-2024-Analyst-Call-Transcript.pdf,Deutsche Bank,2024,4,Slide 6 – Significantly lower expenses in 2025...,"execution of efficiency measures\n- In 2025, o...",positive,0.8964,our initially-planned mandatory and strategic ...,"{'metric': 'income', 'value': '65%'}",,,,


In [None]:
output_excel_presentations_sentiment = "presentation_slides_with_sentiment.xlsx"
final_df_expanded.to_excel(output_excel_presentations_sentiment, index=False)

# 🏛️ UniCredit - Data Extraction
This section deals with UniCredit-specific data extraction tasks.


## Import PDFs


In [None]:
drive.mount('/content/drive')
!ls '/content/drive/My Drive/BoE'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Codes				'Presentation Documents'
 Data				 Project_Plan
'Ideas and Research Documents'	'Russell Hunter Notebooks'
 Meetings			 Video_presentation
'Other Documents'


In [None]:
rec = '/content/drive/My Drive/BoE/Data/Unicredit/recordings/'

transcript_folder = '/content/drive/My Drive/BoE/Data/Unicredit/transcripts/'

output_folder = '/content/drive/My Drive/BoE/Data/Unicredit/extracted_outputs/'

os.makedirs(output_folder, exist_ok=True)

In [None]:
# q123 = os.path.join(rec, '1Q23-UniCredit-Conference-Call.mp3')
# q223 = os.path.join(rec, '2Q23-UniCredit-Conference-Call.mp3')
# q323 = os.path.join(rec, '3Q23-UniCredit-Conference-Call.mp3')
# q423 = os.path.join(rec, '4Q23-UniCredit-Conference-Call.mp3')

# q124 = os.path.join(rec, '1Q24-UniCredit-Conference-Call.mp3')
# q224 = os.path.join(rec, '2Q24-UniCredit-Conference-Call.mp4')
# q324 = os.path.join(rec, '3Q24-UniCredit-Conference-Call.mp3')
# q424 = os.path.join(rec, '4Q24-UniCredit-Conference-Call.mp4')


## Extract info

bert

In [None]:
# model_name = "yiyanghkust/finbert-tone"
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForSequenceClassification.from_pretrained(model_name)
# finbert_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


# def add_finbert_sentiment(df, text_column="text"):
#     sentiments = []
#     scores = []
#     for text in df[text_column].fillna("").tolist():
#         try:
#             result = finbert_pipeline(text[:512])[0]  # Truncate if needed
#             sentiments.append(result["label"].upper())
#             scores.append(round(result["score"], 3))  # Confidence score
#         except Exception:
#             sentiments.append("ERROR")
#             scores.append(0.0)
#     df["finbert_sentiment"] = sentiments
#     df["finbert_confidence"] = scores
#     return df

both Finbert and Roberta

In [None]:
# Load FinBERT
finbert_model_name = "yiyanghkust/finbert-tone"
finbert_tokenizer = BertTokenizer.from_pretrained(finbert_model_name)
finbert_model = BertForSequenceClassification.from_pretrained(finbert_model_name)
finbert_pipeline = pipeline("sentiment-analysis", model=finbert_model, tokenizer=finbert_tokenizer)

# Load RoBERTa
roberta_model_name = "cardiffnlp/twitter-roberta-base-sentiment"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)
roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta_model_name)
roberta_pipeline = pipeline("sentiment-analysis", model=roberta_model, tokenizer=roberta_tokenizer)

# Combined function
def add_dual_sentiment_models(df, text_column="text"):
    finbert_sentiments = []
    finbert_scores = []
    roberta_sentiments = []
    roberta_scores = []

    for text in df[text_column].fillna("").tolist():
        try:
            fin_result = finbert_pipeline(text[:512])[0]
            finbert_sentiments.append(fin_result["label"].upper())
            finbert_scores.append(round(fin_result["score"], 3))
        except Exception:
            finbert_sentiments.append("ERROR")
            finbert_scores.append(0.0)

        # try:
        #     rob_result = roberta_pipeline(text[:512])[0]
        #     roberta_sentiments.append(rob_result["label"].upper())
        #     roberta_scores.append(round(rob_result["score"], 3))
        # except Exception:
        #     roberta_sentiments.append("ERROR")
        #     roberta_scores.append(0.0)

    df["finbert_sentiment"] = finbert_sentiments
    df["finbert_confidence"] = finbert_scores
    # df["roberta_sentiment"] = roberta_sentiments
    # df["roberta_confidence"] = roberta_scores

    return df

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Device set to use cpu


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Device set to use cpu


In [None]:


# Use lightweight spaCy
nlp = spacy.blank("en")

KEY_METRICS = {
    "EPS": ["eps", "earnings per share"],
    "NII": ["nii", "net interest income"],
    "ROE": ["roe", "return on equity"],
    "ROA": ["roa", "return on assets"],
    "NIM": ["nim", "net interest margin"],
    "LIQUIDITY": ["liquidity", "lcr", "liquidity coverage ratio", "wholesale funding"],
    "CAPITAL": ["capital", "cet1", "tcr", "total capital ratio"],
    "PROFITABILITY": ["profitability"],
    "PROVISIONS": ["provisions", "pll", "loan loss", "provision for losses"],
    "NET INCOME": ["net income"],
    "ASSET QUALITY": ["asset quality", "npl", "non performing loan"]
}

# Words that imply performance movement or quantification
QUALIFIERS = [
    "increase", "increased", "decrease", "decreased", "decline", "declined",
    "improve", "improved", "growth", "grew", "dropped", "fell", "stable",
    "stabilized", "unchanged", "flat", "rose", "risen", "higher", "lower",
    "million", "billion", "%"
]


# === Combined Metric + % Value Extraction Function ===
def extract_structured_financials(pdf_path):
    results = []

    fname = os.path.basename(pdf_path)
    match = re.search(r"Q(\d)[_\s-]?(\d{4})", fname)
    quarter, year = (int(match.group(1)), int(match.group(2))) if match else (None, None)

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue

            lines = text.split('\n')
            for i, line in enumerate(lines):
                line_clean = line.strip()
                line_lower = f" {line_clean.lower()} "
                words_set = set(re.sub(r"[^\w\s]", "", line_lower).split())

                # === 1. Extract % values with sentence context ===
                percent_matches = re.findall(r"\d+[.,]?\d*\s?%", line)
                if percent_matches:
                    before = lines[i - 1] if i > 0 else ""
                    after = lines[i + 1] if i + 1 < len(lines) else ""
                    context = f"{before.strip()} {line_clean} {after.strip()}"
                    for match in percent_matches:
                        results.append({
                            "quarter": quarter,
                            "year": year,
                            "metric": "PERCENT VALUE",
                            "value": match.strip(),
                            "text": context.strip(),
                            "source_file": fname
                        })

                # === 2. Extract metric-related values with number or qualifier ===
                for metric_group, aliases in KEY_METRICS.items():
                    if any(f" {alias} " in line_lower for alias in aliases):  # enforce word boundary
                        has_number = bool(re.search(r"\d+[.,]?\d*", line))
                        has_qualifier = any(q in line_lower for q in QUALIFIERS)

                        if not has_number and not has_qualifier:
                            continue  # skip vague mentions

                        match = re.search(r"(\d+[.,]?\d*)\s?(million|billion|%)?", line, re.IGNORECASE)
                        value = match.group(0).strip() if match else None

                        results.append({
                            "quarter": quarter,
                            "year": year,
                            "metric": metric_group,
                            "value": value,
                            "text": line_clean,
                            "source_file": fname
                        })

    df = pd.DataFrame(results)
    if not df.empty:
        # df = add_finbert_sentiment(df)
        df = add_dual_sentiment_models(df)

    return df


In [None]:

nlp = spacy.load("en_core_web_sm")

# Add known banks — extend as needed
org_patterns = [
    {"label": "ORG", "pattern": "Bank of America"},
    {"label": "ORG", "pattern": "Goldman Sachs"},
    {"label": "ORG", "pattern": "JPMorgan"},
    {"label": "ORG", "pattern": "Morgan Stanley"},
    {"label": "ORG", "pattern": "UniCredit"},
    {"label": "ORG", "pattern": "Deutsche Bank"},
    {"label": "ORG", "pattern": "HSBC"},
    {"label": "ORG", "pattern": "UBS"},
    {"label": "ORG", "pattern": "Citi"},
    {"label": "ORG", "pattern": "BNP Paribas"},
    {"label": "ORG", "pattern": "ING"},
    {"label": "ORG", "pattern": "Credit Suisse"},
    {"label": "ORG", "pattern": "Barclays"},
    {"label": "ORG", "pattern": "Mediobanca"},
    {"label": "ORG", "pattern": "Credit Agricole"},
    {"label": "ORG", "pattern": "Autonomous Research"},
    {"label": "ORG", "pattern": "BNP"},
    {"label": "ORG", "pattern": "Citi"},
    {"label": "ORG", "pattern": "UBS"},
    {"label": "ORG", "pattern": "Jefferies"},
    {"label": "ORG", "pattern": "KBW"},
    {"label": "ORG", "pattern": "Bank of America"},
    {"label": "ORG", "pattern": "Morgan Stanley"},
]



def extract_person_and_org_from_operator(text):
    doc = nlp(text)
    person = None
    org = None

    # Try spaCy NER first
    for ent in doc.ents:
        if ent.label_ == "PERSON" and not person:
            person = ent.text
        elif ent.label_ == "ORG" and not org:
            org = ent.text

    # Fallback regex: e.g. "from John Smith of Goldman Sachs"
    if not person or not org:
        match = re.search(r"from\s+(.*?)\s+of\s+([A-Za-z\s&]+)", text, re.IGNORECASE)
        if match:
            if not person:
                person = match.group(1).strip()
            if not org:
                org = match.group(2).strip()

    return person, org

In [None]:
def extract_qna_long_format(pdf_path):
    MANAGEMENT_NAMES = ["Andrea Orcel", "Stefano Porro"]

    def is_speaker_line(line):
        return bool(re.match(r"^[A-Z][a-z]+(\s[A-Z][a-z]+)?$", line.strip())) or line.strip() in MANAGEMENT_NAMES or line.strip() == "Operator"

    fname = os.path.basename(pdf_path)
    match = re.search(r"Q(\d)[_\s-]?(\d{4})", fname)
    quarter, year = (int(match.group(1)), int(match.group(2))) if match else (None, None)

    with pdfplumber.open(pdf_path) as pdf:
        full_text = "\n".join(p.extract_text() for p in pdf.pages if p.extract_text())

    if "Question-and-Answer Session" not in full_text:
        return pd.DataFrame()

    qa_text = full_text.split("Question-and-Answer Session", 1)[1]
    lines = qa_text.split("\n")

    blocks = []
    current_speaker = None
    current_block = []

    for line in lines:
        line = line.strip()
        if is_speaker_line(line):
            if current_speaker and current_block:
                blocks.append({"speaker": current_speaker, "text": " ".join(current_block).strip()})
            current_speaker = line
            current_block = []
        else:
            current_block.append(line)

    if current_speaker and current_block:
        blocks.append({"speaker": current_speaker, "text": " ".join(current_block).strip()})

    entries = []
    speaker_to_org = {}
    current_question = None
    question_number = 0

    for i in range(len(blocks)):
        block = blocks[i]
        speaker = block["speaker"]

        if speaker == "Operator":
            # Try to extract person/org from the operator’s introduction
            person, org = extract_person_and_org_from_operator(block["text"])
            if person and org:
                speaker_to_org[person] = org

            # Look ahead to find the actual question
            if i + 1 < len(blocks):
                next_block = blocks[i + 1]
                if next_block["speaker"] not in MANAGEMENT_NAMES and next_block["speaker"] != "Operator":
                    question_number += 1
                    current_question = next_block["speaker"]
                    entries.append({
                        "quarter": quarter,
                        "year": year,
                        "question_number": question_number,
                        "speaker": current_question,
                        "institution": speaker_to_org.get(current_question),
                        "type": "Question",
                        "text": next_block["text"],
                        "source_file": fname
                    })

        elif speaker in MANAGEMENT_NAMES and current_question is not None:
            entries.append({
                "quarter": quarter,
                "year": year,
                "question_number": question_number,
                "speaker": speaker,
                "institution": "UniCredit",  # known from context
                "type": "Answer",
                "text": block["text"],
                "source_file": fname
            })

    return pd.DataFrame(entries)

Loop

In [None]:
all_financial_dfs = []
all_qna_long_dfs = []

pdf_files = [f for f in os.listdir(transcript_folder) if f.lower().endswith(".pdf")]

for filename in tqdm(pdf_files, desc="Processing transcripts", unit="file"):
    full_path = os.path.join(transcript_folder, filename)

    try:
        # === Financial Extraction ===
        df_fin = extract_structured_financials(full_path)
        if not df_fin.empty:
            all_financial_dfs.append(df_fin)

        # === Q&A Extraction + Sentiment ===
        df_qna = extract_qna_long_format(full_path)
        if not df_qna.empty:
            # df_qna = add_finbert_sentiment(df_qna)
            df_qna = add_dual_sentiment_models(df_qna)

            all_qna_long_dfs.append(df_qna)

    except Exception as e:
        print(f"❌ Error processing {filename}: {e}")

# === MERGE AND SAVE FINANCIAL DATA ===
if all_financial_dfs:
    df_fin_merged = pd.concat(all_financial_dfs, ignore_index=True)
    fin_excel = os.path.join(output_folder, "merged_financial_sentences_finbertonly.xlsx")
    fin_pickle = os.path.join(output_folder, "merged_financial_sentences_finbertonly.pkl")
    df_fin_merged.to_excel(fin_excel, index=False)
    df_fin_merged.to_pickle(fin_pickle)
    print(f"✅ Financial data saved to:\n- {fin_excel}\n- {fin_pickle}")
else:
    print("⚠️ No financial sentences were extracted.")

# === MERGE AND SAVE Q&A DATA ===
if all_qna_long_dfs:
    df_qna_merged = pd.concat(all_qna_long_dfs, ignore_index=True)
    qna_excel = os.path.join(output_folder, "merged_qna_long_finbertonly.xlsx")
    qna_pickle = os.path.join(output_folder, "merged_qna_long_finbertonly.pkl")
    df_qna_merged.to_excel(qna_excel, index=False)
    df_qna_merged.to_pickle(qna_pickle)
    print(f"Q&A data saved to:\n- {qna_excel}\n- {qna_pickle}")
else:
    print("No Q&A data extracted.")


Processing transcripts:   0%|          | 0/9 [00:00<?, ?file/s][A
Processing transcripts:  11%|█         | 1/9 [01:11<09:33, 71.73s/file][A
Processing transcripts:  22%|██▏       | 2/9 [02:31<08:53, 76.22s/file][A
Processing transcripts:  33%|███▎      | 3/9 [04:14<08:50, 88.44s/file][A
Processing transcripts:  44%|████▍     | 4/9 [05:52<07:41, 92.26s/file][A
Processing transcripts:  56%|█████▌    | 5/9 [07:25<06:11, 92.76s/file][A
Processing transcripts:  67%|██████▋   | 6/9 [08:41<04:20, 86.85s/file][A
Processing transcripts:  78%|███████▊  | 7/9 [10:07<02:53, 86.60s/file][A
Processing transcripts:  89%|████████▉ | 8/9 [11:27<01:24, 84.45s/file][A
Processing transcripts: 100%|██████████| 9/9 [12:49<00:00, 85.46s/file]


✅ Financial data saved to:
- /content/drive/My Drive/BoE/Data/Unicredit/extracted_outputs/merged_financial_sentences_finbertonly.xlsx
- /content/drive/My Drive/BoE/Data/Unicredit/extracted_outputs/merged_financial_sentences_finbertonly.pkl
Q&A data saved to:
- /content/drive/My Drive/BoE/Data/Unicredit/extracted_outputs/merged_qna_long_finbertonly.xlsx
- /content/drive/My Drive/BoE/Data/Unicredit/extracted_outputs/merged_qna_long_finbertonly.pkl


update ROBERTA labels

In [None]:
# qna_excel = os.path.join(output_folder, "merged_qna_long.xlsx")
# fin_excel = os.path.join(output_folder, "merged_financial_sentences.xlsx")

# # Label map for RoBERTa
# label_map = {
#     "LABEL_0": "NEGATIVE",
#     "LABEL_1": "NEUTRAL",
#     "LABEL_2": "POSITIVE"
# }

# # === Q&A file update ===
# df_qna = pd.read_excel(qna_excel)
# if "roberta_sentiment" in df_qna.columns:
#     df_qna["roberta_sentiment"] = df_qna["roberta_sentiment"].map(label_map).fillna(df_qna["roberta_sentiment"])
#     df_qna.to_excel(qna_excel, index=False)
#     print("✅ Q&A sentiment labels updated.")

# # === Financial file update ===
# df_fin = pd.read_excel(fin_excel)
# if "roberta_sentiment" in df_fin.columns:
#     df_fin["roberta_sentiment"] = df_fin["roberta_sentiment"].map(label_map).fillna(df_fin["roberta_sentiment"])
#     df_fin.to_excel(fin_excel, index=False)
#     print("✅ Financial sentiment labels updated.")

✅ Q&A sentiment labels updated.
✅ Financial sentiment labels updated.


## Topic extraction installation

In [None]:
drive.mount('/content/drive')
!ls '/content/drive/My Drive/BoE'

output_folder = '/content/drive/My Drive/BoE/Data/Unicredit/extracted_outputs/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Codes				'Presentation Documents'
 Data				 Project_Plan
'Ideas and Research Documents'	'Russell Hunter Notebooks'
 Meetings			 Video_presentation
'Other Documents'


In [None]:
df_fin_merged = pd.read_excel(output_folder + "merged_financial_sentences_finbertonly.xlsx")
df_qna_merged = pd.read_excel(output_folder + "merged_qna_long_finbertonly.xlsx")

import more packages

In [None]:
!pip install keybert sentence-transformers nltk openpyxl

import pandas as pd
import re
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm



## Topic extraction code

In [None]:
# Download required resources
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# Load custom embedding model
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
kw_model = KeyBERT(model=embedding_model)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Function words
function_stopwords = [
    "we", "have", "has", "had", "were", "was", "are", "is", "it's", "its", "being", "been",
    "do", "does", "doing", "did", "can", "may", "might", "come", "comes", "going"
]

qa_filler_phrases = [
    "thanks", "thank you", "appreciate", "question", "questions", "ask", "asking", "follow-up",
    "couple of questions", "good morning", "good afternoon", "hello", "thanks for taking my question",
    "thanks for the call", "thank you for your time", "joining us", "line is open", "line",
    "hand over", "presentation", "prepared remarks", "pick up with team", "management remarks",
    "closing remarks", "just wondering", "can you talk a bit more about", "could you elaborate",
    "as you mentioned", "as you said", "i guess", "i wonder if", "may i ask", "i was going to ask",
    "one more question", "a quick clarification", "sorry to go on", "very helpful", "hope that makes sense", "indiscernible",
    "Andrea", "Stefano", "yes", "discussed", "tkank you", "very much"
]



# Combine all stopwords
lemmatizer = WordNetLemmatizer()
base_stopwords = set(stopwords.words("english"))
domain_stopwords = set(qa_filler_phrases  + function_stopwords)
all_stopwords = base_stopwords.union(domain_stopwords)

In [None]:
def clean_text(text):
    # Lowercase and remove special characters
    text = re.sub(r"[^\w\s]", " ", text.lower())
    words = text.split()
    # Lemmatize and remove stopwords
    clean_words = [
        lemmatizer.lemmatize(w)
        for w in words
        if w not in all_stopwords and len(w) > 2
    ]
    return " ".join(clean_words)

In [None]:
def add_topic_keywords(df, text_column="text", top_n=3):
    cleaned_texts = df[text_column].fillna("").astype(str).apply(clean_text)
    topics = []

    for text in tqdm(cleaned_texts, desc=f"Extracting topics for '{text_column}'", unit="row"):
        try:
            keywords = kw_model.extract_keywords(
                text,
                keyphrase_ngram_range=(1, 2),
                stop_words=None,
                top_n=top_n
            )
            topic_list = [kw[0] for kw in keywords]
            topics.append(", ".join(topic_list))
        except Exception:
            topics.append("ERROR")

    df["topics"] = topics
    return df

In [None]:
df_fin_topic= add_topic_keywords(df_fin_merged, text_column="text")
df_qna_topic = add_topic_keywords(df_qna_merged, text_column="text")


df_qna_topic.to_excel(output_folder + 'qna_topics.xlsx', index=False)
df_qna_topic.to_pickle(output_folder +  'qna_topics.pkl')

df_fin_topic.to_excel(output_folder + 'financial_topics.xlsx', index=False)
df_fin_topic.to_pickle(output_folder + 'financial_topics.pkl')

Extracting topics for 'text': 100%|██████████| 1815/1815 [06:01<00:00,  5.03row/s]
Extracting topics for 'text': 100%|██████████| 205/205 [03:02<00:00,  1.12row/s]
