In [None]:
import json
import os
import re
import shutil
import unicodedata

import gdown
import numpy as np
import pandas as pd
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoProcessor

# Step 1: Data input

In [None]:
# Download the per-bank transcript files from Google Drive and load them as JSON.
# The data files were extracted using an API and are shared as Drive file ids.
file_ids = [
    '1XhLXQV6sQxiHsjhAmdHAp9cKrxY_njeU', #Barclays
    '1fhmMml1irnFwwOWtdE5N_yjn8ZUDjCIX', #Credit Suisse
    '1pcVpQ3SOHFtOuZA34W0akKxdkjh0-WCd', #Deutsche Bank
    '1EiSRaSu0fFj_yYtnBA8GuaoEo26WfAPS', #HSBC
    '1JdxdmL4ND-yVH6HvWcm8MPivBEQghi_7', #JPM
    '1kQj4HZxq0C1BqOL6gaoago7S22zyb8Nh', #Santander
    '10bjoeBGZ9tX9eoaNwOXjEX2_L_lWLxGB' #Unicredit
]

# Short bank keys, in the same order as file_ids; they double as local file names.
file_names = ['barc','cs','db','hsbc','jpm','san','uc']

# zip() silently truncates to the shorter list, so make a mismatch loud.
assert len(file_ids) == len(file_names), "file_ids and file_names must pair up one-to-one"

# Dictionary to store JSON data, keyed by the short bank key
json_data = {}

# Loop to download and load each file
for file_id, name in zip(file_ids, file_names):
    url = f'https://drive.google.com/uc?id={file_id}'
    downloaded_path = gdown.download(url, name, quiet=False)
    # NOTE(review): gdown is expected to return None when the download fails
    # (e.g. quota exceeded / permission denied) - confirm for the installed version.
    if downloaded_path is None:
        raise RuntimeError(f"Download failed for '{name}' ({url})")

    # Read explicitly as UTF-8 so the result does not depend on the platform default encoding.
    with open(name, 'r', encoding='utf-8') as f:
        json_data[name] = json.load(f)




Downloading...
From: https://drive.google.com/uc?id=1XhLXQV6sQxiHsjhAmdHAp9cKrxY_njeU
To: /content/barc
100%|██████████| 1.48M/1.48M [00:00<00:00, 161MB/s]
Downloading...
From: https://drive.google.com/uc?id=1fhmMml1irnFwwOWtdE5N_yjn8ZUDjCIX
To: /content/cs
100%|██████████| 2.27M/2.27M [00:00<00:00, 195MB/s]
Downloading...
From: https://drive.google.com/uc?id=1pcVpQ3SOHFtOuZA34W0akKxdkjh0-WCd
To: /content/db
100%|██████████| 1.51M/1.51M [00:00<00:00, 65.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1EiSRaSu0fFj_yYtnBA8GuaoEo26WfAPS
To: /content/hsbc
100%|██████████| 1.06M/1.06M [00:00<00:00, 107MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JdxdmL4ND-yVH6HvWcm8MPivBEQghi_7
To: /content/jpm
100%|██████████| 1.01M/1.01M [00:00<00:00, 128MB/s]
Downloading...
From: https://drive.google.com/uc?id=1kQj4HZxq0C1BqOL6gaoago7S22zyb8Nh
To: /content/san
100%|██████████| 1.27M/1.27M [00:00<00:00, 132MB/s]
Downloading...
From: https://drive.google.com/uc?id=10bjoeBGZ9tX9eoaN

# Step 2: Data processing

In [None]:
# 2.1. Collect the 'transcript_split' payload of every quarter, per bank.
# Banks missing from json_data yield an empty dict; quarters without a
# 'transcript_split' key are skipped.
tr_split_all = {
    bank_key: {
        quarter: content['transcript_split']
        for quarter, content in json_data.get(bank_key, {}).items()
        if 'transcript_split' in content
    }
    for bank_key in file_names
}


In [None]:
# 2.2 Lookup tables mapping a presenter's name to the position held, one dict per bank.
# They are used further down to fill the 'title' of each transcript entry.

# Deutsche Bank
pos_db = {
    "Silke Szypa": "Deputy Head of Investor Relations",
    "Christian Sewing": "Chief Executive Officer",
    "James von Moltke": "Chief Financial Officer",
    "Ioana Patriniche": "Head of Investor Relations"
}

# HSBC
pos_hsbc = {
    "Georges Elhedery": "Group Chief Executive Officer",
    "Pam Kaur": "Group Chief Financial Officer",
    "Colin Bell": "CEO, HSBC Bank plc & Europe",
    "Greg Guyett": "CEO, Global Banking & Markets",
    "Nuno Matos": "CEO, Wealth & Personal Banking"
}

# JPMorgan
pos_jpm = {
    "Jamie Dimon": "Chairman & CEO",
    "Jeremy Barnum": "Chief Financial Officer",
    "Marianne Lake": "Co‑CEO, Consumer & Community Banking",
    "Jennifer Piepszak": "Chief Financial Officer, Corporate & Investment Bank"
}

# Credit Suisse (both spellings of Korner/Koerner are listed)
pos_cs = {
    "Ulrich Korner": "Group Chief Executive Officer",
    "Ulrich Koerner": "Group Chief Executive Officer",
    "Dixit Joshi": "Group Chief Financial Officer",
    "Thomas Gottstein": "Former Chief Executive Officer",
    "David Mathers": "Former Chief Financial Officer",
    "Axel Lehmann": "Chairman of Credit Suisse"
}

# UBS
pos_ubs = {
    "Sergio P. Ermotti": "Group Chief Executive Officer",  # CEO since April 2023
    "Todd Tuckner": "Group Chief Financial Officer",       # CFO since May 2023
    "Colm Kelleher": "Chairman of the Board",              # Chair since April 2022
    "George Athanasopoulos": "Co‑President Investment Bank",  # Appointed July 2024
    "Aleksandar Ivanovic": "President Asset Management",      # Since March 2024
    "Iqbal Khan": "Co‑President Global Wealth Management & President Asia Pacific",  # Since July & Sept 2024
    "Robert Karofsky": "Co‑President Investment Bank & President UBS Americas",      # Named in 2024
    "Kirt Gardner": "Group Chief Financial Officer (prior to Tuckner)", # CFO pre-2023 restructure
    "Kinner Lakhani": "Chief Financial Officer, Global Wealth Management"
}

# Merge the UBS presenters into the Credit Suisse lookup (in place): there is no
# separate UBS bank key, so UBS names are resolved through pos_cs.
pos_cs.update(pos_ubs)

# UniCredit
pos_uc = {
    "Andrea Orcel": "Chief Executive Officer",
    "Stefano Porro": "Chief Financial Officer",
    # Same person keyed with a trailing space - presumably as the name appears in
    # some raw transcripts; verify whether this variant is still needed.
    "Stefano Porro ": "Chief Financial Officer",
    "Magda Palczynska": "Head of Investor Relations"
}

# Santander (the CEO is listed under both the short and the long form of the name)
pos_san = {
    "Hector Grisi": "Chief Executive Officer",
    "Hector Grisi Checa": "Chief Executive Officer",
    "Jose Garcia": "Chief Financial Officer",
    "Mario Leao": "Chief Executive Officer (Brasil)",
    "Gustavo Alejo": "Chief Financial Officer (Brasil)"
}

# Barclays
pos_barc = {
    "C.S. Venkatakrishnan": "Group Chief Executive",
    "Anna Cross": "Group Finance Director",
    "Mark Mason": "Group Chief Financial Officer"
}


# Process the already existing names
def remove_diacritics(input_str):
    """Strip accents and drop every character that is not a letter, whitespace or '.'.

    The string is NFKD-normalized so accented letters split into a base letter
    plus combining marks; the combining marks are discarded. Of the remaining
    characters only alphabetic ones (in any script), whitespace and the period
    are kept, so digits, hyphens, underscores and other punctuation disappear.

    Args:
        input_str: The text to clean.

    Returns:
        The cleaned string.
    """
    kept_chars = []
    for ch in unicodedata.normalize('NFKD', input_str):
        if unicodedata.combining(ch):
            # accent / diacritic mark split off by the NFKD decomposition
            continue
        if ch.isalpha() or ch.isspace() or ch == '.':
            kept_chars.append(ch)
    return ''.join(kept_chars)

# Ordered (pattern, canonical name) rules used by standardize_name().
# The first pattern that matches wins, so the order below is significant.
# All patterns are searched case-insensitively; most are anchored at the end
# of the string only, which lets arbitrary prefixes through.
_NAME_RULES = [
    (re.compile(pattern, re.IGNORECASE), canonical)
    for pattern, canonical in (
        # Barclays CEO, with or without the "C.S." initials
        (r'(?:C\.?\s?S\.?\s?)?Venkatakrishnan$', 'C.S. Venkatakrishnan'),
        (r'Anna\s?[-_]?Cross$', 'Anna Cross'),
        (r'Silke\s?Szypa$', 'Silke Szypa'),
        (r'Christian\s?Sewing$', 'Christian Sewing'),
        (r'James\s?von\s?Moltke$', 'James von Moltke'),
        (r'Ioana\s?Patriniche$', 'Ioana Patriniche'),
        (r'Georges\s?Elhedery$', 'Georges Elhedery'),
        (r'Pam\s?Kaur$', 'Pam Kaur'),
        (r'Colin\s?Bell$', 'Colin Bell'),
        (r'Greg\s?Guyett$', 'Greg Guyett'),
        (r'Nuno\s?Matos$', 'Nuno Matos'),
        (r'Jamie\s?Dimon$', 'Jamie Dimon'),
        (r'Jeremy\s?Barnum$', 'Jeremy Barnum'),
        (r'Marianne\s?Lake$', 'Marianne Lake'),
        (r'Jennifer\s?Piepszak$', 'Jennifer Piepszak'),
        (r'Ulrich\s?Korner$', 'Ulrich Korner'),
        (r'Dixit\s?Joshi$', 'Dixit Joshi'),
        (r'Thomas\s?Gottstein$', 'Thomas Gottstein'),
        (r'Andrea\s?Orcel$', 'Andrea Orcel'),
        (r'Stefano\s?Porro$', 'Stefano Porro'),
        (r'Magda\s?Palczynska$', 'Magda Palczynska'),
        # Hector Grisi optionally followed by further name parts
        (r'Hector\s?Grisi(\s\w+)*$', 'Hector Grisi'),
        # Jose Garcia followed by any suffix (deliberately not end-anchored)
        (r'Jose\s?Garcia', 'Jose Garcia'),
        (r'Mario\s?Leao$', 'Mario Leao'),
        (r'Gustavo\s?Alejo$', 'Gustavo Alejo'),
        (r'Mark\s?Mason$', 'Mark Mason'),
        (r'Sergio\s?P\.?\s?Ermotti$', 'Sergio P. Ermotti'),
        (r'Todd\s?Tuckner$', 'Todd Tuckner'),
        (r'Colm\s?Kelleher$', 'Colm Kelleher'),
        (r'George\s?Athanasopoulos$', 'George Athanasopoulos'),
        (r'Aleksandar\s?Ivanovic$', 'Aleksandar Ivanovic'),
        (r'Iqbal\s?Khan$', 'Iqbal Khan'),
        (r'Robert\s?Karofsky$', 'Robert Karofsky'),
        (r'Kirt\s?Gardner$', 'Kirt Gardner'),
        (r'Kinner\s?Lakhani$', 'Kinner Lakhani'),
        # Alternative transliteration, mapped onto the same canonical spelling
        (r'Ulrich\s?Koerner$', 'Ulrich Korner'),
        (r'David\s?Mathers$', 'David Mathers'),
        (r'Axel\s?Lehmann$', 'Axel Lehmann'),
        # Catches "Angela Cross" (plain "Anna Cross" is already handled above)
        (r'(A\s*[-–]\s*)?(Angela|Anna)\s?[-_]?Cross$', 'Anna Cross'),
    )
]


def standardize_name(name):
    """Map a raw speaker name onto the canonical spelling used in the position lookups.

    The name is stripped of surrounding whitespace and passed through
    remove_diacritics(); the cleaned value is then checked against the ordered
    rules in _NAME_RULES and the canonical name of the first matching rule is
    returned.

    Args:
        name: Raw speaker name (a string).

    Returns:
        The canonical presenter name when a rule matches, otherwise the cleaned
        (stripped, diacritics-free) input.
    """
    cleaned = remove_diacritics(name.strip())
    for pattern, canonical in _NAME_RULES:
        if pattern.search(cleaned):
            return canonical
    return cleaned


In [None]:
# 2.3. Give every transcript entry in tr_split_all its title, then persist the result.
pos_all = {'db': pos_db, 'hsbc': pos_hsbc, 'jpm': pos_jpm, 'cs': pos_cs, 'uc': pos_uc, 'san': pos_san, 'barc': pos_barc}


def _title_for(speaker, position_lookup):
    """Return the title of `speaker`: known position, 'Operator', or 'Analyst' as fallback."""
    canonical_name = standardize_name(speaker)
    if canonical_name in position_lookup:
        return position_lookup[canonical_name]
    return 'Operator' if speaker.lower() == 'operator' else 'Analyst'


def _with_title(entry, position_lookup):
    """Return a copy of `entry` ordered as speaker, title, <all remaining keys>."""
    speaker = entry.get('speaker', '')
    remaining = {key: value for key, value in entry.items() if key not in ['speaker', 'title']}
    return {'speaker': speaker, 'title': _title_for(speaker, position_lookup), **remaining}


for bank, quarters in tr_split_all.items():
    bank_positions = pos_all.get(bank, {})
    for entries in quarters.values():
        # Replace each entry in place so the list object itself is preserved
        for idx in range(len(entries)):
            entries[idx] = _with_title(entries[idx], bank_positions)

# Save Q&A Transcripts of all banks as JSON
with open('tr_split_all.json', 'w') as f:
    json.dump(tr_split_all, f, indent=2)

print("Done. Transcripts saved.")

Done. Transcripts saved.


In [None]:
# 2.4. Turn the saved JSON of all banks into one flat DataFrame for easier data handling
# Step 2.4.1: Reload the JSON file written in step 2.3
with open("tr_split_all.json", "r") as json_file:
    tr_split_all = json.load(json_file)

# Step 2.4.2: One record per transcript entry.
# The quarter key is split into its first four characters (year) and the rest (quarter).
records = [
    {
        "year": quarter[:4],
        "quarter": quarter[4:],
        "bank": bank,
        "speaker": entry.get("speaker"),
        "title": entry.get("title"),
        "text": entry.get("text"),
    }
    for bank, quarters in tr_split_all.items()
    for quarter, entries in quarters.items()
    for entry in entries
]

# Step 2.4.3: Build the DataFrame with 'bank' as the leading column
column_order = ['bank', 'year', 'quarter', 'speaker', 'title', 'text']
df_all = pd.DataFrame(records)[column_order]

# Step 2.4.4: Export to xlsx
df_all.to_excel("all_banks_transcript_split.xlsx", index=False)


# Step 3: Create the message prompts and store them in the dataframe as dictionaries for the Phi-4 model pipeline

In [None]:
# Step 3.1. Detect questions from analysts that contain a "?"
# Initially, a question is considered valid if the paragraph contains a "?" and it comes from an analyst.
# If it comes from someone other than an analyst, it is treated as part of the answer to the analyst's question.
# Typical questions asked by non-analysts: "Should we take the next question?", "Pam?", etc.; these only lead over to the next analyst's question.
def detect_question(df):
    """Flag rows whose text contains a question mark and whose title mentions 'analyst'.

    Args:
        df: DataFrame with 'text' and 'title' columns; both are cast to str
            before matching, and the title match is case-insensitive.

    Returns:
        An integer Series (1 = analyst question, 0 = anything else) aligned with df.
    """
    has_question_mark = df['text'].astype(str).str.contains(r'\?')
    from_analyst = df['title'].astype(str).str.lower().str.contains('analyst')
    return (has_question_mark & from_analyst).astype(int)


In [None]:
# Step 3.2. Function to detect Q&A section
def _first_true_label(mask):
    """Return the smallest index label at which the boolean Series `mask` is True, else None."""
    hits = mask.index[mask.to_numpy(dtype=bool)]
    return hits.min() if len(hits) else None


def qa_sect_detect(df):
    """Flag the rows that belong to the Q&A section of each call.

    Calls are the groups of rows sharing (year, quarter, bank). For every call:

    1. Start: the first Operator row whose text matches the start phrases; if
       there is none, the first row of any speaker that matches.
       Calls without any match are left unflagged.
    2. End (exclusive): the first Operator row after the start whose text matches
       the closing phrases; otherwise the first such row of any speaker;
       otherwise one past the last row of the call.
    3. Extension: if a non-analyst says "thank you" after the last analyst
       question and that row lies beyond the end found in step 2, the section
       is extended to include that row.

    Args:
        df: DataFrame with columns 'year', 'quarter', 'bank', 'speaker',
            'title', 'text' and 'flag_question'. The index must be unique,
            numeric and increasing in transcript order, because positions are
            compared with ``<`` / ``>`` and the end is computed as ``label + 1``.
            The input is not modified.

    Returns:
        An integer Series named 'flag_qa_sect' aligned with df.index
        (1 = row is inside a Q&A section, 0 = outside).
    """
    # All groups are non-capturing: pandas warns on every str.contains() call
    # whose pattern has capture groups, and the captures are never used here.
    # NOTE(review): because '|' binds loosest, the first line makes the bare
    # words "start", "begin" and "ready" valid start markers on their own
    # (only "open" requires a following q&a/questions phrase). Kept as-is to
    # preserve existing results - confirm this is intended.
    qa_start_pattern = (r'\b(?:'
        r'start|begin|ready|open.*(?:q&a|questions|question[-\s]?and[-\s]?answer)|'
        r'first\s+question.*(?:comes|is|today)|'
        r'(?:our|we\s+already\s+have\s+the)\s+first\s+question.*(?:from|comes|is|today)|'
        r'kick.*off.*questions|'
        r'we(?:\'ll)?\s+start.*(?:questions|q&a)|'
        r'let(?:\'s)?\s+start.*(?:questions|q&a)|'
        r'we(?:\'ll)?\s+now\s+(?:begin|open).*question[-\s]?and[-\s]?answer|'
        r'we(?:\'ll)?\s+take\s+(?:the\s+)?(?:next\s+)?questions?|'
        r'we(?:\'re)?\s+ready\s+to\s+(?:take|start)\s+(?:your\s+)?questions?|'
        r'begin\s+with\s+the\s+first\s+question'
        r')\b')

    qa_end_pattern = (r'\b(?:'
        r'no\s+(?:more|further)\s+questions|'
        r'last\s+question|'
        r'(?:concludes|ends)\s+(?:the\s+)?(?:q&a|question[-\s]?and[-\s]?answer|session|call)|'
        r'end\s+(?:of\s+)?(?:q&a|question[-\s]?and[-\s]?answer|session|call)|'
        r'(?:this\s+)?(?:concludes|ends)\s+(?:today(?:\'s)?\s+)?(?:call|conference(?:\s+call)?|session)|'
        r'closing\s+remarks|'
        r'(?:the\s+)?conference\s+(?:is\s+)?(?:now\s+)?over|'
        r'(?:you\s+may\s+)?disconnect(?:\s+your\s+telephones|)?|'
        r'thank\s+(?:you|everyone|everybody|all)\s+(?:again\s+)?for\s+(?:joining|participating|being\s+with\s+us)|'
        r'thank\s+you\s+for\s+joining\s+(?:today(?:\'s)?\s+)?call|'
        r'a\s+recording\s+of\s+the\s+(?:presentation|call)\s+will\s+be\s+available|'
        r'(?:you\s+may\s+)?(?:now\s+)?disconnect(?:\s+all)?'
        r')\b')

    flags = pd.Series(0, index=df.index, name='flag_qa_sect')

    # Row-level masks, computed once for the whole frame.
    # na=False makes rows with missing text/title count as "no match" instead of
    # propagating NaN into the boolean masks.
    text_lc = df['text'].str.lower()
    is_analyst = df['title'].str.contains('analyst', case=False, na=False)
    is_operator = df['speaker'].str.lower() == 'operator'
    has_start = text_lc.str.contains(qa_start_pattern, regex=True, na=False)
    has_end = text_lc.str.contains(qa_end_pattern, regex=True, na=False)
    has_thanks = text_lc.str.contains(r'\bthank you\b', regex=True, na=False)
    is_analyst_question = (df['flag_question'] == 1) & is_analyst

    for _, group_df in df.groupby(['year', 'quarter', 'bank']):
        group_idx = group_df.index
        g_operator = is_operator.loc[group_idx]
        g_start = has_start.loc[group_idx]

        # --- Start: Operator first, then any speaker ---
        qa_start_idx = _first_true_label(g_operator & g_start)
        if qa_start_idx is None:
            qa_start_idx = _first_true_label(g_start)
        if qa_start_idx is None:
            continue  # no recognisable Q&A opening in this call

        # --- End (exclusive): Operator first, then any speaker, then end of call ---
        g_end_after_start = has_end.loc[group_idx] & (group_idx > qa_start_idx)
        qa_end_idx = _first_true_label(g_end_after_start & g_operator)
        if qa_end_idx is None:
            qa_end_idx = _first_true_label(g_end_after_start)
        if qa_end_idx is None:
            qa_end_idx = group_idx.max() + 1

        # --- Extension: keep the section open until a non-analyst thanks the
        # analyst who asked the last question ---
        g_questions = is_analyst_question.loc[group_idx]
        if g_questions.any():
            last_analyst_q_idx = group_idx[g_questions.to_numpy(dtype=bool)].max()
            g_closing_thanks = (
                ~is_analyst.loc[group_idx]
                & has_thanks.loc[group_idx]
                & (group_idx > last_analyst_q_idx)
            )
            thank_idx = _first_true_label(g_closing_thanks)
            if thank_idx is not None and thank_idx > qa_end_idx:
                qa_end_idx = thank_idx + 1

        # Flag only rows of this call, so a non-contiguous index can never leak
        # the flag into rows of another call.
        in_section = group_idx[(group_idx >= qa_start_idx) & (group_idx < qa_end_idx)]
        flags.loc[in_section] = 1

    return flags


In [None]:
# Step 3.3. Sometimes analysts pose questions that do not contain a "?".
# In that case, an analyst row that lies inside the Q&A section, has at least 16 words
# (shorter rows are usually closing formulations) and has flag_question == 0 is also treated as a question.
def adjust_flag_question_for_analysts(df):
    """Promote long analyst statements inside the Q&A section to questions.

    A row gets flag_question = 1 when all of the following hold: it is inside
    the Q&A section (flag_qa_sect == 1), its title is exactly 'analyst' after
    stripping and lower-casing, it is not yet flagged (flag_question == 0) and
    its text has at least 16 whitespace-separated words.

    Args:
        df: DataFrame with 'title', 'text', 'flag_qa_sect' and 'flag_question'.

    Returns:
        A modified copy of df; the input frame is left untouched.
    """
    result = df.copy()

    # Normalise the title so casing / surrounding spaces do not matter
    is_analyst = result['title'].str.strip().str.lower() == 'analyst'
    inside_qa = result['flag_qa_sect'] == 1
    not_yet_flagged = result['flag_question'] == 0
    long_enough = result['text'].str.split().str.len() >= 16

    promote = inside_qa & is_analyst & not_yet_flagged & long_enough
    result.loc[promote, 'flag_question'] = 1
    return result

# Run the question detection, QA section detection and detection of questions without "?" in the QA section.
# The order matters: qa_sect_detect() reads 'flag_question', and
# adjust_flag_question_for_analysts() reads both 'flag_question' and 'flag_qa_sect'.
df_all['flag_question'] = detect_question(df_all)
df_all['flag_qa_sect'] = qa_sect_detect(df_all)
df_all = adjust_flag_question_for_analysts(df_all)
df_all['flag_qa_sect'] = qa_sect_detect(df_all) # run again in case a question from an analyst that does not display "?" was the last question asked, so that flag_qa_sect gets extended


  sub_df['text_lc'].str.contains(qa_start_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_end_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_start_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_end_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_start_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_end_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_start_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_end_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_start_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_end_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_start_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_end_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_start_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_end_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_start_pattern, regex=True)
  sub_df['text_lc'].str.contains(qa_end_pattern, regex=True)
  sub_df

In [None]:
# Step 3.4. Find the (year, quarter, bank) calls that have no Operator at all.
call_keys = ['year', 'quarter', 'bank']

# Calls in which "Operator" appears as speaker or as title
operator_rows = df_all['speaker'].eq('Operator') | df_all['title'].eq('Operator')
has_operator = df_all.loc[operator_rows, call_keys].drop_duplicates()

# Anti-join: keep only the rows of calls that are absent from has_operator
clean_df = (
    df_all
    .merge(has_operator, on=call_keys, how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns=['_merge'])
)

# Row count per call without Operator. These calls will be treated separately in the query-response part
df_bank_no_operator = clean_df.groupby(call_keys).size().reset_index(name='count')
df_bank_no_operator

Unnamed: 0,year,quarter,bank,count
0,2023,Q4,barc,92
1,2023,Q4,hsbc,71
2,2024,Q4,barc,57


In [None]:
# Step 3.5. Add helper columns used when building the message prompts.
title_col = df_all['title']

# flag_sum_question_qa: flag_question + flag_qa_sect (2 = question inside the Q&A section)
df_all['flag_sum_question_qa'] = df_all['flag_question'].add(df_all['flag_qa_sect'])
# flag_analyst: 1 if title == 'Analyst', otherwise 0 (what analysts say)
df_all['flag_analyst'] = title_col.eq('Analyst').astype(int)
# flag_operator: 1 if title == 'Operator', otherwise 0 (Operator intro / outro)
df_all['flag_operator'] = title_col.eq('Operator').astype(int)
# flag_presenter: 1 if title is neither 'Operator' nor 'Analyst' (the presenters)
df_all['flag_presenter'] = (~title_col.isin(['Operator', 'Analyst'])).astype(int)


# export as xlsx
df_all.to_excel("all_banks_transcript_split.xlsx", index=False)



In [None]:
# Step 3.6. Create a new column "type" that holds 'query', 'answer' or NaN. Rows with 20 words or fewer are also ignored (these are phrases like thank you, next question, etc.)
# Rules specified
#     If flag_operator == 1 → then type = NaN
#     If flag_operator == 0 and flag_sum_question_qa == 2 → then type = 'query'
#     If flag_operator == 0 and flag_sum_question_qa == 1 and flag_analyst == 0 and flag_presenter == 1 → then type = 'answer'
#     If flag_operator == 0 and flag_sum_question_qa == 1 and flag_analyst == 1 and flag_presenter == 0 → then type = NaN
import numpy as np

def set_type(row):
    """Classify one transcript row as 'query', 'answer' or NaN.

    Args:
        row: Mapping / Series with 'text', 'flag_operator',
            'flag_sum_question_qa', 'flag_analyst' and 'flag_presenter'.

    Returns:
        'query'  - non-operator row with flag_sum_question_qa == 2;
        'answer' - non-operator presenter row (flag_analyst == 0,
                   flag_presenter == 1) with flag_sum_question_qa == 1;
        np.nan   - everything else, including every row of 20 words or fewer.
    """
    # Short utterances ("thank you", "next question", ...) and operator rows never qualify
    is_short = len(str(row['text']).split()) <= 20
    if is_short or row['flag_operator'] == 1:
        return np.nan

    qa_score = row['flag_sum_question_qa']
    if qa_score == 2:
        return 'query'

    is_presenter_row = row['flag_analyst'] == 0 and row['flag_presenter'] == 1
    if qa_score == 1 and is_presenter_row:
        return 'answer'

    return np.nan

# Apply to DataFrame: classify every row (axis=1 passes one row at a time to set_type)
df_all['type'] = df_all.apply(set_type, axis=1)


# Overwrite the Excel export so that it includes the new 'type' column
df_all.to_excel("all_banks_transcript_split.xlsx", index=False)


In [None]:
# Step 3.7. Derive the dataframe from which the message prompts are built.
# It carries the columns: type (query, answer), person_type (participant, presenter), name, job, text;
# message_final (the standard prompt plus the user message for the Phi4 model) is added in step 3.8.

is_participant = df_all['flag_analyst'] == 1
is_presenter = df_all['flag_presenter'] == 1

# person_type: 'participant' for analysts, 'presenter' for presenters, otherwise None
df_all['person_type'] = np.where(
    is_participant,
    'participant',
    np.where(is_presenter, 'presenter', None),
)
# name: the speaker, but only where person_type is populated
has_person_type = df_all['person_type'].notnull()
df_all['name'] = np.where(has_person_type, df_all['speaker'], None)
# job: the title, but only for presenters
df_all['job'] = np.where(df_all['person_type'] == 'presenter', df_all['title'], None)

# Keep only rows with a type, and only the prompt-relevant columns (text last)
message_columns = ['type', 'person_type', 'name', 'job', 'text']
df_messages = df_all.loc[df_all['type'].notnull(), message_columns]

In [None]:
# Step 3.8. Create the message_final prompt for the Phi4 model.
# First, create message_user: a dictionary holding the query and its answers, without the standard messages at the beginning.
# The standard messages are then prepended to form message_final, which is what is passed to the Phi4 model.

# Define the standard prompt messages. They are prepended to every message_user.
# The list is: system instruction, one worked example request (user) and the
# expected reply for that example (assistant).
standard_message = [
    {
        'role': 'system',
        'content': (
            "You are a financial analyst. For each Q&A exchange, extract only the financial metric(s) discussed, "
            "provide numerical value if mentioned, any future trajectory (e.g., increase, decline, flat), and whether "
            "the presenters' responses answered the questions or avoided them. \n\n"
            "⚠️ Format your answer strictly as a flat list of strings like this:\n"
            "['Metric1', 'LEVEL or NO LEVEL', 'TRAJECTORY or NO TRAJECTORY', 'ANSWERED or AVOIDED', "
            "'Metric2', ...].\n"
            "❌ Do NOT return dictionaries, JSON objects, or multi-line strings.\n"
            "✅ Return a *single flat list* of values in the correct order."
        )
    },
    {
        'role': 'user',
        'content': (
            "From the following query object [{'type': 'query', 'person_type': 'participant', 'name': 'Russ Hunter', "
            "'text': 'My first question relate to FTSE 100, starting at the level of 1500 points, do you think it will "
            "grow in medium term? And my second question refers to both ROE and ROA? Do you think they will increase "
            "or decrease?'}] and the following answer objects [{'type': 'answer', 'person_type': 'presenter',"
            "'name': 'Olivia Hunter', 'job': 'CFO', 'text': 'The FTSE 100 has some really complex visualisations. "
            "Related to your second question we expect ROE to increase at 5% and ROA to decrease.'}], "
            "extract each financial metric mentioned, the level if mentioned (or NO LEVEL), future trajectory "
            "(or NO TRAJECTORY), and whether the question was ANSWERED or AVOIDED. Return result as a flat list."
        )
    },
    {
        'role': 'assistant',
        # The example answer states "ROE to increase at 5% and ROA to decrease":
        # the 5% level therefore belongs to ROE, and ROA has no level.
        'content': (
            "['FTSE 100', '1500', 'GROW', 'AVOIDED', 'ROE', '5%', 'INCREASE', 'ANSWERED', "
            "'ROA', 'NO LEVEL', 'DECREASE', 'ANSWERED']"
        )
    }
]

# Start with an empty (all-NaN, object dtype) message_final column
df_messages['message_final'] = pd.Series(dtype='object')

# Walk through the rows in order: every 'query' row collects the run of
# 'answer' rows that immediately follows it; the prompt is stored on the query row.
row_labels = df_messages.index
row_types = df_messages['type'].tolist()
n_rows = len(df_messages)

pos = 0
while pos < n_rows:
    if row_types[pos] != 'query':
        # answers that are not preceded by a query are skipped
        pos += 1
        continue

    question_row = df_messages.iloc[pos]
    query_obj = {
        "type": "query",
        "person_type": question_row['person_type'],
        "name": question_row['name'],
        "text": question_row['text']
    }

    # Consecutive answers directly after the query belong to it
    answer_objs = []
    nxt = pos + 1
    while nxt < n_rows and row_types[nxt] == 'answer':
        reply_row = df_messages.iloc[nxt]
        answer_objs.append({
            "type": "answer",
            "person_type": reply_row['person_type'],
            "name": reply_row['name'],
            "job": reply_row['job'],
            "text": reply_row['text']
        })
        nxt += 1

    # User turn holding this query and its answers
    message_user = {
        "role": "user",
        "content": (
            f"From the following query object {repr([query_obj])} and the following answer objects "
            f"{repr(answer_objs)}, extract each financial metric mentioned, the level if mentioned (or NO LEVEL), "
            f"future trajectory (or NO TRAJECTORY), and whether the question was ANSWERED or AVOIDED. "
            f"Return the result as a single flat list in the format: "
            f"['Metric', 'LEVEL or NO LEVEL', 'TRAJECTORY or NO TRAJECTORY', 'ANSWERED or AVOIDED', ...]."
        )
    }

    # Final message for Phi-4: standard messages followed by the user turn, stored on the query row
    df_messages.at[row_labels[pos], 'message_final'] = standard_message + [message_user]

    # Continue with the first row after this question block
    pos = nxt


In [None]:
# Export the prompts; index=True keeps the row labels inherited from df_all in the file.
df_messages.to_excel("df_messages.xlsx", index=True)

# Step 4: Run the Phi-4 model

In [None]:
# Step 4.1. Load Phi4 model.
# Fix torch's random seed before loading the model.
torch.random.manual_seed(0)

# device_map="cuda" requests placement on the GPU, so this cell needs a CUDA runtime.
# NOTE(review): trust_remote_code=True runs Python files shipped with the model
# repository; the load output itself suggests pinning a revision to avoid
# silently picking up new versions of those files - consider passing `revision=`.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-4-mini-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

configuration_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.77G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [None]:
# Load the tokenizer published with the same "microsoft/Phi-4-mini-instruct" checkpoint
# that the model is loaded from, so prompts are encoded the way the model expects.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/249 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

In [None]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Keyword arguments forwarded to every `pipe(...)` call.
generation_args = {
    "max_new_tokens": 500,      # upper bound on the number of tokens generated per reply
    "return_full_text": False,  # return only the generated continuation, not the prompt
    # Greedy (deterministic) decoding. `temperature` is deliberately not set: it only
    # applies when sampling, and passing it together with do_sample=False makes
    # transformers emit a "generation flags are not valid" warning on every call.
    "do_sample": False,
}

Device set to use cuda


In [None]:
# Step 4.2. Run Phi 4 on every prompt stored in `message_final` and save the reply in a
# new column of the dataframe called `model_output`.
generated_outputs = []
failed_rows = []  # index labels of rows whose generation raised an exception

# Only the `message_final` column is needed, so iterate over it directly (paired with the
# index labels for error reporting) instead of materialising every row with iterrows().
for row_label, message_final in zip(df_messages.index, df_messages['message_final']):
    # A prompt is a list of chat messages; anything else (None / NaN on rows that carry
    # no prompt) is skipped. isinstance() already rules out None.
    if isinstance(message_final, list):
        try:
            result = pipe(message_final, **generation_args)
            generated_text = result[0]['generated_text']
        except Exception as e:
            # Best effort: keep processing the remaining rows, store the error text in
            # place of the reply, and remember the row so the failure is reported below.
            generated_text = f"ERROR: {e}"
            failed_rows.append(row_label)
    else:
        generated_text = None  # Skipped row

    generated_outputs.append(generated_text)

# Store results (one entry per row, in row order)
df_messages['model_output'] = generated_outputs

# Surface failures explicitly so "ERROR: ..." strings do not flow unnoticed into the
# exported Excel / JSON files.
if failed_rows:
    print(f"Generation failed for {len(failed_rows)} row(s): {failed_rows}")


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignore

In [None]:
# Inspect df_messages now that the `model_output` column has been added.
df_messages

Unnamed: 0,type,person_type,name,job,text,message_final,model_output
4,query,participant,Omar Keenan,,Good morning everybody. Congratulations on a g...,"[{'role': 'system', 'content': 'You are a fina...","['RoTE', '15%', 'ABOVE 10%', 'ANSWERED', 'Capi..."
5,answer,presenter,Venkatakrishnan,Group Chief Executive,"Thanks, Omar. It's Venkat. So let me answer th...",,
6,answer,presenter,Anna Cross,Group Finance Director,"Yes, sure. Omar, you're right. We said that in...",,
8,query,participant,Jason Napier,,Good morning. Thank you for taking my question...,"[{'role': 'system', 'content': 'You are a fina...","['Loan loss charge', 'NO LEVEL', 'NO TRAJECTOR..."
9,answer,presenter,Anna Cross,Group Finance Director,"Thanks, Jason. Our impairment charge in the fi...",,
...,...,...,...,...,...,...,...
4187,answer,presenter,Stefano Porro,Chief Financial Officer,Yes. So in relation to the net interest income...,,
4189,query,participant,Pamela Zuluaga,,The first one is around excess capital. You sa...,"[{'role': 'system', 'content': 'You are a fina...","['Excess capital', 'NO LEVEL', 'RETURN BY 2027..."
4190,answer,presenter,Andrea Orcel,Chief Executive Officer,"Okay. So on excess capital, our commitment rem...",,
4191,answer,presenter,Stefano Porro,Chief Financial Officer,"Yes. So in relation to net interest income, so...",,


In [None]:
# Step 4.3. Save df_messages (prompts plus model responses) to Excel, then attach the
# responses to the full dataframe.
df_messages.to_excel("df_messages_with_responses.xlsx", index=True)
# Build df_all_banks_messages: df_all plus one additional column, `model_output`, taken
# from df_messages. pd.concat(axis=1) aligns the two frames on their index labels.
# NOTE(review): this assumes df_messages keeps the index labels of df_all (df_all is
# defined earlier in the notebook) — confirm.
df_all_banks_messages = pd.concat([df_all, df_messages[["model_output"]]], axis=1)

In [None]:
# Retrieve data from df_all_banks where column type is query
df_all_banks_messages[df_all_banks_messages['type'] == 'query']

Unnamed: 0,bank,year,quarter,speaker,title,text,flag_question,flag_qa_sect,flag_sum_question_qa,flag_analyst,flag_operator,flag_presenter,type,person_type,name,job,model_output
4,barc,2023,Q1,Omar Keenan,Analyst,Good morning everybody. Congratulations on a g...,1.0,1.0,2.0,1.0,0.0,0.0,query,participant,Omar Keenan,,"['RoTE', '15%', 'ABOVE 10%', 'ANSWERED', 'Capi..."
8,barc,2023,Q1,Jason Napier,Analyst,Good morning. Thank you for taking my question...,1.0,1.0,2.0,1.0,0.0,0.0,query,participant,Jason Napier,,"['Loan loss charge', 'NO LEVEL', 'NO TRAJECTOR..."
11,barc,2023,Q1,Jason Napier,Analyst,"Thank you, Venkat. If I could just sort of fol...",1.0,1.0,2.0,1.0,0.0,0.0,query,participant,Jason Napier,,"['RWA', 'NO LEVEL', 'GROW', 'ANSWERED', 'Balan..."
16,barc,2023,Q1,Joseph Dickerson,Analyst,"Hi, good morning. Thank you for taking my ques...",1.0,1.0,2.0,1.0,0.0,0.0,query,participant,Joseph Dickerson,,"['Swap rates', '375', 'FLAT', 'ANSWERED', 'UK ..."
18,barc,2023,Q1,Joseph Dickerson,Analyst,"Yes, thanks. It's just very impressive when yo...",1.0,1.0,2.0,1.0,0.0,0.0,query,participant,Joseph Dickerson,,"['Net number', '£1.7 billion', 'NO TRAJECTORY'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4175,uc,2025,Q1,Andrea Filtri,Analyst,"I've got 2. First, on guidance. How much of th...",1.0,1.0,2.0,1.0,0.0,0.0,query,participant,Andrea Filtri,,"['FTSE 100', '1500', 'GROW', 'AVOIDED', 'ROE',..."
4178,uc,2025,Q1,Marco Nicolai,Analyst,"So if I look at the headcount direction, this ...",1.0,1.0,2.0,1.0,0.0,0.0,query,participant,Marco Nicolai,,"['headcount', 'NO LEVEL', 'STABILIZING', 'ANSW..."
4182,uc,2025,Q1,Delphine Lee,Analyst,"The first one is on capital, just a clarificat...",1.0,1.0,2.0,1.0,0.0,0.0,query,participant,Delphine Lee,,"['Capital', 'NO LEVEL', 'GROW', 'ANSWERED', 'S..."
4186,uc,2025,Q1,Britta Schmidt,Analyst,"Yes. On net interest income, I hear that your ...",1.0,1.0,2.0,1.0,0.0,0.0,query,participant,Britta Schmidt,,"['Net interest income', 'NO LEVEL', 'IMPROVE',..."


In [None]:
# Display df_messages again (same dataframe as shown in the earlier inspection cell).
df_messages

Unnamed: 0,type,person_type,name,job,text,message_final,model_output
4,query,participant,Omar Keenan,,Good morning everybody. Congratulations on a g...,"[{'role': 'system', 'content': 'You are a fina...","['RoTE', '15%', 'ABOVE 10%', 'ANSWERED', 'Capi..."
5,answer,presenter,Venkatakrishnan,Group Chief Executive,"Thanks, Omar. It's Venkat. So let me answer th...",,
6,answer,presenter,Anna Cross,Group Finance Director,"Yes, sure. Omar, you're right. We said that in...",,
8,query,participant,Jason Napier,,Good morning. Thank you for taking my question...,"[{'role': 'system', 'content': 'You are a fina...","['Loan loss charge', 'NO LEVEL', 'NO TRAJECTOR..."
9,answer,presenter,Anna Cross,Group Finance Director,"Thanks, Jason. Our impairment charge in the fi...",,
...,...,...,...,...,...,...,...
4187,answer,presenter,Stefano Porro,Chief Financial Officer,Yes. So in relation to the net interest income...,,
4189,query,participant,Pamela Zuluaga,,The first one is around excess capital. You sa...,"[{'role': 'system', 'content': 'You are a fina...","['Excess capital', 'NO LEVEL', 'RETURN BY 2027..."
4190,answer,presenter,Andrea Orcel,Chief Executive Officer,"Okay. So on excess capital, our commitment rem...",,
4191,answer,presenter,Stefano Porro,Chief Financial Officer,"Yes. So in relation to net interest income, so...",,


In [None]:
# Step 4.3 (continued). Save both dataframes to Excel.
df_messages.to_excel("df_messages_with_responses.xlsx", index=True)
# Save df_all_banks_messages as an xlsx
df_all_banks_messages.to_excel("df_all_banks_messages_output.xlsx", index=True)

# Step 4.4 Create JSON file
# Filter rows where 'type' is not null
df_filtered = df_all_banks_messages[df_all_banks_messages['type'].notnull()]
# Select and reorder the required columns
df_selected = df_filtered[['bank', 'year', 'quarter', 'speaker', 'title', 'type', 'text', 'model_output']]
# Replace missing values with None so they are written as JSON `null`. Without this,
# json.dump emits the bare token NaN for missing cells, which is not valid JSON and is
# rejected by strict parsers. Casting to object first lets None be stored as-is instead
# of being coerced back to NaN in numeric columns.
df_json_ready = df_selected.astype(object).where(df_selected.notna(), None)
# Convert to list of dictionaries (one dictionary per row)
records = df_json_ready.to_dict(orient='records')
# Save to JSON file; ensure_ascii=False keeps non-ASCII characters readable in the file
with open("all_banks_messages_output.json", "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print("Saved as all_banks_messages_output.json")

#### To inspect the saved JSON file, load it back with:
#with open("all_banks_messages_output.json", "r", encoding="utf-8") as f:
#    data = json.load(f)
