In [316]:
import os
import re
import pdfplumber
import yfinance as yf
import pandas as pd
from datetime import datetime
import logging

logging.getLogger("pdfminer").setLevel(logging.ERROR)


In [None]:
def generate_headers_from_filename(file_path):
    """Build the column headers for a quarterly statement table.

    The reporting quarter and year are read from the base name of
    ``file_path`` (for example ``"4Q 2021-report.pdf"``).  From them three
    period labels are derived: the current quarter, the previous quarter
    and the same quarter one year earlier.

    Args:
        file_path: Path (or bare file name) containing a quarter ``1Q``-``4Q``
            followed, optionally after whitespace, by a four-digit year.

    Returns:
        A list of 12 header strings: ``"Label"``, the USD/NTD/PCT columns of
        the current quarter, the NTD/PCT columns of the previous quarter and
        of the year-ago quarter, then the four QoQ/YoY change columns.

    Raises:
        ValueError: If the file name contains no valid quarter and year.
    """
    filename = os.path.basename(file_path)
    # Only 1Q-4Q exist; a bare ``\d`` would accept e.g. "9Q 2021" and yield a
    # meaningless previous-quarter label such as "8Q-2021".
    match = re.search(r"([1-4])Q\s*(\d{4})", filename)
    if not match:
        raise ValueError("Filename must contain a quarter and year like '4Q 2021'")

    q_num = int(match.group(1))
    current_year = match.group(2)
    year_num = int(current_year)

    current_label = f"{q_num}Q-{current_year}"

    # The quarter before 1Q is 4Q of the preceding year.
    if q_num == 1:
        prev_label = f"4Q-{year_num - 1}"
    else:
        prev_label = f"{q_num - 1}Q-{year_num}"

    # Same quarter, previous year.
    yoy_label = f"{q_num}Q-{year_num - 1}"

    return [
        "Label",
        f"USD_{current_label}", f"NTD_{current_label}", f"PCT_{current_label}",
        f"NTD_{prev_label}", f"PCT_{prev_label}",
        f"NTD_{yoy_label}", f"PCT_{yoy_label}",
        "QoQ_Amount", "QoQ_PCT", "YoY_Amount", "YoY_PCT",
    ]

# Cleaning functions
def fix_broken_numbers(text):
    """Repair dollar amounts and digit groups that were split or fused.

    Four regex substitutions are applied to ``text`` in a fixed order and the
    result is returned.
    """
    repair_steps = (
        # "$ 1 ,234" -> "$1,234": drop whitespace around a lone leading digit.
        (r'\$\s*(\d)\s*,', r'$\1,'),
        # "$1 234,567" -> "$1234,567": glue leading digits onto the next group.
        (r'\$\s*(\d{1,2})\s+(\d{1,3},\d{3})', r'$\1\2'),
        # "12 345" -> "12345": remove whitespace between a digit and 3+ digits.
        (r'(?<=\d)\s+(?=\d{3}(?:,\d{3})*)', ''),
        # "$1,23445.6" -> "$1,234 45.6": split an amount from a fused decimal.
        (r'(\$\d{1,3}(?:,\d{3})+)(?=\d{1,3}\.\d)', r'\1 '),
    )
    for pattern, replacement in repair_steps:
        text = re.sub(pattern, replacement, text)
    return text

def separate_concatenated_numbers(text):
    """Insert a space after a ``d,ddd`` group that runs into another number.

    A group is split off when it is immediately followed by one to three
    digits and a comma, e.g. ``"1,2345,678"`` becomes ``"1,234 5,678"``.
    """
    fused_groups = re.compile(r'(\d{1,3},\d{3})(?=\d{1,3},)')
    return fused_groups.sub(r'\1 ', text)

def separate_percent_and_number(text):
    """Insert a space between a one-decimal value and a fused ``d,ddd`` number.

    For example ``"45.61,234"`` becomes ``"45.6 1,234"``.
    """
    fused_decimal = re.compile(r'(\d\.\d)(?=\d{1,3},\d{3})')
    return fused_decimal.sub(r'\1 ', text)

def clean_line(text):
    """Run the three number-repair passes over one line of extracted text.

    The passes run in this order: ``fix_broken_numbers``,
    ``separate_concatenated_numbers``, ``separate_percent_and_number``.
    """
    cleaners = (
        fix_broken_numbers,
        separate_concatenated_numbers,
        separate_percent_and_number,
    )
    for cleaner in cleaners:
        text = cleaner(text)
    return text

def extract_exchange_rate(pages):
    """Return the first ``NT$<decimal>`` figure found in ``pages`` as a float.

    Each page's ``extract_text()`` result is searched in order; pages whose
    text is empty or ``None`` are skipped.  Returns ``None`` when no page
    contains a match.
    """
    rate_regex = re.compile(r"NT\$ ?(\d+\.\d+)")
    for page in pages:
        content = page.extract_text()
        if not content:
            continue
        found = rate_regex.search(content)
        if found is not None:
            return float(found.group(1))
    return None

def extract_section_from_lines(lines, start_marker, end_marker=None, include_end=True):
    """Return the slice of ``lines`` between two case-insensitive markers.

    Args:
        lines: Sequence of text lines to scan.
        start_marker: Substring identifying the first line of the section.
            The first line containing it wins.
        end_marker: Optional substring identifying the last line.  It is only
            looked for once the start has been found, and it may match on the
            very same line as the start marker.
        include_end: When true the end-marker line is part of the result,
            otherwise the section stops just before it.

    Returns:
        The matching slice of ``lines``.  If the end marker is absent (or not
        given) the section runs to the end of ``lines``.  If the start marker
        is never found an empty list is returned.
    """
    start_idx = end_idx = None
    start_key = start_marker.lower()
    end_key = end_marker.lower() if end_marker else None

    for i, line in enumerate(lines):
        lowered = line.lower()
        if start_idx is None and start_key in lowered:
            start_idx = i
        if end_key and start_idx is not None and end_key in lowered:
            end_idx = i + 1 if include_end else i
            break

    if start_idx is None:
        return []
    # Compare against None rather than truthiness: an end index of 0 is a
    # legitimate (empty-section) result and must not be read as "no end found".
    if end_idx is None:
        return lines[start_idx:]
    return lines[start_idx:end_idx]

In [None]:
def clean_line_text(line):
    """Normalise digit runs, lone hyphens and split decimals in one line.

    Three substitutions are applied in order:

    1. A space is inserted after any digit that is followed by five or more
       digits, breaking up long digit runs.
    2. A hyphen with whitespace on both sides is replaced by `` — ``.
    3. Whitespace between a digit and a following ``.digits`` is removed,
       e.g. ``"6 .41"`` becomes ``"6.41"``.
    """
    spaced = re.sub(r"(\d)(?=\d{5,})", r"\1 ", line)
    dashed = re.sub(r'(?<=\s)-(?=\s)', ' — ', spaced)
    return re.sub(r'(\d)\s+\.(\d+)', r'\1.\2', dashed)

def _parse_label_and_values(line, number_pattern):
    """Split one statement row into its text label and cleaned number tokens."""
    # The label is everything before the first whitespace that precedes a
    # sign, parenthesis, dollar sign or digit.
    label_match = re.match(r"^(.*?)(?=\s+[-\($\d])", line)
    label = label_match.group(1).strip() if label_match else ""
    # Drop parenthesised digit markers such as "(2)" from the label.
    label = re.sub(r"\(\d+\)", "", label)

    # NOTE: the tail is cut at len(label) of the *cleaned* label, so text
    # removed from the label above is still part of the tail being scanned.
    raw_values = re.findall(number_pattern, line[len(label):].strip())
    values = [
        # "(1,234)" -> "-1234"; a lone em dash stands for an empty cell.
        v.replace("(", "-").replace(")", "").replace(",", "") if v != "—" else ""
        for v in raw_values
    ]
    return label, values


def extract_table_from_section(lines, source_path=None):
    """Parse the text lines of a statement section into a DataFrame.

    Args:
        lines: Text lines of one section (already sliced to the section).
        source_path: Path whose file name carries the quarter and year used to
            build the column headers.  When omitted, the module-level
            ``file_path`` variable is used, as before.

    Returns:
        A ``pandas.DataFrame`` of strings with one row per parsed statement
        line.  Columns are the headers from
        ``generate_headers_from_filename`` truncated to the length of the
        first row.  If no rows are parsed, an empty frame carrying all
        headers is returned.
    """
    headers = generate_headers_from_filename(
        file_path if source_path is None else source_path
    )
    data = []

    number_pattern = r"-?\(?\d[\d,]*\)?(?:\.\d+)?|—"
    # Matches any run of digits/commas, so any line containing a digit or a
    # comma is treated as the start of a new row.
    currency_marker = re.compile(r"\$?-?\(?[\d,]+(?:\.\d+)?\)?")

    # Combine multi-line rows: lines without a number are appended to the
    # row currently being accumulated.
    combined_lines = []
    buffer = ""
    for line in lines:
        line = clean_line_text(line)
        if not line.strip():
            continue
        if currency_marker.search(line):
            if buffer:
                combined_lines.append(buffer.strip())
            buffer = line
        else:
            buffer += " " + line
    if buffer:
        combined_lines.append(buffer.strip())

    for line in combined_lines:
        # Cleaned again because joining lines can create new split decimals.
        line = clean_line_text(line)

        # Special case: Weighted Average Outstanding Shares.  The first three
        # integers found go into the three NTD_* columns; the rest stay empty.
        if "Weighted Average Outstanding Shares" in line:
            label = " ".join(re.sub(r"\s{2,}", " ", line).split()[0:6])
            numbers = [n.replace(",", "") for n in re.findall(r"\d[\d,]*", line)]
            row = [
                label,
                "", numbers[0] if len(numbers) > 0 else "", "",
                numbers[1] if len(numbers) > 1 else "", "",
                numbers[2] if len(numbers) > 2 else "", "",
                "", "", ""
            ]
            data.append(row)
            continue

        label, values = _parse_label_and_values(line, number_pattern)

        # Earnings per Share or per ADR: these rows have no PCT figures, so
        # the values are spread over the USD/NTD/change columns explicitly.
        if "Earnings per Share" in line or "Earnings per ADR" in line:
            # NOTE(review): presumably a footnote marker like "(2)" that was
            # parsed as the number -2 -- confirm against the source PDFs.
            if values and values[0] == "-2":
                values.pop(0)

            # Slices (not indexing) are used so missing values add nothing;
            # later cells then shift left rather than raising.
            row = [label]
            row += values[0:1]   # USD
            row += values[1:2]   # NTD_1
            row += [""]          # PCT_1
            row += values[2:3]   # NTD_2
            row += [""]          # PCT_2
            row += values[3:4]   # NTD_3
            row += [""]          # PCT_3
            row += values[4:5]   # QoQ_Amount
            row += values[5:6]   # QoQ_PCT
            row += values[6:7]   # YoY_Amount
            row += values[7:8]   # YoY_PCT
            row += [""] * (len(headers) - len(row))
            data.append(row)
            continue

        # General case.  At most one value per non-label header is kept so a
        # row can never be wider than the header list.
        row = [label] + values[:len(headers) - 1]
        row += [""] * (len(headers) - len(row))
        data.append(row)

    if not data:
        # Nothing parsed: return an empty table instead of failing on data[0].
        return pd.DataFrame(columns=headers)
    return pd.DataFrame(data, columns=headers[:len(data[0])])

In [None]:
ticker = yf.Ticker("TSM")

# Calendar quarter-end date for each reporting quarter.
quarter_ends = {
    "4Q 2021": "2021-12-31",
    "1Q 2022": "2022-03-31",
    "2Q 2022": "2022-06-30",
    "3Q 2022": "2022-09-30"
}

history = ticker.history(start="2021-10-01", end="2022-10-01")

# Format the index once; re-running strftime over the whole index on every
# loop iteration is wasted work.
trading_day_strings = set(history.index.strftime("%Y-%m-%d"))
if not trading_day_strings:
    # Without this guard the backward walk below would never terminate.
    raise ValueError("No price history returned for TSM")
first_trading_day = min(trading_day_strings)

# Extract closing prices for quarter-end dates
closing_prices = {}
for quarter, date in quarter_ends.items():
    # Some dates may fall on weekends, so get the nearest previous trading day
    while date not in trading_day_strings:
        # ISO date strings sort chronologically, so a plain comparison works.
        if date < first_trading_day:
            raise ValueError(
                f"No trading day on or before the end of {quarter} in the price history"
            )
        date = (pd.to_datetime(date) - pd.Timedelta(days=1)).strftime("%Y-%m-%d")
    closing_prices[quarter] = round(history.loc[date]["Close"], 2)

# Extract exchange rate, since pdf provides financial measures in NTD for previous years
def extract_exchange_rate_from_text(text):
    """Return the first ``NT$<decimal>`` figure in ``text`` as a float.

    Returns ``None`` when ``text`` contains no such figure.
    """
    found = re.search(r'NT\$ ?(\d+\.\d+)', text)
    if found is None:
        return None
    return float(found.group(1))


In [None]:
def get_value_by_label(dataset, keyword1, period, keyword2=None):
    """Look up the ``period`` value of the row whose label equals a keyword.

    Labels and keywords are compared after stripping surrounding whitespace
    and lower-casing.  ``keyword1`` is tried first, then ``keyword2`` if
    given.  The value of the first matching row is returned as a float.

    Raises:
        ValueError: If no row's label matches either keyword.
    """
    candidates = [keyword1, keyword2] if keyword2 else [keyword1]
    normalized_labels = dataset["Label"].str.strip().str.lower()

    for candidate in candidates:
        hits = dataset[normalized_labels == candidate.strip().lower()]
        if hits.empty:
            continue
        return float(hits[period].values[0])

    alternative = f" or '{keyword2}'" if keyword2 else ""
    raise ValueError(f"Label exactly matching '{keyword1}'" + alternative + " not found.")

def get_EPS(dataset, period, quarter):
    """Return the diluted earnings-per-share figure for ``period`` as a float.

    ``quarter`` is accepted but not used by this function.
    """
    return float(
        get_value_by_label(dataset, "Earnings per Share  —  Diluted", period)
    )

def get_Revenue_Growth(dataset, period1, period2, er_rate):
    """Return the percentage change in "Net Revenue" from period1 to period2.

    When the name of ``period1`` contains ``"NTD"`` its revenue is first
    divided by ``er_rate`` before the comparison.
    """
    current_revenue = float(get_value_by_label(dataset, "Net Revenue", period2))
    prior_revenue = float(get_value_by_label(dataset, "Net Revenue", period1))

    # Bring an NTD-denominated baseline onto the same footing as period2.
    if "NTD" in period1:
        prior_revenue = prior_revenue / float(er_rate)

    return (current_revenue - prior_revenue) / prior_revenue * 100

def get_Gross_Margin(dataset, period):
    """Return "Gross Profit" as a percentage of "Net Revenue" for ``period``."""
    revenue = float(get_value_by_label(dataset, "Net Revenue", period))
    profit = get_value_by_label(dataset, "Gross Profit", period)
    return (profit / revenue) * 100

def get_PE_Ratio(dataset, period, share_prices, quarter):
    """Return the share price for ``quarter`` divided by the period's EPS.

    The price is read from ``share_prices`` with ``.get(quarter)``; the EPS
    comes from ``get_EPS``.
    """
    price = share_prices.get(quarter)
    earnings_per_share = get_EPS(dataset, period, quarter)
    return float(price) / earnings_per_share

def get_ROE(dataset1, dataset2, period):
    """Return net income as a percentage of total shareholders' equity.

    "Net Income" is read from ``dataset1`` and "Total Shareholders' Equity"
    from ``dataset2``, both for ``period``.
    """
    income = float(get_value_by_label(dataset1, "Net Income", period))
    equity = float(get_value_by_label(dataset2, "Total Shareholders' Equity", period))
    return (income / equity) * 100

def get_DE(dataset, period):
    """Return the debt-to-equity ratio for ``period``.

    Liabilities are derived as "Total Liabilities & Shareholders' Equity"
    minus "Total Shareholders' Equity", then divided by the equity.
    """
    combined_total = float(
        get_value_by_label(dataset, "Total Liabilities & Shareholders' Equity", period)
    )
    equity = float(get_value_by_label(dataset, "Total Shareholders' Equity", period))

    liabilities_only = combined_total - equity
    return liabilities_only / equity

def get_BS(dataset1, dataset2, period1, period2, quarter):
    """Return shareholders' equity divided by weighted average diluted shares.

    Equity is read from ``dataset1`` at ``period1``; the share count is read
    from ``dataset2`` at ``period2``.  ``quarter`` is accepted but not used.
    """
    equity = float(get_value_by_label(dataset1, "Total Shareholders' Equity", period1))
    share_count = float(
        get_value_by_label(dataset2, "Weighted Average Outstanding Shares — Diluted", period2)
    )
    return equity / share_count

def get_PB(dataset1, dataset2, period1, period2, share_prices, quarter):
    """Return the share price for ``quarter`` divided by the ``get_BS`` value."""
    price = share_prices.get(quarter)
    book_per_share = get_BS(dataset1, dataset2, period1, period2, quarter)
    return price / book_per_share

In [None]:
# Folder holding the quarterly report PDFs, and the CSV file to write.
folder_path = "/Users/tanyikchen/Library/CloudStorage/OneDrive-UniversityofNottinghamMalaysia/y3/DIA/Annual Reports/TSM/"
output_path = "/Users/tanyikchen/Library/CloudStorage/OneDrive-UniversityofNottinghamMalaysia/y3/DIA/Datasets/TSM_FA.csv"

# Calendar quarter-end date per reporting quarter, in chronological order.
quarter_ends = {
    "4Q 2021": "2021-12-31",
    "1Q 2022": "2022-03-31",
    "2Q 2022": "2022-06-30",
    "3Q 2022": "2022-09-30",
}

# Quarters to process, in the same order as the mapping above.
target_quarters = list(quarter_ends)

def get_nearest_trading_day(date_str, trading_days):
    """Return the latest trading day on or before ``date_str``.

    Args:
        date_str: Date (anything ``pandas.to_datetime`` accepts).
        trading_days: Collection of trading-day timestamps supporting ``in``,
            ``len`` and iteration, e.g. a ``DatetimeIndex``.

    Returns:
        ``date_str`` as a ``pandas.Timestamp`` if it is a trading day,
        otherwise the closest earlier timestamp found by stepping back one
        day at a time.

    Raises:
        ValueError: If ``trading_days`` is empty or holds no day on or before
            ``date_str``.  Previously this case looped forever.
    """
    date = pd.to_datetime(date_str)
    if len(trading_days) == 0:
        raise ValueError("trading_days is empty")

    earliest = min(trading_days)
    while date not in trading_days:
        # Stepping back past the earliest known day can never succeed.
        if date < earliest:
            raise ValueError(f"No trading day on or before {date_str}")
        date -= pd.Timedelta(days=1)
    return date

# Load pdf files: map each target quarter to its report, keyed by the part of
# the file name before the first "-" (e.g. "4Q 2021-....pdf" -> "4Q 2021").
pdf_files = {
    f.split("-")[0]: os.path.join(folder_path, f)
    for f in os.listdir(folder_path)
    if f.endswith(".pdf") and f.split("-")[0] in target_quarters
}

# Get Closing prices
ticker = yf.Ticker("TSM")
history = ticker.history(start="2021-10-01", end="2022-10-01")
# Drop the timezone so the index compares equal to naive quarter-end dates.
history.index = history.index.tz_localize(None)
trading_days = history.index

closing_prices = {
    quarter: round(history.loc[get_nearest_trading_day(date_str, trading_days)]["Close"], 2)
    for quarter, date_str in quarter_ends.items()
}

# Input fundamental data, keyed by the last trading day of each quarter.
fundamental_data = {}

for quarter in target_quarters:
    if quarter == "3Q 2022":
        # Values for this quarter are entered by hand rather than parsed.
        # NOTE(review): presumably its PDF could not be parsed -- confirm.
        date = get_nearest_trading_day("2022-09-30", trading_days)
        fundamental_data[date] = {
            "EPS": 0.36,
            "Revenue_Growth": 47.86206414,
            "Gross_Margin": 60.4252163,
            "P/E": 182.4444444,
            "ROE": 10.6967672,
            "D/E": 0.6870607,
            "P/B": 19.6555604,
            "B/S": 3.34154807
        }
        continue

    if quarter not in pdf_files:
        raise FileNotFoundError(f"No PDF report for {quarter} found in {folder_path}")
    # extract_table_from_section reads this module-level name.
    file_path = pdf_files[quarter]

    # Open the PDF once and keep the text of the first two pages; it feeds
    # both the table parsing and the exchange-rate lookup below.
    with pdfplumber.open(file_path) as pdf:
        page_texts = [pdf.pages[page_num].extract_text() or "" for page_num in range(2)]

    # Page 1 is read as the balance sheet, page 2 as the income statement.
    balance_sheet_lines = [clean_line(line) for line in page_texts[0].split("\n")]
    income_statement_lines = [clean_line(line) for line in page_texts[1].split("\n")]

    balance_section = extract_section_from_lines(
        balance_sheet_lines, "ASSETS", "Total Liabilities & Shareholders' Equity"
    )
    income_section = extract_section_from_lines(
        income_statement_lines, "Net Revenue", "Weighted Average Outstanding Shares"
    )

    df_income = extract_table_from_section(income_section)
    df_balance = extract_table_from_section(balance_section)

    # Only the Label and the USD_/NTD_ amount columns are needed from here on.
    df_income = df_income.loc[:, ~df_income.columns.str.contains(r"^PCT_|^QoQ_|^YoY_")]
    df_balance = df_balance.loc[:, ~df_balance.columns.str.contains(r"^PCT_|^QoQ_|^YoY_")]

    # Extract exchange rate (the one found on page 2 is the one used).
    exchange_rates = {
        f"{quarter} Page {i+1}": extract_exchange_rate_from_text(text)
        for i, text in enumerate(page_texts)
        if text
    }
    exchange_rate = exchange_rates.get(f"{quarter} Page 2")
    if exchange_rate is None:
        # Fail here with a clear message instead of a TypeError from
        # float(None) inside get_Revenue_Growth.
        raise ValueError(f"No NT$ exchange rate found on page 2 of {file_path}")

    # Construct column names
    year = quarter.split()[1]
    q = quarter.split()[0]
    last_year = str(int(year) - 1)
    col_usd = "USD_" + quarter.replace(" ", "-")
    col_ntd_y = "NTD_" + quarter.replace(" ", "-")
    col_ntd_last = "NTD_" + f"{q}-{last_year}".replace(" ", "-")

    indicators = {
        "EPS": get_EPS(df_income, col_usd, quarter),
        "Revenue_Growth": get_Revenue_Growth(df_income, col_ntd_last, col_usd, exchange_rate),
        "Gross_Margin": get_Gross_Margin(df_income, col_usd),
        "P/E": get_PE_Ratio(df_income, col_usd, closing_prices, quarter),
        "ROE": get_ROE(df_income, df_balance, col_usd),
        "D/E": get_DE(df_balance, col_usd),
        "P/B": get_PB(df_balance, df_income, col_usd, col_ntd_y, closing_prices, quarter),
        "B/S": get_BS(df_balance, df_income, col_usd, col_ntd_y, quarter),
    }

    report_date = get_nearest_trading_day(quarter_ends[quarter], trading_days)
    fundamental_data[report_date] = indicators

# Build time series dataframe: every trading day inside a quarter receives
# that quarter's indicator values.
fundamental_df = pd.DataFrame(index=trading_days)

sorted_quarters = sorted(quarter_ends.items(), key=lambda x: pd.to_datetime(x[1]))

for i, (quarter, end_str) in enumerate(sorted_quarters):
    end = get_nearest_trading_day(end_str, trading_days)
    # A quarter starts the day after the previous quarter's last trading day;
    # the first one starts at the beginning of the price history window.
    start = pd.to_datetime("2021-10-01") if i == 0 else get_nearest_trading_day(sorted_quarters[i-1][1], trading_days) + pd.Timedelta(days=1)

    if end in fundamental_data:
        mask = (fundamental_df.index >= start) & (fundamental_df.index <= end)
        for key, val in fundamental_data[end].items():
            fundamental_df.loc[mask, key] = val

# Export and output
# NOTE(review): index=False drops the Date index, so the CSV rows carry no
# dates -- confirm that downstream consumers align rows by position.
fundamental_df.to_csv(output_path, index=False)
print(fundamental_df)

             EPS  Revenue_Growth  Gross_Margin         P/E        ROE  \
Date                                                                    
2021-10-01  0.23       21.201842     52.662684  493.347826   7.617384   
2021-10-04  0.23       21.201842     52.662684  493.347826   7.617384   
2021-10-05  0.23       21.201842     52.662684  493.347826   7.617384   
2021-10-06  0.23       21.201842     52.662684  493.347826   7.617384   
2021-10-07  0.23       21.201842     52.662684  493.347826   7.617384   
2021-10-08  0.23       21.201842     52.662684  493.347826   7.617384   
2021-10-11  0.23       21.201842     52.662684  493.347826   7.617384   
2021-10-12  0.23       21.201842     52.662684  493.347826   7.617384   
2021-10-13  0.23       21.201842     52.662684  493.347826   7.617384   
2021-10-14  0.23       21.201842     52.662684  493.347826   7.617384   
2021-10-15  0.23       21.201842     52.662684  493.347826   7.617384   
2021-10-18  0.23       21.201842     52.662684  493