In [None]:
import re
import logging
import os
import pandas as pd
import pandas_market_calendars as mcal
import pdfplumber
import yfinance as yf

logging.getLogger("pdfminer").setLevel(logging.ERROR)

# Function to extract text between headers
def extract_section(text, start_marker, end_marker):
    pattern = rf"{re.escape(start_marker)}(.*?){re.escape(end_marker)}"
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else ""

def merge_multiline_lines(lines):
    merged_lines = []
    skip_next = False

    for i in range(len(lines)):
        if skip_next:
            skip_next = False
            continue

        line = lines[i]
        if i + 1 < len(lines):
            next_line = lines[i + 1]
            # If current line has no digits but the next line has, merge them
            if not re.search(r'\d', line) and re.search(r'\d', next_line):
                merged_lines.append(line + " " + next_line)
                skip_next = True
            else:
                merged_lines.append(line)
        else:
            merged_lines.append(line)

    return merged_lines

# Function to parse financial tables
def text_to_dataframe(section_text, debug=False):
    section_text = re.sub(r'\(\d+\)', '', section_text)

    # Split and merge lines
    lines = section_text.split("\n")
    lines = merge_multiline_lines(lines)

    if debug:
        print("\n[DEBUG] Lines after merging:")
        for i, line in enumerate(lines):
            print(f"{i}: {line}")

    # Try to find header line with years
    header_line = next((line for line in lines if re.search(r"(20\d{2})", line)), "")
    years = re.findall(r"(20\d{2})", header_line)

    if not years:
        print("[DEBUG] No valid year headers found.")
        return pd.DataFrame()

    column_names = ["Label"] + years
    data = []

    for line in lines:
        values = re.findall(r'[-−(]?\$?\(?\d[\d,\.]*\)?|—', line)
        if not values:
            continue
        value_start = line.find(values[0])
        label = line[:value_start].strip()
        cleaned_values = [
            v.replace(',', '')
             .replace('−', '-')
             .replace('$', '')
             .replace('(', '-')
             .replace(')', '')
             .strip()
            for v in values
        ]
        row = [label] + cleaned_values[:len(column_names)-1]
        data.append(row)

    for row in data:
        if len(row) < len(column_names):
            row += [""] * (len(column_names) - len(row))
        elif len(row) > len(column_names):
            row = row[:len(column_names)]

    df = pd.DataFrame(data, columns=column_names)
    return df[df["Label"].str.lower() != "label"].reset_index(drop=True)

In [None]:
ticker = yf.Ticker("AMZN")

quarter_ends = {
    "Q4-2021": "2021-12-31",
    "Q1-2022": "2022-03-31",
    "Q2-2022": "2022-06-30",
    "Q3-2022": "2022-09-30"
}

history = ticker.history(start="2021-10-01", end="2022-10-01")

# Extract closing prices for quarter-end dates
closing_prices = {}
for quarter, date in quarter_ends.items():
    # Some dates may fall on weekends, so get the nearest previous trading day
    while date not in history.index.strftime("%Y-%m-%d"):
        date = (pd.to_datetime(date) - pd.Timedelta(days=1)).strftime("%Y-%m-%d")
    closing_prices[quarter] = round(history.loc[date]["Close"], 2)

Amazon Quarter-End Share Prices (Adjusted Close):
Q4-2021: $166.72
Q1-2022: $163.0
Q2-2022: $106.21
Q3-2022: $113.0


In [4]:
def get_value_by_label(dataset, keyword1, period, keyword2=None):
    keywords = [keyword1]
    if keyword2:
        keywords.append(keyword2)
    
    for keyword in keywords:
        match = dataset[dataset["Label"].str.strip().str.lower() == keyword.strip().lower()]
        if not match.empty:
            return float(match[period].values[0])
    
    raise ValueError(f"Label exactly matching '{keyword1}'" + (f" or '{keyword2}'" if keyword2 else "") + " not found.")

def get_EPS(dataset, period, quarter):
    # Use Diluted earnings per share from df_statement_of_ops

    eps = get_value_by_label(dataset, "Diluted earnings per share $", period)
    eps = float(eps)

    if quarter in pre_split_quarters:
        return eps/20
    else:
        return eps

def get_Revenue_Growth(dataset, period1, period2):
    # Use Total Net Sales from df_statement_of_ops

    curr_rev = get_value_by_label(dataset, "Total Net Sales", period2)
    past_rev = get_value_by_label(dataset, "Total Net Sales", period1)

    curr_rev = float(curr_rev)
    past_rev = float(past_rev)

    growth = (curr_rev - past_rev) / past_rev * 100

    return growth

def get_Gross_Margin(dataset, period):
    # Calculate gross profit. Use Total Net Sales - Operating expenses: Cost of sales

    curr_rev = get_value_by_label(dataset, "Total Net Sales", period)
    cost_of_sales = get_value_by_label(dataset, "Operating expenses: Cost of sales", period)
    diff = float(curr_rev) - float(cost_of_sales)

    diff = float(diff)
    curr_rev = float(curr_rev)

    gross_profit = (diff / curr_rev) * 100

    gross_margin = (gross_profit / curr_rev) * 100

    return gross_margin

def get_PE_Ratio(dataset, period, share_prices, quarter):

    share_price = share_prices.get(quarter)
    eps = get_EPS(dataset, period, quarter)

    share_price = float(share_price)

    pe = share_price / eps

    return pe

def get_ROE(dataset1, dataset2, period):
    # dataset1 should be statement of ops. Use Net income $ from df_statement_of_ops
    # dataset2 should be balance sheet. Use Total stockholders’ equity from df_balance_sheet
    # period should be "2021"
    
    net_income = get_value_by_label(dataset1, "Net Income $", period, "Net income (loss) $")
    total_shareholder_equity = get_value_by_label(dataset2, "Total stockholders’ equity", period)

    net_income = float(net_income)
    total_shareholder_equity = float(total_shareholder_equity)

    roe = (net_income / total_shareholder_equity) * 100

    return roe

def get_DE(dataset, period):
    # Use Total liabilities and stockholders’ equity $ - Total stockholders’ equity

    liabilities = get_value_by_label(dataset, "Total liabilities and stockholders’ equity $", period)

    total_shareholder_equity = get_value_by_label(dataset, "Total stockholders’ equity", period)

    total_liabilities = float(liabilities) - float(total_shareholder_equity)

    total_shareholder_equity = float(total_shareholder_equity)

    de = total_liabilities / total_shareholder_equity

    return de

def get_BS(dataset1, dataset2, period, quarter):
    # dataset1 should be balance sheet
    # detaset2 should be statement of operations
    # Use Diluted from df_statement_of_ops

    total_shareholder_equity = get_value_by_label(dataset1, "Total stockholders’ equity", period)
    weighted_average_shares = get_value_by_label(dataset2, "Diluted", period)

    total_shareholder_equity = float(total_shareholder_equity)
    weighted_average_shares = float(weighted_average_shares)

    if quarter in pre_split_quarters:
        return total_shareholder_equity / (weighted_average_shares * 20)
    else:
        return total_shareholder_equity / weighted_average_shares

def get_PB(dataset1, dataset2, period, share_prices, quarter):
    share_price = share_prices.get(quarter)
    bs = get_BS(dataset1, dataset2, period, quarter)

    pb = share_price / bs

    return pb

pre_split_quarters = ["Q1-2021", "Q2-2021", "Q3-2021", "Q4-2021", "Q1-2022"]

In [None]:
# Paths
folder_path = "/Users/tanyikchen/Library/CloudStorage/OneDrive-UniversityofNottinghamMalaysia/y3/DIA/Annual Reports/AMZN/"

output_path = "/Users/tanyikchen/Library/CloudStorage/OneDrive-UniversityofNottinghamMalaysia/y3/DIA/Datasets/AMZN_FA.csv"

# Filenames and date ranges
pdf_filenames = [
    "Q4-2021-Amazon-Earnings-Release.pdf",
    "Q1-2022-Amazon-Earnings-Release.pdf",
    "Q2-2022-Amazon-Earnings-Release.pdf",
    "Q3-2022-Amazon-Earnings-Release.pdf"
]

quarter_to_dates = {
    "Q4-2021": ("2021-10-01", "2021-12-31"),
    "Q1-2022": ("2022-01-01", "2022-03-31"),
    "Q2-2022": ("2022-04-01", "2022-06-30"),
    "Q3-2022": ("2022-07-01", "2022-09-30"),
}

# Get NYSE calendar
calendar_start = min(v[0] for v in quarter_to_dates.values())
calendar_end = max(v[1] for v in quarter_to_dates.values())
nyse = mcal.get_calendar("NYSE")
trading_days = nyse.schedule(start_date=calendar_start, end_date=calendar_end)
trading_dates = trading_days.index.normalize()

# Process pdfs
results = []

for filename in pdf_filenames:
    file_path = os.path.join(folder_path, filename)
    quarter = filename.split("-Amazon")[0]
    year = quarter.split("-")[1]
    last_year = str(int(year) - 1)

    # Extract all text from the pdf
    with pdfplumber.open(file_path) as pdf:
        full_text = "\n".join(
            f"--- Page {i+1} ---\n" + page.extract_text()
            for i, page in enumerate(pdf.pages)
        )

    # Define sections and extract data
    sections = {
        "Statement of Operations": ("Consolidated Statements of Operations", "Consolidated Statements of Comprehensive Income"),
        "Income": ("Consolidated Statements of Comprehensive Income", "Consolidated Balance Sheets"),
        "Balance Sheets": ("Consolidated Balance Sheets", "Supplemental Financial Information and Business Metrics"),
    }

    dfs = {}
    for section, (start, end) in sections.items():
        text = extract_section(full_text, start, end)
        df = text_to_dataframe(text)
        if df.shape[1] > 3:
            df = df.iloc[:, :3]
        dfs[section] = df

    # Calculate fundamental indicators
    indicators = {
        "quarter": quarter,
        "EPS": get_EPS(dfs["Statement of Operations"], year, quarter),
        "Revenue_Growth": get_Revenue_Growth(dfs["Statement of Operations"], last_year, year),
        "Gross_Margin": get_Gross_Margin(dfs["Statement of Operations"], year),
        "P/E": get_PE_Ratio(dfs["Statement of Operations"], year, closing_prices, quarter),
        "ROE": get_ROE(dfs["Income"], dfs["Balance Sheets"], year),
        "D/E": get_DE(dfs["Balance Sheets"], year),
        "B/S": get_BS(dfs["Balance Sheets"], dfs["Statement of Operations"], year, quarter),
        "P/B": get_PB(dfs["Balance Sheets"], dfs["Statement of Operations"], year, closing_prices, quarter),
    }

    results.append(indicators)

# Expand quarterly indicators to daily time series
expanded_rows = []
for entry in results:
    start, end = quarter_to_dates[entry["quarter"]]
    dates = trading_dates[(trading_dates >= pd.to_datetime(start)) & (trading_dates <= pd.to_datetime(end))]
    for date in dates:
        row = {"Date": date}
        row.update({k: v for k, v in entry.items() if k != "quarter"})
        expanded_rows.append(row)

# Create dataframe and export
fundamental_df = pd.DataFrame(expanded_rows).sort_values("Date").reset_index(drop=True)
fundamental_df.to_csv(output_path, index=False)

print(fundamental_df)

          Date     EPS  Revenue_Growth  Gross_Margin         P/E        ROE  \
0   2021-10-01  1.3875        9.443670      0.028904  120.158559  10.360592   
1   2021-10-04  1.3875        9.443670      0.028904  120.158559  10.360592   
2   2021-10-05  1.3875        9.443670      0.028904  120.158559  10.360592   
3   2021-10-06  1.3875        9.443670      0.028904  120.158559  10.360592   
4   2021-10-07  1.3875        9.443670      0.028904  120.158559  10.360592   
..         ...     ...             ...           ...         ...        ...   
247 2022-09-26  0.2800       14.699672      0.035181  403.571429   2.088894   
248 2022-09-27  0.2800       14.699672      0.035181  403.571429   2.088894   
249 2022-09-28  0.2800       14.699672      0.035181  403.571429   2.088894   
250 2022-09-29  0.2800       14.699672      0.035181  403.571429   2.088894   
251 2022-09-30  0.2800       14.699672      0.035181  403.571429   2.088894   

          D/E        B/S        P/B  
0    2.042056