In [1]:
%load_ext autoreload
%autoreload 2

### Import Libraries

In [2]:
# add path to custom python code for accessing data lake and working with dataframes
# The folder defaults to the original author's location; on another machine set the
# OVATION_SRC_DIR environment variable instead of editing this cell.
import os
import sys

CUSTOM_SRC_DIR = os.environ.get(
    "OVATION_SRC_DIR",
    '/Users/markbills/Library/CloudStorage/OneDrive-Transformativ,LLC/Clients/Ovation Holdings/src',
)
# Only append once, so re-running this cell does not pile up duplicate entries.
if CUSTOM_SRC_DIR not in sys.path:
    sys.path.append(CUSTOM_SRC_DIR)

# Azure Data Lake libraries
import azure_data_lake_interface as adl

# Helper function libraries
import helper_functions as hf

# Data analysis libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import weekly_margin_report as wmr

# display support
from IPython.display import display, Markdown

### Load Data

In [3]:
# attach to the data lake:
#   1. load the connection settings from the JSON config file
#      (flush_cache=True presumably forces a fresh read - confirm in helper_functions)
#   2. build a service client from the configured blob URL
#   3. open the "consolidated" file system; the data-loading cell reads through this client
config = hf.load_config("config/datalake_config.json", flush_cache=True)
service_client = adl.get_azure_service_client(config["blob_url"])
file_system_client = adl.get_azure_file_system_client(service_client, "consolidated")

In [4]:
# Identify the dataset to load: the transaction type and the last month it covers.
trans_type = "CustInvc"
end_date = "2025-05"

# The file name embeds the transaction type and the covered range (2022-01 .. end_date).
filename = "monthly_{}_margin_analysis_dataset_2022-01_{}.parquet".format(trans_type, end_date)

# Read the parquet file from the presentation/margin_analysis folder of the data lake.
combined_monthly_data = adl.get_parquet_file_from_data_lake(
    file_system_client,
    "presentation/margin_analysis",
    filename,
)

### Examine Data

In [None]:
# Render the text returned by wmr.describe_dataset as Markdown in the cell output.
Markdown(wmr.describe_dataset(combined_monthly_data))

In [None]:
# Preview the first rows of the loaded dataset.
combined_monthly_data.head()

### Explore Data

In [None]:
# Spot check: show the rows for one manufacturer restricted to one item type.
item_type = "Non-inventory Item"
manufacturer = "Yokogawa"
combined_monthly_data.loc[
    lambda rows: rows["manufacturer"].eq(manufacturer) & rows["item_type"].eq(item_type)
]

### Tweak Data

In [5]:
# Remove inactivated items (their item_name starts with "Inactivated").
# na=False treats a missing item_name as "not inactivated", so those rows are kept and
# the mask is strictly boolean. .copy() makes the result an independent frame, so the
# assignments below modify it directly instead of writing into a slice of the original.
combined_monthly_data = combined_monthly_data[
    ~combined_monthly_data['item_name'].str.startswith('Inactivated', na=False)
].copy()

# fix misspellings on important OEMs: canonical name -> spellings found in the data
mfg_name_map = {
    "Yokogawa": ["Yokogawa", "YOKOGAWA", "YOKOGAWAS"]
}
for correct_name, misspellings in mfg_name_map.items():
    combined_monthly_data.loc[combined_monthly_data['manufacturer'].isin(misspellings), 'manufacturer'] = correct_name

# fill in nulls with something more helpful: both the literal string "null" and
# genuinely missing values become "Not Specified"
combined_monthly_data["manufacturer"] = combined_monthly_data["manufacturer"].replace("null", "Not Specified").fillna("Not Specified")

### Analyze Data

In [6]:
# Keep a row when at least one of its two margin columns is positive, then average each
# margin column per (month, manufacturer) and round to two decimals.
# NOTE(review): a row passes on either column, so a non-positive value in the other
# column still enters that column's mean - confirm this is intended.
margin_mask = (
    combined_monthly_data["inventory_avg_margin_pct"].gt(0)
    | combined_monthly_data["commission_avg_margin_pct"].gt(0)
)

monthly_manufacturer_margin = (
    combined_monthly_data.loc[margin_mask]
    .reset_index()
    .groupby(['month', 'manufacturer'])
    .agg(
        inventory_avg_margin_pct=('inventory_avg_margin_pct', 'mean'),
        commission_avg_margin_pct=('commission_avg_margin_pct', 'mean'),
    )
    .round(2)
)

In [7]:
# Display the rounded monthly margin table, indexed by (month, manufacturer).
monthly_manufacturer_margin

Unnamed: 0_level_0,Unnamed: 1_level_0,inventory_avg_margin_pct,commission_avg_margin_pct
month,manufacturer,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01,A T Controls,40.00,
2022-01,Anderson Greenwood,32.50,
2022-01,Asahi America,41.76,
2022-01,Automation Service,15.02,
2022-01,Balon,19.74,
...,...,...,...
2025-04,Peaktronics,52.30,
2025-04,Reotemp,36.34,
2025-04,Sealweld,47.84,
2025-04,Sealweld Inc,40.00,


In [8]:
# Build one manufacturer-by-month grid per margin type. ffill(axis=1) carries the last
# available value forward across the month columns, so gaps repeat the previous month.
(
    monthly_manufacturer_inventory_margin_pivot,
    monthly_manufacturer_commission_margin_pivot,
) = (
    monthly_manufacturer_margin.reset_index()
    .pivot(index='manufacturer', columns='month', values=margin_column)
    .ffill(axis=1)
    for margin_column in ('inventory_avg_margin_pct', 'commission_avg_margin_pct')
)

In [9]:
# Run the same cumulative-sales calculation twice: once on the inventory sales column
# and once on the commission sales column, each with its own result column name.
cum_inventory_sales_by_item, cum_commission_sales_by_item = (
    wmr.calculate_cumulative_sales(
        combined_monthly_data,
        sales_column=monthly_sales_column,
        result_column_name=cumulative_column,
    )
    for monthly_sales_column, cumulative_column in (
        ('monthly_inventory_sales', 'cumulative_inventory_sales'),
        ('monthly_commission_sales', 'cumulative_commission_sales'),
    )
)

In [10]:
# Keep the first n rows of each cumulative-sales table after restricting it to the
# matching item type (presumably the tables arrive sorted by cumulative sales - confirm
# in wmr.calculate_cumulative_sales).
n = 200
top_inventory_items = cum_inventory_sales_by_item.loc[
    cum_inventory_sales_by_item["item_type"].eq("Inventory Item")
].iloc[:n]
top_commission_items = cum_commission_sales_by_item.loc[
    cum_commission_sales_by_item["item_type"].eq("Non-inventory Item")
].iloc[:n]

In [11]:
# Write both top-item tables to Excel workbooks in the data_tables folder.
for _export_frame, _export_path in (
    (top_inventory_items, 'data_tables/top_inventory_items_by_cumulative_sales_Jan2022_to_Present.xlsx'),
    (top_commission_items, 'data_tables/top_commission_items_by_cumulative_sales_Jan2022_to_Present.xlsx'),
):
    _export_frame.to_excel(_export_path)

In [12]:
# Alphabetical list of the distinct manufacturers behind each set of top items.
inventory_manufacturers = top_inventory_items["manufacturer"].unique().tolist()
inventory_manufacturers.sort()

commission_manufacturers = top_commission_items["manufacturer"].unique().tolist()
commission_manufacturers.sort()

In [13]:
# Reporting window (inclusive), expressed as monthly periods.
start_month = "2024-04"
end_month = "2025-04"
report_months = pd.period_range(start=start_month, end=end_month, freq='M')

# Margin grid for the manufacturers of the top inventory items, limited to the window.
top_mfr_inventory_margin_data = monthly_manufacturer_inventory_margin_pivot.loc[
    inventory_manufacturers, report_months]

# Same selection for commission items, taken from the commission pivot. The table cell
# reads this name when data_set is "commission". reindex is used so that a manufacturer
# or month missing from the pivot becomes a row/column of NaN instead of a KeyError.
top_mfr_commission_margin_data = monthly_manufacturer_commission_margin_pivot.reindex(
    index=commission_manufacturers, columns=report_months)

In [34]:
def clear_directory_files(directory_path: str, skip_files: list = None) -> None:
    """Delete the files directly inside a directory, keeping selected ones.

    Only regular files at the top level of ``directory_path`` are removed;
    sub-directories and their contents are left alone.

    Args:
        directory_path: Directory whose files should be removed. If it does not
            exist there is nothing to clear and the function returns quietly.
        skip_files: File names (not paths) to keep. ``None`` keeps nothing.
    """
    # Imported here so the function works no matter which notebook cell imports pathlib.
    from pathlib import Path

    target_dir = Path(directory_path)
    if not target_dir.is_dir():
        return

    # A set gives constant-time membership tests whatever iterable was passed in.
    names_to_keep = set(skip_files or ())

    for item in target_dir.iterdir():
        # Remove plain files only, and only when they are not on the keep list.
        if item.is_file() and item.name not in names_to_keep:
            item.unlink()

In [37]:
import os

# select dataset ("inventory" or "commission")
data_set = "inventory"
df = top_mfr_inventory_margin_data if data_set == "inventory" else top_mfr_commission_margin_data

# make sure the output folder exists, then remove old files from it
# (the table PDF itself is skipped; it is overwritten at the end of this cell)
target_directory = f"pdfs/{data_set}"
table_pdf_name = f'top_mfr_{data_set}_margin_data.pdf'
os.makedirs(target_directory, exist_ok=True)
clear_directory_files(target_directory, [table_pdf_name])

# move the 'manufacturer' index into a regular column; the month columns are unchanged
df_flat = df.reset_index()

# reorder so manufacturer is the first column
cols = ['manufacturer'] + [c for c in df_flat.columns if c != 'manufacturer']
df_flat = df_flat[cols]

# colour every month cell by its change versus the previous month:
# green = higher, red = lower, white = unchanged or not comparable (NaN)
green, red = '#d9ead3', '#f4cccc'
n_rows, n_cols = df_flat.shape
cell_colors = [['white'] * n_cols for _ in range(n_rows)]

# column 0 is the manufacturer and column 1 is the first month (nothing to compare to),
# so the comparison starts at column 2
for col in range(2, n_cols):
    prev = df_flat.iloc[:, col - 1].values
    curr = df_flat.iloc[:, col].values
    for i, (p, c) in enumerate(zip(prev, curr)):
        if c > p:
            cell_colors[i][col] = green
        elif c < p:
            cell_colors[i][col] = red

# draw the table on an otherwise empty axes
fig, ax = plt.subplots(figsize=(16, 8), layout='constrained')
ax.axis('off')
table = ax.table(
    cellText=df_flat.values,
    colLabels=df_flat.columns,
    cellColours=cell_colors,
    cellLoc='center',
    loc='center'
)

# bold the header row (row 0) and the manufacturer column (column 0), using the
# public cell mapping rather than the private table._cells attribute
for (row, col), cell in table.get_celld().items():
    if row == 0 or col == 0:
        cell.get_text().set_fontweight('bold')

# Add title
ax.set_title(f"Monthly Manufacturer Margin Data for {data_set.title()} Items", pad=20, size=14, weight='bold')

fig.savefig(f'{target_directory}/{table_pdf_name}',
            orientation='landscape',
            bbox_inches='tight')
plt.close(fig)

In [None]:
# Pick the manufacturer list, margin column and output folder for the selected data set.
if data_set == "inventory":
    manufacturers = inventory_manufacturers
    margin_col = "inventory_avg_margin_pct"
    save_path = "pdfs/inventory"
else:
    manufacturers = commission_manufacturers
    margin_col = "commission_avg_margin_pct"
    save_path = "pdfs/commission"

# One chart PDF per manufacturer. The charts start at start_month (the same window
# start used for the margin table) so the two cannot drift apart.
for mfr in manufacturers:
    wmr.plot_manufacturer_margins(
        monthly_manufacturer_margin,
        margin_col,
        mfr,
        start_date=start_month,
        save_as_pdf=True,
        pdf_path=save_path,
    )

In [31]:
import os
from pathlib import Path
from typing import Optional

from pydantic import BaseModel, Field
from PyPDF2 import PdfMerger


class PDFMergeConfig(BaseModel):
    """Settings for building the combined margin-report PDF.

    ``create_cover_page`` reads ``pdf_dir``, ``cover_title``, ``cover_subtitle``
    and ``data_source``; ``merge_pdfs`` reads ``pdf_dir``, ``output_pdf`` and
    ``data_table_pdf``. Fields declared with ``Field(...)`` are required;
    ``cover_subtitle`` defaults to ``None`` and ``data_source`` to ``"Invoice"``.
    """
    # Folder holding the individual PDFs; the cover page is also written here.
    pdf_dir: Path = Field(..., description="Directory containing single-page PDFs")
    # Destination of the merged report.
    output_pdf: Path = Field(..., description="Path for the combined output PDF")
    # Cover text: the title is always drawn, the subtitle only when provided.
    cover_title: str = Field(..., description="Title text on the cover page")
    cover_subtitle: Optional[str] = Field(None, description="Optional subtitle on the cover page")
    # Inserted into the "based on ... data" note at the bottom of the cover page.
    data_source: str = Field("Invoice", description="Data source of the displayed data")
    # Table PDF that merge_pdfs appends after all other pages.
    data_table_pdf: Path = Field(..., description="Path for the data table PDF")

In [32]:
from pathlib import Path
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.lib.utils import simpleSplit

def _draw_centred_lines(pdf, lines, x, y, leading):
    """Draw each string in ``lines`` centred on ``x``, starting at baseline ``y``.

    Every following line is placed ``leading`` points lower. Returns the
    baseline at which one more line would be drawn.
    """
    for text in lines:
        pdf.drawCentredString(x, y, text)
        y -= leading
    return y


def create_cover_page(cfg: PDFMergeConfig) -> Path:
    """Generate a single-page, letter-size cover PDF inside ``cfg.pdf_dir``.

    The page shows the wrapped title above the middle of the page, the
    optional subtitle below it, and a data-source note near the bottom edge.

    Args:
        cfg: Supplies ``pdf_dir`` (created if missing), ``cover_title``,
            ``cover_subtitle`` and ``data_source``.

    Returns:
        Path of the written file, ``cfg.pdf_dir / "cover_page.pdf"``.
    """
    cfg.pdf_dir.mkdir(parents=True, exist_ok=True)
    cover_path = cfg.pdf_dir / "cover_page.pdf"

    c = canvas.Canvas(str(cover_path), pagesize=letter)
    width, height = letter
    margin = 72  # 1" margin
    max_text_width = width - margin * 2
    centre_x = width / 2

    # Title: wrapped to the printable width; each extra line lifts the first
    # baseline by 14 pt, and lines are spaced 32 pt apart (font size + 4).
    title_lines = simpleSplit(cfg.cover_title, "Helvetica-Bold", 28, max_text_width)
    c.setFont("Helvetica-Bold", 28)
    title_top = height / 2 + 50 + (len(title_lines) - 1) * 14
    below_title = _draw_centred_lines(c, title_lines, centre_x, title_top, 32)

    # Draw subtitle if provided
    if cfg.cover_subtitle:
        c.setFont("Helvetica", 14)
        subtitle_lines = simpleSplit(cfg.cover_subtitle, "Helvetica", 14, max_text_width)
        # Normally the subtitle starts at mid-page. A title of three or more lines
        # reaches below that point, so start under the title instead of overprinting it.
        y_sub = min(height / 2, below_title)
        _draw_centred_lines(c, subtitle_lines, centre_x, y_sub, 18)

    # add data source
    c.setFont("Helvetica-Oblique", 10)
    source = (
        f"The charts and tables contained in this report "
        f"are based on {cfg.data_source} data."
    )
    source_lines = simpleSplit(source, "Helvetica-Oblique", 10, max_text_width)
    y_src = 50 + (len(source_lines) - 1) * 12
    _draw_centred_lines(c, source_lines, centre_x, y_src, 14)

    c.save()
    return cover_path


In [33]:
def merge_pdfs(cfg: PDFMergeConfig, cover_pdf: Path) -> None:
    """Merge the cover PDF and all other PDFs in the directory into one output PDF.

    Page order: the cover, then every ``*.pdf`` in ``cfg.pdf_dir`` in sorted
    order (excluding the cover and the data table), then the data table.

    Args:
        cfg: PDFMergeConfig with pdf_dir, output_pdf and data_table_pdf.
        cover_pdf: Path to the cover page PDF created by create_cover_page.
    """
    # Remove the previous combined PDF first; it may sit in pdf_dir and would
    # otherwise be picked up by the glob below and merged into its successor.
    try:
        cfg.output_pdf.unlink()
    except FileNotFoundError:
        pass

    # Compare resolved paths so a file is recognised even when it is spelled
    # differently (for example relative versus absolute).
    placed_explicitly = {Path(p).resolve() for p in (cover_pdf, cfg.data_table_pdf)}

    merger = PdfMerger()
    try:
        # Append cover first
        merger.append(str(cover_pdf))

        # Append each remaining PDF in sorted order
        for pdf_file in sorted(cfg.pdf_dir.glob("*.pdf")):
            if pdf_file.resolve() in placed_explicitly:
                continue
            merger.append(str(pdf_file))

        # Append data table at end
        merger.append(str(cfg.data_table_pdf))

        merger.write(str(cfg.output_pdf))
    finally:
        # Close the merger even when an append or the write fails.
        merger.close()

In [39]:
import datetime

# All inputs and the combined output live in the folder of the selected data set.
report_dir = Path(f"pdfs/{data_set}")

# Named pdf_merge_config rather than `config`, so the data lake settings loaded at the
# top of the notebook are not overwritten by an object of a different type.
pdf_merge_config = PDFMergeConfig(
    pdf_dir=report_dir,
    output_pdf=report_dir / f"TTM_Manufacturer_Margin_Trends_From_{trans_type}.pdf",
    cover_title=f"Monthly Manufacturer Margin Report for {data_set.title()} Items",
    cover_subtitle=f"Generated {datetime.date.today().strftime('%B %d, %Y')}",
    data_source="Invoice",
    data_table_pdf=report_dir / f"top_mfr_{data_set}_margin_data.pdf",
)

# Write the cover page, then merge cover + charts + data table into one report.
cover_pdf = create_cover_page(pdf_merge_config)
merge_pdfs(pdf_merge_config, cover_pdf)
print(f"Combined PDF saved to: {pdf_merge_config.output_pdf}")

Combined PDF saved to: pdfs/inventory/TTM_Manufacturer_Margin_Trends_From_CustInvc.pdf


In [None]:
part = '317854'
# Filter data for the specific item
item_data = combined_monthly_data.reset_index()
item_data = item_data[item_data['item_name'] == part]

# Drop rows without a margin value; they can be neither plotted nor fitted.
# NOTE(review): other cells use 'inventory_avg_margin_pct' / 'commission_avg_margin_pct';
# confirm the dataset really has an 'avg_margin_pct' column.
item_data = item_data.dropna(subset=['avg_margin_pct'])

if item_data.empty:
    print("Warning: The data is empty. Please ensure that the data source provides valid entries.")
else:
    months = item_data['month'].astype(str)
    x = np.arange(len(item_data))
    y = item_data['avg_margin_pct'].values

    # Create the line plot
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.plot(months, item_data['avg_margin_pct'], marker='o', label='Actual')

    # A straight-line fit needs at least two points; with a single month only the
    # actual value is shown.
    if len(item_data) >= 2:
        z = np.polyfit(x, y, 1)  # degree-1 polynomial: slope and intercept
        p = np.poly1d(z)
        ax.plot(months, p(x), "r--", label='Trend')

    ax.set_title(f'Monthly Average Margin % for {part}')
    ax.set_xlabel('Month')
    ax.set_ylabel('Average Margin %')
    ax.set_ylim(bottom=0)  # Set y-axis to start at 0
    plt.xticks(rotation=45)
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    plt.show()