In [1]:
# Azure Data Lake libraries
import azure_data_lake_interface as adl

# Data analysis libraries
import pandas as pd

# Helper function libraries
import helper_functions as hf

In [2]:
def compute_po_price_stats(df: pd.DataFrame, months: int = 12) -> pd.DataFrame:
    """Summarise recent purchase-order unit prices, one row per item.

    Args:
        df: Purchase-order line items. The columns read here are
            ``created_date``, ``item_name``, ``manufacturer``,
            ``description``, ``item_type`` and ``unit_price``.
        months: Look-back window handed to ``hf.get_cutoff_date``
            (presumably "today minus N months" -- confirm against the helper).

    Returns:
        A DataFrame with ``item_name``, the first ``manufacturer``,
        ``description`` and ``item_type`` seen for the item, plus
        ``po_highest_price``, ``po_average_price`` and their difference
        ``po_high_minus_avg``.
    """
    # Keep only the rows created on or after the cutoff
    earliest_allowed = hf.get_cutoff_date(months)
    is_recent = df["created_date"] >= earliest_allowed

    # Output column -> (source column, aggregation)
    aggregations = {
        "manufacturer": ("manufacturer", "first"),
        "description": ("description", "first"),
        "item_type": ("item_type", "first"),
        "po_highest_price": ("unit_price", "max"),
        "po_average_price": ("unit_price", "mean"),
    }
    per_item = df[is_recent].groupby("item_name").agg(**aggregations).reset_index()

    # Spread between the most expensive purchase and the typical purchase
    return per_item.assign(
        po_high_minus_avg=per_item["po_highest_price"] - per_item["po_average_price"]
    )

In [3]:
def compute_inv_price_stats(df: pd.DataFrame, months: int = 12) -> pd.DataFrame:
    """Summarise recent inventory unit prices, one row per item.

    Args:
        df: Inventory price rows. The columns read here are ``Item``,
            ``Description``, ``avg_unit_price``, ``Month`` and
            ``manufacturer``. The caller's frame is not modified.
        months: Look-back window handed to ``hf.get_cutoff_date``
            (presumably "today minus N months" -- confirm against the helper).

    Returns:
        A DataFrame with ``item_name``, the first ``manufacturer`` and
        ``description`` seen for the item, plus ``inv_highest_price``,
        ``inv_avg_price`` and their difference ``inv_high_minus_avg``.
        Items with no usable price in the window are omitted.
    """
    # Align the inventory column names with the purchase-order naming.
    # rename() returns a new frame, so the caller's data is left untouched.
    df = df.rename(columns={"Item": "item_name", "Description": "description", "avg_unit_price": "unit_price"})

    # Calculate the cutoff date for the look-back window
    cutoff_date = hf.get_cutoff_date(months)

    # Filter the DataFrame for inventory rows whose Month falls inside the window
    recent_df = df[df["Month"] >= cutoff_date]

    # Group by item_name and aggregate:
    # - Retain the first manufacturer and description values
    # - Compute the highest and average unit price for the group
    stats_df = recent_df.groupby("item_name").agg(
        manufacturer=("manufacturer", "first"),
        description=("description", "first"),
        inv_highest_price=("unit_price", "max"),
        inv_avg_price=("unit_price", "mean")
    ).reset_index()

    stats_df["inv_high_minus_avg"] = stats_df["inv_highest_price"] - stats_df["inv_avg_price"]

    # Only discard items that have no price information. A bare dropna() would
    # also throw away items whose manufacturer or description is missing even
    # though their prices are perfectly valid.
    price_columns = ["inv_highest_price", "inv_avg_price", "inv_high_minus_avg"]
    return stats_df.dropna(subset=price_columns)

In [4]:
# Connect to the data lake: read the connection settings, build the service
# client from the configured blob URL, then open the "consolidated" file system.
datalake_config_path = "config/datalake_config.json"
file_system_name = "consolidated"

config = hf.load_config(datalake_config_path, flush_cache=True)
service_client = adl.get_azure_service_client(config["blob_url"])
file_system_client = adl.get_azure_file_system_client(service_client, file_system_name)

In [5]:
# Load the two source tables from the same data lake folder
source_folder = "enhanced/netsuite"

# Purchase-order line items
filename = "PurchOrdItemLineItems_enhanced.parquet"
augmented_line_items = adl.get_parquet_file_from_data_lake(file_system_client, source_folder, filename)

# Inventory unit prices
filename = "unit_margin_analysis.parquet"
inventory_prices = (
    adl.get_parquet_file_from_data_lake(file_system_client, source_folder, filename)
    # move the index levels into regular columns
    .reset_index()
    # Month arrives as pd.Period; convert it to datetime so it can be compared with a cutoff date
    .assign(Month=lambda frame: frame['Month'].dt.to_timestamp())
)

In [6]:
# Per-item price statistics for purchase orders and for inventory
po_price_stats = compute_po_price_stats(augmented_line_items)
inv_price_stats = compute_inv_price_stats(inventory_prices)

# Attach only the inventory price columns to the purchase-order stats.
# A left join keeps every purchase-order item, even without inventory prices.
inv_columns_to_attach = ["item_name", "inv_highest_price", "inv_avg_price", "inv_high_minus_avg"]
combined_price_stats = po_price_stats.merge(
    inv_price_stats[inv_columns_to_attach],
    on="item_name",
    how="left",
)

# How far the most expensive purchase sits above the average inventory price
combined_price_stats = combined_price_stats.assign(
    po_highest_minus_inv_avg_price=(
        combined_price_stats["po_highest_price"] - combined_price_stats["inv_avg_price"]
    )
)

# Round the float columns of all three frames via the project helper
po_price_stats, inv_price_stats, combined_price_stats = (
    hf.round_float_columns(frame)
    for frame in (po_price_stats, inv_price_stats, combined_price_stats)
)

In [7]:
# -- get category level info for each item
category_columns = [f'level_{level}_category' for level in range(1, 7)]
unique_items = (
    inventory_prices[['Item'] + category_columns]
    .rename(columns={'Item': 'item_name'})
    # Keep exactly one row per item (the first seen). De-duplicating on the full
    # row would leave several rows for an item whose categories differ between
    # records, and the merges below would then multiply that item's rows.
    .drop_duplicates(subset='item_name')
)

# -- sum total_sales and total_quantity by item
# NOTE(review): these totals cover every month present in inventory_prices, not
# only the look-back window used for the price stats -- confirm this is intended.
sales_qty_by_item = inventory_prices.groupby('Item').agg(
    total_sales=('total_sales', 'sum'),
    total_quantity=('total_quantity', 'sum')
).reset_index()

sales_qty_by_item = hf.round_float_columns(sales_qty_by_item)

# -- join temp dataframes
unique_items = unique_items.merge(sales_qty_by_item.rename(columns={'Item': 'item_name'}), on='item_name', how='left')

# -- add info to price stats and reorder columns
# Drop any enrichment columns left over from a previous run of this cell first,
# so re-running it does not produce *_x / *_y duplicates and a KeyError below.
enrichment_columns = [col for col in unique_items.columns if col != 'item_name']
combined_price_stats = (
    combined_price_stats
    .drop(columns=enrichment_columns, errors='ignore')
    # many_to_one: fail loudly if unique_items ever stops being one row per item
    .merge(unique_items, on='item_name', how='left', validate='many_to_one')
)
combined_price_stats = combined_price_stats[['item_name', 'manufacturer', 'level_1_category', 'level_2_category',
                                             'level_3_category', 'level_4_category', 'level_5_category',
                                             'level_6_category',
                                             'description', 'item_type', 'po_highest_price', 'po_average_price',
                                             'po_high_minus_avg', 'inv_highest_price', 'inv_avg_price',
                                             'inv_high_minus_avg',
                                             'po_highest_minus_inv_avg_price', 'total_sales', 'total_quantity']]


In [9]:
# Strip the characters Excel treats as illegal from the description column
# (via the project helper), then export the result without the index column.
excel_output_path = "analyses/combined_price_stats.xlsx"
combined_price_stats = hf.clean_illegal_chars_in_column(combined_price_stats, 'description')
combined_price_stats.to_excel(excel_output_path, index=False)