In [15]:
# Azure Data Lake libraries
import azure_data_lake_interface as adl

# Helper function libraries
from helper_functions import load_config

In [16]:
# Connect to the data lake: read the connection settings, build the service
# client from the configured blob URL, then open the target container.
container_name = "consolidated"

config = load_config("config/datalake_config.json", flush_cache=True)
service_client = adl.get_azure_service_client(config["blob_url"])
file_system_client = adl.get_azure_file_system_client(
    service_client,
    container_name,
)

In [17]:
# Load the three inputs: the grouped monthly stock report, the item master,
# and the invoice line items (the latter supplies sales prices).
source_folder = "cleaned/netsuite"
filename = "grouped_monthly_stock_reports_2022-01-2025-03_cleaned.parquet"
grouped_items = adl.get_parquet_file_from_data_lake(
    file_system_client, source_folder, filename
)

# NOTE: source_folder is deliberately left pointing at the enhanced folder;
# the save cells further down write their output using this same variable.
source_folder = "enhanced/netsuite"
items, invoice_line_items = [
    adl.get_parquet_file_from_data_lake(file_system_client, source_folder, parquet_name)
    for parquet_name in ("item_enhanced.parquet", "CustInvcItemLineItems_enhanced.parquet")
]

In [18]:
# A few item master records share the same item_name. They are not pertinent
# to this analysis, and item_name has to be unique before it can be used to
# augment grouped_items, so only the first record per name is kept.
# duplicate_items holds every row involved in a name clash, for inspection.
shares_item_name = items.duplicated(subset='item_name', keep=False)
duplicate_items = items.loc[shares_item_name]

is_later_repeat = items.duplicated(subset='item_name', keep='first')
items = items.loc[~is_later_repeat]

In [19]:
# Add the six level_N_category columns and the manufacturer from the item
# master (items) to grouped_items.
# Rows are matched by looking up the 'Item' level of the grouped_items index
# against items.item_name.

# Build the lookup table and the key index once: both are identical for every
# column, so rebuilding them per column only repeated the same work.
item_attributes = items.set_index('item_name')
item_keys = grouped_items.index.get_level_values('Item')

for category in [f'level_{i}_category' for i in range(1, 7)]:
    # Items with no match in the item master come back as NaN.
    grouped_items[category] = item_keys.map(item_attributes[category])

grouped_items['manufacturer'] = item_keys.map(item_attributes['manufacturer'])

In [20]:
# Aggregate the invoice lines per (Month, item) to get the monthly quantity,
# sales total, invoice-line count and average unit sales price.
# Side effect: adds a 'Month' period column to invoice_line_items.
invoice_line_items['Month'] = invoice_line_items['created_date'].dt.to_period('M')
grouped_line_items = invoice_line_items.groupby(['Month', 'item_name']).agg(
    total_quantity=('quantity', 'sum'),
    total_sales=('total_amount', 'sum'),
    num_invoice_lines=('created_date', 'size')
).assign(
    # Flip the sign of the summed quantity.
    # NOTE(review): presumably invoice quantities are stored as negatives while
    # total_amount is already positive -- confirm against the source data.
    total_quantity=lambda df: df['total_quantity'] * -1,
    # assign() evaluates keyword arguments in order, so this sees the
    # sign-flipped quantity. A month whose quantity nets to zero has no defined
    # unit price: mask the zero so the result is NaN rather than +/-inf, which
    # would otherwise flow into the margin columns and survive any NaN-based
    # fill applied later.
    avg_unit_price=lambda df: df['total_sales'] / df['total_quantity'].where(df['total_quantity'] != 0)
).rename_axis(index={'item_name': 'Item'})

In [21]:
# Attach the monthly sales figures to the cost data. This is a left join on
# the shared index levels, so every grouped_items row is kept and rows with no
# matching sales get NaN in the new columns.
sales_columns = ['total_quantity', 'total_sales', 'num_invoice_lines', 'avg_unit_price']
grouped_items = grouped_items.join(grouped_line_items[sales_columns], how='left')

In [22]:
# Calculate the average unit cost, gross unit margin and gross margin
# percentage, each rounded to 2 decimals.
#
# Zero denominators are masked to NaN before dividing. A plain division would
# yield +/-inf for those rows, and inf is not treated as missing by NaN-based
# fills, so it would end up in the saved analysis.
output_qty = grouped_items['Total Output Quantity']
grouped_items['avg_unit_cost'] = (
    grouped_items['Value of Outputs'] / output_qty.where(output_qty != 0)
).round(2)

grouped_items['gross_unit_margin'] = (
    grouped_items['avg_unit_price'] - grouped_items['avg_unit_cost']
).round(2)

# Margin as a percentage of the sales price; undefined (NaN) when the price is 0.
unit_price = grouped_items['avg_unit_price']
grouped_items['gross_unit_margin_pct'] = (
    (grouped_items['gross_unit_margin'] / unit_price.where(unit_price != 0)) * 100
).round(2)

In [23]:
# Text columns: replace missing values with the placeholder 'Not Specified'.
string_cols = [
    'Description',
    *[f'level_{n}_category' for n in range(1, 7)],
    'manufacturer',
]
labelled = grouped_items[string_cols].fillna('Not Specified')
grouped_items[string_cols] = labelled

# Sales columns: a missing value means there were no matching invoice lines
# for that row, so it is recorded as 0.
zero_cols = ['total_quantity', 'total_sales', 'num_invoice_lines']
zeroed = grouped_items[zero_cols].fillna(0.0)
grouped_items[zero_cols] = zeroed

In [24]:
# Forward-fill missing price/cost/margin values on a per-item basis.
# Adjacent rows in the table can belong to different items, so a plain ffill
# over the whole frame would leak values from one item into another.
#
# How it works:
#   * groupby(level="Item") partitions the rows by the 'Item' index level.
#   * .ffill() within each partition carries that item's last non-NaN value
#     forward through its rows, in their existing order.
#     NOTE(review): this is chronological only if each item's rows are already
#     ordered by Month -- confirm.
#   * The result keeps the original index, so assigning it back to
#     grouped_items[fill_cols] lines every filled value up with its row.

fill_cols = ['avg_unit_price', 'avg_unit_cost', 'gross_unit_margin', 'gross_unit_margin_pct']

rows_by_item = grouped_items.groupby(level="Item")
filled = rows_by_item[fill_cols].ffill()

# Write the filled values back into the master frame.
grouped_items.loc[:, fill_cols] = filled

In [25]:
# Persist the full analysis to the data lake, keeping the (Month, Item)
# multi-index. The target folder is whatever source_folder currently holds.
adl.save_df_as_parquet_in_data_lake(
    grouped_items,
    file_system_client,
    source_folder,
    'unit_margin_analysis.parquet',
    preserve_index=True,
)

In [36]:
# Rank items by total sales across all months and keep the 100 largest.
# item_sales_totals ends up with one row per item: Item, total_quantity,
# total_sales and the first Description seen for that item.
item_sales_totals = (
    grouped_items
    .groupby('Item')
    .agg(
        total_quantity=('total_quantity', 'sum'),
        total_sales=('total_sales', 'sum'),
        Description=('Description', 'first'),
    )
    .nlargest(100, 'total_sales')
    .reset_index()
)

# Keep every monthly row of grouped_items that belongs to one of those items.
in_top_100 = grouped_items.index.get_level_values('Item').isin(item_sales_totals['Item'])
top_100_items = grouped_items[in_top_100]

In [38]:
# Export the per-item sales summary to a local Excel workbook, without the
# DataFrame's row index.
item_sales_totals.to_excel(
    'top100_items.xlsx',
    index=False,
)

In [42]:
# Persist the monthly rows for the top-selling items to the data lake, keeping
# the (Month, Item) multi-index. The target folder is whatever source_folder
# currently holds.
adl.save_df_as_parquet_in_data_lake(
    top_100_items,
    file_system_client,
    source_folder,
    'top_100_items.parquet',
    preserve_index=True,
)