In [1]:
# Typing libraries
from typing import Dict, Tuple, List, Literal

# Azure Data Lake libraries
import azure_data_lake_interface as adl

# Data analysis libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Data visualization libraries
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Helper function libraries
from helper_functions import load_config

In [2]:
# Connect to the data lake: load the connection settings, build the service
# client from the configured blob URL, then open the "consolidated" file system.
config = load_config("config/datalake_config.json", flush_cache=True)
blob_url = config["blob_url"]
service_client = adl.get_azure_service_client(blob_url)
file_system_client = adl.get_azure_file_system_client(service_client, "consolidated")

In [3]:
# Raw NetSuite extracts: sales-order headers and their item lines.
netsuite_raw_dir = "raw/netsuite"
sales_orders = adl.get_parquet_file_from_data_lake(
    file_system_client, netsuite_raw_dir, "transaction/SalesOrd_raw.parquet"
)
sales_order_line_items = adl.get_parquet_file_from_data_lake(
    file_system_client, netsuite_raw_dir, "transaction/SalesOrdItemLineItems_raw.parquet"
)

In [4]:
# Load the customer master data.
# customer_id is read as text up front: letting read_csv infer a numeric dtype
# and casting afterwards turns ids into "123.0" whenever the column has gaps
# (float inference) and strips any leading zeros, so they would never match
# the line-item ids.
customers = pd.read_csv("data/active_customers.csv", dtype={"customer_id": str})

# Missing ids become the literal string "nan" (same as before), so they can
# never pair up with a NaN key in later isin()/merge() calls.
customers["customer_id"] = customers["customer_id"].astype(str)

In [5]:
# Sales-order headers: parse the creation timestamp and the net amount.
# Unparseable values become NaT / NaN instead of raising.
sales_orders["created_date"] = pd.to_datetime(sales_orders["created_date"], errors="coerce")
sales_orders["net_amount"] = pd.to_numeric(sales_orders["net_amount"], errors="coerce")

# Line items: every quantity / price / cost style column is coerced to a number.
numerics = ["quantity", "unit_price", "est_gross_profit", "est_extended_cost", "est_gross_profit_percent", "labor_hours"]
sales_order_line_items[numerics] = sales_order_line_items[numerics].apply(pd.to_numeric, errors="coerce")

In [6]:
# Bring the order-level attributes onto every line item (left join on the order number).
order_columns = ["tranid", "created_date", "location", "commission_only"]
sales_order_line_items = pd.merge(
    sales_order_line_items,
    sales_orders[order_columns],
    how="left",
    on="tranid",
)

In [7]:
# Line items whose created_date is NaT (no matching sales order after the left
# join, or an unparseable date). These must be captured BEFORE the cutoff
# filter below: a `>=` comparison against NaT is False, so filtering first
# would always leave this frame empty.
line_items_with_missing_dates = sales_order_line_items[sales_order_line_items["created_date"].isna()].copy()

# Keep only rows created on or after Jan 1, 2022.
# .copy() makes the results independent frames, so adding columns later does
# not operate on a slice of the original.
cutoff_date = pd.Timestamp("2022-01-01")
sales_orders = sales_orders[sales_orders["created_date"] >= cutoff_date].copy()

# Drop line items before the cutoff and those with NaT created_date
# (assumed to have no sales order to match).
has_created_date = sales_order_line_items["created_date"].notna()
on_or_after_cutoff = sales_order_line_items["created_date"] >= cutoff_date
sales_order_line_items = sales_order_line_items[has_created_date & on_or_after_cutoff].copy()

In [8]:
sales_orders["net_amount"].sum()

np.float64(694757515.6300001)

In [9]:
# Line value = quantity x unit price, with the sign flipped by the -1 factor.
line_value = -1 * sales_order_line_items["quantity"] * sales_order_line_items["unit_price"]
sales_order_line_items["total_amount"] = line_value
sales_order_line_items["total_amount"].sum()

np.float64(659866908.8790114)

In [10]:
# Distinct order numbers on the header side and on the line-item side.
sales_orders_ids = sales_orders["tranid"].unique()
line_items_ids = sales_order_line_items["tranid"].unique()

# Sales orders for which no line item exists.
has_line_items = sales_orders["tranid"].isin(line_items_ids)
missing_line_items = sales_orders[~has_line_items]
missing_line_items

Unnamed: 0,links,actual_ship_date,ai_order_type,amount_paid,amount_unpaid,billing_address,close_date,commission_only,company_email,company_name,...,ship_date,shipping_address,start_date,status,total_cost_estimate,tran_date,tranid,transaction_number,type,voided
24921,[],1/4/2022,,0,0,"SDI, Inc.\r\n1414 Radcliffe Street\r\nSuite 30...",4/5/2022,F,,SDI c/o Cascades,...,1/18/2022,SDI c/o Cascades\r\n285 MIDFIELD RD\r\nBARNWEL...,1/1/1800,Sales Order : Closed,0,1/4/2022,288623,288626,SalesOrd,F
24924,[],1/13/2022,,0,0,Mertec Engineering\r\n1232 Monte Vista Ave\r\n...,1/13/2022,F,invoices@mertec.net,Mertec Engineering,...,1/18/2022,Mertec Engineering\r\n1232 Monte Vista Ave\r\n...,1/1/1800,Sales Order : Billed,0,1/4/2022,288633,288636,SalesOrd,F
24925,[],1/10/2022,,0,0,MEC-TRIC CONTROL COMPANY\r\nP.O. BOX 221918\r\...,1/10/2022,F,payables@mec-tric.com,Mec-Tric Controls Company,...,1/18/2022,Mec-Tric Control Company\r\n4110 Monroe Road\r...,1/1/1800,Sales Order : Billed,0,1/4/2022,288627,288630,SalesOrd,F
24931,[],1/1/1800,,0,0,KIMBERLY CLARK - MARINETTE\nPO BOX 59099\nKNOX...,1/10/2023,F,,Kimberly Clark - Marinette,...,1/4/2022,KIMBERLY CLARK - MARINETTE\n3120 Riverside Ave...,1/1/1800,Sales Order : Closed,0,1/4/2022,288630,288633,SalesOrd,F
24934,[],1/1/1800,,0,0,MADISON GAS & ELECTRIC CO\r\nPO BOX 1231\r\nMA...,1/16/2023,F,invoices@mge.com,Madison Gas & Electric,...,1/4/2022,MADISON GAS & ELECTRIC CO\r\n120 S. BALDWIN ST...,1/1/1800,Sales Order : Closed,0,1/4/2022,288636,288639,SalesOrd,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50653,[],11/3/2022,,0,0,Trilogy Industrial Sales\r\nPO Box 4182\r\nLak...,11/3/2022,F,AP@trilogyindustrial.com,Trilogy Industrial Sales,...,11/3/2022,"Trilogy Industrial Sales, LLC\r\n2885 Hwy 14 E...",1/1/1800,Sales Order : Billed,0,10/20/2022,314615,314618,SalesOrd,F
50654,[],11/3/2022,,0,0,M.G. Waldbaum - Wakefield\r\n105 North Main\r\...,11/3/2022,F,vendor.invoices@michaelfoods.com,Michael Foods - Wakefield,...,11/3/2022,MGW Wakefield Main Plant\r\n105 North Main Str...,1/1/1800,Sales Order : Billed,0,10/20/2022,314608,314611,SalesOrd,F
50655,[],1/12/2023,,0,0,Protech Sales USA Corp.\r\n12340 Mead Way\r\nL...,1/12/2023,F,ap@ptsusa.co,ProTech Sales USA Corp - Littleton,...,1/12/2023,WINSUPPLY RIVERTON WY CO\r\n390 WEBBWOOD RD\r\...,1/1/1800,Sales Order : Billed,0,10/20/2022,314597,314600,SalesOrd,F
50656,[],11/2/2022,,0,0,Clearwater Paper Corporation\r\nATTN: Accounts...,11/2/2022,F,apinvoice@clearwaterpaper.com,Clearwater Paper - Lewiston,...,11/3/2022,Clearwater Paper Corporation\r\n803 Mill Road\...,1/1/1800,Sales Order : Billed,0,10/20/2022,314606,314609,SalesOrd,F


In [11]:
len(missing_line_items), missing_line_items["net_amount"].sum()

(4833, np.float64(21877628.64))

In [12]:
cust_ids = customers["customer_id"].unique()
line_items_ids = sales_order_line_items["customer_id"].unique()

# Customer ids that appear on line items but not in the customer master.
# Membership is tested against a set: `x not in ndarray` rescans the whole
# array for every id, which is quadratic in the number of ids.
known_customer_ids = set(cust_ids)
line_items_ids_not_in_cust_ids = [
    line_item_id for line_item_id in line_items_ids if line_item_id not in known_customer_ids
]
len(line_items_ids_not_in_cust_ids)

36

In [13]:
# Line items whose customer_id has no entry in the customers table.
is_known_customer = sales_order_line_items["customer_id"].isin(customers["customer_id"])
missing_customers = sales_order_line_items.loc[~is_known_customer]
len(missing_customers)

914

In [14]:
# Enrich each line item with its customer's descriptive attributes (left join on customer_id).
customer_columns = ["customer_id", "company_name", "subsidiary_name", "end_market", "sales_rep"]
augmented_line_items = pd.merge(
    sales_order_line_items,
    customers[customer_columns],
    how="left",
    on="customer_id",
)

In [17]:
# Line items that have no subsidiary_name.
# NOTE(review): this cell is positioned above the location -> subsidiary
# mapping cell but carries a later execution count, so on a top-to-bottom run
# it inspects the customer-derived subsidiary_name rather than the
# location-derived one - confirm which check is intended.
lacks_subsidiary = augmented_line_items["subsidiary_name"].isna()
missing_subsidiary = augmented_line_items.loc[lacks_subsidiary]
len(missing_subsidiary)

0

In [16]:
# Location -> subsidiary lookup, written once per subsidiary and then inverted
# into a flat {location: subsidiary} dictionary.
locations_by_subsidiary = {
    'Allied Valve': [
        'Riverdale Shop', 'Joliet-AV Shop', 'North Dakota', 'Portage', 'Cannon Falls',
        'Appleton', 'Norwich', 'Hastings', 'Monroe', 'New Berlin',
    ],
    'Valve Sales Inc.': [
        'Tulsa', 'Casper', 'Carlsbad', 'Kent', 'Lytle',
    ],
    'Allied Instrumentation': [
        'Highland', 'Detroit', 'Kansas City', 'Fenton', 'St. Louis', 'Peoria',
        'Indianapolis', 'New England-Instrumentation', 'Oak Park', 'Sutton',
    ],
}
locations_subsidiary_map = {
    location: subsidiary
    for subsidiary, locations in locations_by_subsidiary.items()
    for location in locations
}

# Overwrite subsidiary_name with the subsidiary implied by each row's location.
# Locations missing from the map yield NaN.
augmented_line_items["subsidiary_name"] = augmented_line_items["location"].map(locations_subsidiary_map)

In [18]:
# Item types that do not represent a product or service are excluded.
drop_list = ["Description", "Markup", "Item Group", "Other Charge", "Payment", "Discount"]
is_product_or_service = ~augmented_line_items["item_type"].isin(drop_list)
augmented_line_items = augmented_line_items.loc[is_product_or_service]

In [19]:
augmented_line_items['item_type'].value_counts()

item_type
Inventory Item                262262
Service                        92560
Non-inventory Item             45069
Assembly/Bill of Materials      3540
Kit/Package                        7
Name: count, dtype: int64

In [20]:
# Coalesce the two manufacturer columns: each one fills its own gaps (NaN) from the other.
manufacturer_before = augmented_line_items["manufacturer"]
custom_before = augmented_line_items["custom_manufacturer"]
augmented_line_items["custom_manufacturer"] = custom_before.fillna(manufacturer_before)
augmented_line_items["manufacturer"] = manufacturer_before.fillna(augmented_line_items["custom_manufacturer"])

In [21]:
# Label rows without a usable manufacturer as "Not Specified".
# The literal string "null" is not NaN, so the fillna-based resolution never
# replaces it with a real custom_manufacturer value. Treat "null" as missing
# in both columns, prefer whichever column holds a real name, and only then
# fall back to "Not Specified" (this also covers genuine NaN).
# NOTE(review): assumes a real custom_manufacturer should win over a "null"
# manufacturer - confirm against the source data.
manufacturer_clean = augmented_line_items["manufacturer"].replace("null", np.nan)
custom_clean = augmented_line_items["custom_manufacturer"].replace("null", np.nan)
augmented_line_items["manufacturer"] = manufacturer_clean.fillna(custom_clean).fillna("Not Specified")

In [22]:
augmented_line_items["manufacturer"].value_counts()

manufacturer
Not Specified               297896
Consolidated                 37053
Kunkle                       19884
Peaktronics                   5213
Fisher                        4971
                             ...  
HAM-LET                          1
Aftermarket                      1
Cyrus Shank                      1
Panametrics Gas FlowComp         1
MID-WEST INSTRUMENT              1
Name: count, Length: 448, dtype: int64

In [23]:
# Columns removed from the line-item frame before further processing.
dropped_cols = [
    'links', 'assembly_component', 'cost_estimate_type', 'created_from',
    'custom_manufacturer', 'est_gross_profit', 'est_gross_profit_percent',
    'handling_cost', 'id', 'item_base_price', 'line_number', 'mainline',
    'purchased_price', 'special_order', 'tax_line', 'transaction_table_id',
    'last_purchase_price',
]
augmented_line_items = augmented_line_items.drop(dropped_cols, axis=1)

In [24]:
# Per-line revenue, cost, profit and margin. Quantities are stored as negative
# numbers, so they are sign-flipped once and reused.
# NOTE(review): total_cost multiplies est_extended_cost by quantity; if that
# column is already a line-level (extended) cost this double-counts quantity - confirm.
negated_quantity = -1 * augmented_line_items["quantity"]
augmented_line_items["total_amount"] = negated_quantity * augmented_line_items["unit_price"]
augmented_line_items["total_cost"] = negated_quantity * augmented_line_items["est_extended_cost"]

line_profit = augmented_line_items["total_amount"] - augmented_line_items["total_cost"]
augmented_line_items["gross_profit"] = line_profit
augmented_line_items["gross_profit_percent"] = line_profit / augmented_line_items["total_amount"]

In [25]:
augmented_line_items.dtypes

customer_id                          object
description                          object
display_name                         object
est_extended_cost                   float64
item_name                            object
item_type                            object
labor_hours                         float64
level_1_category                     object
level_2_category                     object
level_3_category                     object
manufacturer                         object
quantity                            float64
sku                                  object
tranid                               object
unit_price                          float64
valve_spec_size                      object
vendor_commission_percent            object
created_date                 datetime64[ns]
location                             object
commission_only                      object
total_amount                        float64
company_name                         object
subsidiary_name                 

In [26]:
# Persist the prepared line items to CSV (row index omitted).
augmented_line_items.to_csv("data/augmented_line_items.csv", index=False)

In [27]:
# Reload the prepared line items from CSV.
# A CSV round-trip re-infers every dtype, so the identifier columns that were
# text (object) before the save are pinned to str here; otherwise numeric-looking
# ids/SKUs would come back as numbers (losing leading zeros, or ending up
# with mixed types inside one column).
augmented_line_items = pd.read_csv(
    "data/augmented_line_items.csv",
    dtype={"customer_id": str, "sku": str, "tranid": str},
)

In [None]:
# created_date comes back from the CSV as text; parse it to datetime (unparseable values become NaT).
augmented_line_items["created_date"] = pd.to_datetime(augmented_line_items["created_date"], errors="coerce")

In [None]:
# NOTE(review): calculate_total_booked_sales_by_customer and
# get_top_customers_by_subsidiary are neither defined nor imported anywhere in
# the visible notebook (only load_config is imported from helper_functions).
# On a fresh kernel these calls raise NameError unless the functions come from
# a cell or module not shown here - confirm and add the import/definition.
total_booked_sales_by_customer = calculate_total_booked_sales_by_customer(augmented_line_items)
top_customers_by_subsidiary = get_top_customers_by_subsidiary(total_booked_sales_by_customer)

In [None]:
# Drop the last two rows of every subsidiary's group, except "Automation Service",
# which is kept whole. Groups are processed explicitly instead of through
# groupby().apply(), whose lambda relied on reading the grouping column inside
# each group (deprecated behaviour in recent pandas).
trimmed_groups = [
    group if name == "Automation Service" else group.iloc[:-2]
    for name, group in top_customers_by_subsidiary.groupby("subsidiary_name")
]
top_customers_by_subsidiary = (
    pd.concat(trimmed_groups, ignore_index=True)
    if trimmed_groups
    else top_customers_by_subsidiary.iloc[0:0]
)

# Drop rows whose company name contains "Valve Sales, Inc." or "Allied Valve".
# The filter is applied to every subsidiary, not only to Automation Service.
# The "." is escaped so it matches a literal dot, and na=False keeps rows with
# a missing company name instead of failing on the boolean inversion.
is_excluded_name = top_customers_by_subsidiary["company_name"].str.contains(
    r"Valve Sales, Inc\.|Allied Valve", regex=True, na=False
)
top_customers_by_subsidiary = top_customers_by_subsidiary[~is_excluded_name]

In [None]:
top_customers_by_subsidiary

In [None]:
# Build {subsidiary_name: [(customer_id, company_name), ...]} from the top-customer table.
subsidiary_customers = {
    subsidiary_name: list(zip(group["customer_id"], group["company_name"]))
    for subsidiary_name, group in top_customers_by_subsidiary.groupby("subsidiary_name")[["customer_id", "company_name"]]
}

In [None]:
subsidiary_customers

In [None]:
# Per-customer booked sales computed with by_month=True, once with
# include='Products' and once with include='Services'.
# NOTE(review): the helper is not defined in the visible notebook, so the exact
# output columns cannot be confirmed from here.
monthly_booked_product_sales_by_customer = calculate_total_booked_sales_by_customer(augmented_line_items, include='Products', by_month=True)
monthly_booked_service_sales_by_customer = calculate_total_booked_sales_by_customer(augmented_line_items, include='Services', by_month=True)

In [None]:
def filter_and_sort_by_subsidiary(df: pd.DataFrame, subsidiary: str, subsidiary_customers: Dict[str, list]) -> pd.DataFrame:
    """
    Keep only the rows belonging to one subsidiary's customers, ordered for display.

    Args:
        df (pd.DataFrame): Sales data with "customer_id", "company_name" and "created_date" columns.
        subsidiary (str): Key into ``subsidiary_customers`` selecting the customer list to keep.
        subsidiary_customers (Dict[str, list]): Maps each subsidiary to a list of customer
            entries whose first element is the customer id.

    Returns:
        pd.DataFrame: The matching rows, sorted by company name and then by created date.
    """
    # The first element of every customer entry is its id
    wanted_ids = [entry[0] for entry in subsidiary_customers[subsidiary]]

    # Row mask: does this row belong to one of the wanted customers?
    belongs_to_subsidiary = df["customer_id"].isin(wanted_ids)

    return df.loc[belongs_to_subsidiary].sort_values(by=["company_name", "created_date"])

In [None]:
def plot_booked_sales_and_gross_profit(df: pd.DataFrame, subsidiary: str, subsidiary_customers: Dict[str, List[Tuple[int, str]]], subset: Literal["Product", "Service", "Product & Service", "Products & Services"] = "Products & Services", figsize: Tuple[int, int] = (12, 6)) -> None:
    """
    Plot bar charts of booked sales and gross profit for each customer of a given subsidiary.

    Args:
        df (pd.DataFrame): The input DataFrame containing customer sales data. Must provide the
            "customer_id", "created_date", "total_booked_sales" and "total_gross_profit" columns.
        subsidiary (str): The name of the subsidiary to plot the data for.
        subsidiary_customers (Dict[str, List[Tuple[int, str]]]): A dictionary mapping subsidiaries to lists of customer tuples (customer_id, company_name).
        subset (Literal[...], optional): Label inserted into each chart title; it does not filter
            the data. Defaults to "Products & Services".
        figsize (Tuple[int, int], optional): The size of the figure for each chart. Defaults to (12, 6).

    Returns:
        None
    """
    # One chart per customer listed for the subsidiary
    for customer_id, company_name in subsidiary_customers[subsidiary]:
        customer_data = df[df["customer_id"] == customer_id].set_index("created_date")

        # Customers without any rows get a message instead of an empty chart
        if customer_data.empty:
            print(f"No data available for {subsidiary}: {company_name}")
            continue

        customer_data[["total_booked_sales", "total_gross_profit"]].plot(
            kind="bar",
            title=f"{subsidiary}: {company_name} - Booked {subset} Sales & Gross Profit",
            ylabel="Amount ($)",
            xlabel="Month",
            figsize=figsize
        )
        plt.show()

In [None]:
def plot_avg_gross_margin_pct(df: pd.DataFrame, subsidiary: str, subsidiary_customers: Dict[str, List[Tuple[int, str]]], subset: Literal["Product", "Service", "Product & Service", "Products & Services"] = "Products & Services", figsize: Tuple[int, int] = (12, 6)) -> None:
    """
    Plot a line chart of average gross margin percentage for each customer of a given subsidiary.

    Args:
        df (pd.DataFrame): The input DataFrame containing customer sales data. Must provide the
            "customer_id", "created_date" and "avg_gross_margin_pct" columns.
        subsidiary (str): The name of the subsidiary to plot the data for.
        subsidiary_customers (Dict[str, List[Tuple[int, str]]]): A dictionary mapping subsidiaries to lists of customer tuples (customer_id, company_name).
        subset (Literal[...], optional): Label inserted into each chart title; it does not filter
            the data. Defaults to "Products & Services".
        figsize (Tuple[int, int], optional): The size of the figure for each chart. Defaults to (12, 6).

    Returns:
        None
    """
    # One chart per customer listed for the subsidiary
    for customer_id, company_name in subsidiary_customers[subsidiary]:
        customer_data = df[df["customer_id"] == customer_id].set_index("created_date")

        # Customers without any rows get a message instead of an empty chart
        if customer_data.empty:
            print(f"No data available for {subsidiary}: {company_name}")
            continue

        customer_data["avg_gross_margin_pct"].plot(
            kind="line",
            title=f"{subsidiary}: {company_name} - Average {subset} Gross Margin (%)",
            ylabel="Percent (%)",
            xlabel="Month",
            figsize=figsize
        )
        plt.show()

In [None]:
def analysis_pipeline(df: pd.DataFrame, subsidiary: str, subsidiary_customers: Dict[str, List[Tuple[int, str]]], subset: Literal["Product", "Service", "Product & Service", "Products & Services"] = "Products & Services", figsize: Tuple[int, int] = (12, 6)) -> None:
    """
    Execute a pipeline of analysis functions to filter, sort, and visualize sales data for a specific subsidiary.

    Args:
        df (pd.DataFrame): The input DataFrame containing customer sales data.
        subsidiary (str): The name of the subsidiary to perform the analysis on.
        subsidiary_customers (Dict[str, List[Tuple[int, str]]]): A dictionary mapping subsidiaries to lists of customer tuples (customer_id, company_name).
        subset (Literal[...], optional): Label forwarded to the plotting functions for use in chart
            titles. Defaults to "Products & Services".
        figsize (Tuple[int, int], optional): The size of the figure for each chart. Defaults to (12, 6).

    Returns:
        None
    """
    # Step 1: keep only this subsidiary's customers, ordered by company and date
    filtered_sorted_df = filter_and_sort_by_subsidiary(df, subsidiary, subsidiary_customers)

    # Step 2: booked sales and gross profit, one bar chart per customer
    plot_booked_sales_and_gross_profit(filtered_sorted_df, subsidiary, subsidiary_customers, subset, figsize)

    # Step 3: average gross margin percentage, one line chart per customer
    plot_avg_gross_margin_pct(filtered_sorted_df, subsidiary, subsidiary_customers, subset, figsize)

In [None]:
# Run the per-customer charts on monthly product sales for every subsidiary.
subsidiaries = list(subsidiary_customers)
for subsidiary in subsidiaries:
    analysis_pipeline(
        monthly_booked_product_sales_by_customer,
        subsidiary,
        subsidiary_customers,
        subset="Product",
    )

In [None]:
# Same per-customer charts as above, this time on monthly service sales.
for subsidiary in subsidiaries:
    analysis_pipeline(monthly_booked_service_sales_by_customer, subsidiary, subsidiary_customers,"Service")

In [None]:
# analyze monthly booked product or service sales, profit, and margin % by subsidiary for all customers (no filtering)
def analyze_monthly_booked_sales_profit_margin_by_subsidiary(df: pd.DataFrame, subsidiary: str, subset: Literal["Product", "Service", "Product & Service", "Products & Services"] = "Products & Services") -> None:
    """
    Plot the monthly booked sales, gross profit, and gross margin percentage for a given subsidiary.

    The input holds one row per customer and month, so the rows of the subsidiary are first
    summed per ``created_date``; plotting the raw rows would draw one bar per customer-month
    instead of one per month. The margin is recomputed as total gross profit divided by total
    booked sales of the month.

    Args:
        df (pd.DataFrame): The input DataFrame containing customer sales data. Must provide the
            "subsidiary_name", "created_date", "total_booked_sales" and "total_gross_profit" columns.
        subsidiary (str): The name of the subsidiary to analyze.
        subset (Literal[...], optional): Label inserted into the sales chart title; it does not
            filter the data. Defaults to "Products & Services".

    Returns:
        None
    """
    # Filter the DataFrame for the given subsidiary
    df_filtered = df[df["subsidiary_name"] == subsidiary]

    # Nothing to plot: report it rather than failing inside the plotting call
    if df_filtered.empty:
        print(f"No data available for {subsidiary}")
        return

    # Collapse the per-customer rows into one row per month
    monthly = (
        df_filtered
        .groupby("created_date")[["total_booked_sales", "total_gross_profit"]]
        .sum()
        .sort_index()
    )

    # Margin of the month as a ratio of sums; months with zero sales are left blank (NaN)
    nonzero_sales = monthly["total_booked_sales"].where(monthly["total_booked_sales"] != 0)
    monthly["avg_gross_margin_pct"] = monthly["total_gross_profit"] / nonzero_sales

    # Plot the monthly booked sales, profit, and margin percentage
    fig, ax = plt.subplots(3, 1, figsize=(12, 12))

    monthly["total_booked_sales"].plot(kind="bar", ax=ax[0], title=f"{subsidiary}: Monthly Booked {subset} Sales", ylabel="Amount ($)")
    monthly["total_gross_profit"].plot(kind="bar", ax=ax[1], title=f"{subsidiary}: Monthly Gross Profit", ylabel="Amount ($)")
    monthly["avg_gross_margin_pct"].plot(kind="line", ax=ax[2], title=f"{subsidiary}: Monthly Average Gross Margin (%)", ylabel="Percent (%)")

    plt.tight_layout()
    plt.show()


In [None]:
# Subsidiary names to analyse. Missing names are dropped first: a NaN entry
# never equals any row's subsidiary_name, so it would produce an empty selection.
# NOTE(review): the names come from the service-sales frame while the loop
# below analyses product sales - confirm both frames cover the same subsidiaries.
subsidiaries = monthly_booked_service_sales_by_customer["subsidiary_name"].dropna().unique()

In [None]:
monthly_booked_product_sales_by_customer

In [None]:
# Subsidiary-level charts (sales, gross profit, margin %) on monthly product sales.
for subsidiary in subsidiaries:
    analyze_monthly_booked_sales_profit_margin_by_subsidiary(monthly_booked_product_sales_by_customer, subsidiary, "Product")

In [None]:
# calculate total booked sales and gross profit for each sku and count number of orders of each sku
def calculate_total_booked_sales_by_sku(df: pd.DataFrame, by_month: bool = False) -> pd.DataFrame:
    """
    Calculate the total booked sales and gross profit for each SKU and count the number of orders for each SKU.

    Args:
        df (pd.DataFrame): The input DataFrame containing line item data. Must provide the "sku",
            "item_name", "display_name", "total_amount", "gross_profit" and "quantity" columns,
            plus "created_date" when ``by_month`` is True.
        by_month (bool, optional): Whether to calculate totals by month. When True the result has
            one row per SKU and calendar month, with the month in a "created_date" column
            (monthly period). Defaults to False.

    Returns:
        pd.DataFrame: total_booked_sales, total_gross_profit, order_count, qty_sold,
        avg_gross_profit (profit per order line) and avg_gross_margin_pct (profit / sales) for
        each SKU. Sorted by total booked sales, descending; with ``by_month`` the rows are
        ordered by month first.
    """
    # NOTE(review): groupby drops rows whose sku, item_name or display_name is missing.
    group_keys = ["sku", "item_name", "display_name"]
    if by_month:
        # Group on the calendar month of created_date in addition to the SKU descriptors
        month = pd.to_datetime(df["created_date"], errors="coerce").dt.to_period("M").rename("created_date")
        group_keys = [month] + group_keys

    # Aggregate booked sales, gross profit, order count and quantity per group
    grouped_df = df.groupby(group_keys).agg(
        total_booked_sales=('total_amount', 'sum'),
        total_gross_profit=('gross_profit', 'sum'),
        order_count=('sku', 'count'),
        qty_sold=('quantity', 'sum')
    ).reset_index()

    # Quantities are stored as negative numbers; flip the sign for reporting
    grouped_df["qty_sold"] = -1 * grouped_df["qty_sold"]
    grouped_df["avg_gross_profit"] = grouped_df["total_gross_profit"] / grouped_df["order_count"]
    grouped_df["avg_gross_margin_pct"] = grouped_df["total_gross_profit"] / grouped_df["total_booked_sales"]

    if by_month:
        # Chronological, then largest sales first within each month
        return grouped_df.sort_values(["created_date", "total_booked_sales"], ascending=[True, False])

    # sort by total booked sales in descending order
    return grouped_df.sort_values("total_booked_sales", ascending=False)

In [None]:
# SKU-level totals over all line items (by_month left at its default, False).
total_booked_sales_by_sku = calculate_total_booked_sales_by_sku(augmented_line_items)

In [None]:
total_booked_sales_by_sku

In [None]:
# SKUs treated as immaterial for the margin calculations: average gross profit
# of at most 100 AND at most 50 orders.
has_low_avg_profit = total_booked_sales_by_sku['avg_gross_profit'].le(100)
is_rarely_ordered = total_booked_sales_by_sku['order_count'].le(50)
non_material_skus = total_booked_sales_by_sku.loc[has_low_avg_profit & is_rarely_ordered]

In [None]:
# keep only line items whose sku is NOT listed in non_material_skus
material_line_items = augmented_line_items[~augmented_line_items["sku"].isin(non_material_skus["sku"])]

# remove rows with zero quantity
material_line_items = material_line_items[material_line_items["quantity"] != 0]

# remove rows with both zero total_amount and zero total_cost
material_line_items = material_line_items[~((material_line_items["total_amount"] == 0) & (material_line_items["total_cost"] == 0))]

# remove rows with gross profit percent below the floor (-1.5, i.e. -150%).
# Rows whose gross_profit_percent is NaN or -inf fail the comparison and are removed too.
# NOTE(review): the previous comment said "-100%" while the code used -1.5;
# the code's value is kept here - confirm the intended threshold.
min_gross_profit_percent = -1.5
material_line_items = material_line_items[material_line_items["gross_profit_percent"] >= min_gross_profit_percent]

In [None]:
material_line_items

In [None]:
# find top 200 skus by total booked sales for each subsidiary
# Sales are first summed per (subsidiary, sku) so that SKUs are ranked on their
# total; taking nlargest(200) on the raw rows would instead return the 200
# largest individual line items, not the top 200 SKUs.
sku_totals_by_subsidiary = (
    material_line_items
    .groupby(["subsidiary_name", "sku"], as_index=False)["total_amount"]
    .sum()
)
top_sku_keys = (
    sku_totals_by_subsidiary
    .sort_values("total_amount", ascending=False)
    .groupby("subsidiary_name")
    .head(200)[["subsidiary_name", "sku"]]
)

# All line items that belong to one of the top SKUs of their subsidiary
top_skus_by_subsidiary = material_line_items.merge(
    top_sku_keys, on=["subsidiary_name", "sku"], how="inner"
).reset_index(drop=True)

In [None]:
top_skus_by_subsidiary

In [None]:
# calculate monthly sales, profit and margin for each top 200 sku by subsidiary


In [None]:
# calculate monthly total amount and total gross profit for each top 200 sku by subsidiary
# "custom_manufacturer" is not a grouping key: that column is removed from the
# line items in the "drop unnecessary columns" step, so grouping on it raises KeyError.
# NOTE(review): a SKU that carries more than one manufacturer / item_name /
# display_name value produces several rows for the same month - confirm the
# descriptors are unique per SKU.
sale_month = top_skus_by_subsidiary["created_date"].dt.to_period("M")
monthly_top_sku_sales_by_subsidiary = (
    top_skus_by_subsidiary
    .groupby([sale_month, "subsidiary_name", "sku", "manufacturer", "item_name", "display_name"])
    .agg(
        total_amount=('total_amount', 'sum'),
        total_gross_profit=('gross_profit', 'sum')
    )
    .reset_index()
)

# Monthly margin as a ratio of sums
monthly_top_sku_sales_by_subsidiary["avg_gross_margin_pct"] = monthly_top_sku_sales_by_subsidiary["total_gross_profit"] / monthly_top_sku_sales_by_subsidiary["total_amount"]

In [None]:
monthly_top_sku_sales_by_subsidiary

In [None]:
def calculate_ttm_avg(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate trailing twelve months (TTM) average gross margin percentage for each SKU.

    The window is calendar based: for every row it averages the ``avg_gross_margin_pct`` values
    of the same (subsidiary_name, sku) dated within the twelve months ending at that row. A
    fixed 12-row window would reach back further than a year whenever a SKU has months
    without sales.

    Args:
        df (pd.DataFrame): The DataFrame containing monthly gross margin data. Must provide
            "subsidiary_name", "sku", "created_date" (monthly period or datetime) and
            "avg_gross_margin_pct". The index is expected to be unique.

    Returns:
        pd.DataFrame: A sorted copy of the input with an additional 'ttm_avg_gross_margin_percent'
        column. Rows without a usable date or group key keep NaN in that column.
    """
    # Sort the DataFrame (returns a new frame; the caller's frame is not modified)
    df = df.sort_values(by=["subsidiary_name", "sku", "created_date"])

    # Time-based rolling windows need real timestamps
    stamps = df["created_date"]
    if isinstance(stamps.dtype, pd.PeriodDtype):
        stamps = stamps.dt.to_timestamp()
    else:
        stamps = pd.to_datetime(stamps)

    ttm = pd.Series(np.nan, index=df.index, dtype=float)
    for _, rows in df.groupby(["subsidiary_name", "sku"]):
        when = stamps.loc[rows.index]
        dated = when.notna().to_numpy()
        if not dated.any():
            continue

        margins = pd.Series(
            rows["avg_gross_margin_pct"].to_numpy(dtype=float)[dated],
            index=pd.DatetimeIndex(when[dated]),
        )
        # For month-start timestamps a "365D" window is open on the left at the same month one
        # year earlier (365 or 366 days back), so it spans exactly the latest twelve months.
        ttm.loc[rows.index[dated]] = margins.rolling("365D", min_periods=1).mean().to_numpy()

    df["ttm_avg_gross_margin_percent"] = ttm

    return df

In [None]:
# Add the ttm_avg_gross_margin_percent column (trailing average of avg_gross_margin_pct per subsidiary and sku).
monthly_top_sku_sales_by_subsidiary = calculate_ttm_avg(monthly_top_sku_sales_by_subsidiary)

In [None]:
monthly_top_sku_sales_by_subsidiary

In [None]:
def fill_missing_months(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill in missing months for each SKU and carry forward the gross margin figures.

    Every (subsidiary_name, sku) combination is expanded to one row per month between the
    earliest and latest ``created_date`` found in ``df``. On the inserted rows:

    * ``avg_gross_margin_percent`` (a new column) holds ``avg_gross_margin_pct`` carried forward,
    * ``ttm_avg_gross_margin_percent``, when present, is carried forward as well so that later
      trend fitting does not run into gaps,
    * ``total_amount`` and ``total_gross_profit`` are 0,
    * all other columns are left missing.

    Months before a SKU's first row have nothing to carry forward and stay NaN.

    Args:
        df (pd.DataFrame): The input DataFrame containing monthly gross margin data. Must provide
            "created_date" (monthly period or month-start datetime), "subsidiary_name", "sku",
            "avg_gross_margin_pct", "total_amount" and "total_gross_profit". The caller's frame
            is not modified.

    Returns:
        pd.DataFrame: A DataFrame with missing months filled in, "created_date" as datetime.
    """
    # Work on a copy so the created_date conversion does not leak into the caller's frame
    df = df.copy()

    # Ensure created_date is a datetime type
    if df["created_date"].dtype == "period[M]":  # Check if created_date is a period type
        df["created_date"] = df["created_date"].dt.to_timestamp()  # Convert Period to Timestamp

    # Nothing to expand; concatenating an empty list of groups would raise
    if df.empty:
        return df

    # Complete month-start range covering the whole span of the data
    full_date_range = pd.date_range(start=df["created_date"].min(), end=df["created_date"].max(), freq='MS')

    filled_groups = []

    # NOTE(review): reindexing fails if a (subsidiary, sku) pair has more than one row for the
    # same month - the input is expected to be unique per month.
    for (subsidiary, sku), group in df.groupby(["subsidiary_name", "sku"]):
        # One row per month of the full range; months without data become all-NaN rows
        group = group.set_index("created_date").reindex(full_date_range)

        # Restore the group keys on the inserted rows
        group["subsidiary_name"] = subsidiary
        group["sku"] = sku

        # Carry the last known margin forward into months without sales
        group["avg_gross_margin_percent"] = group["avg_gross_margin_pct"].ffill()
        if "ttm_avg_gross_margin_percent" in group.columns:
            group["ttm_avg_gross_margin_percent"] = group["ttm_avg_gross_margin_percent"].ffill()

        # Months without sales contribute nothing to the totals
        group["total_amount"] = group["total_amount"].fillna(0)
        group["total_gross_profit"] = group["total_gross_profit"].fillna(0)

        # Turn the month index back into the created_date column
        filled_groups.append(group.rename_axis("created_date").reset_index())

    # A frame whose keys are all missing yields no groups at all
    if not filled_groups:
        return df.iloc[0:0]

    return pd.concat(filled_groups, ignore_index=True)

In [None]:
# Expand every (subsidiary, sku) series to one row per month across the full date range.
monthly_top_sku_sales_by_subsidiary = fill_missing_months(monthly_top_sku_sales_by_subsidiary)

In [None]:
monthly_top_sku_sales_by_subsidiary

In [None]:
def identify_trend(df: pd.DataFrame) -> pd.DataFrame:
    """
    Identify SKUs with upward or downward trends in average gross margin percent over TTM.

    For every (subsidiary_name, sku) the last 12 rows are taken (in created_date order when
    that column is present) and a straight line is fitted through their
    ``ttm_avg_gross_margin_percent`` values. Missing or infinite values are left out of the fit
    while the remaining points keep their position in the 12-row window, so the slope stays
    expressed per row (month). Groups with fewer than two usable points are skipped.

    Args:
        df (pd.DataFrame): The DataFrame with TTM average gross margin percentage. Must provide
            "subsidiary_name", "sku" and "ttm_avg_gross_margin_percent".

    Returns:
        pd.DataFrame: One row per analysed SKU with the columns subsidiary_name, sku,
        trend_direction ("upward", "downward" or "stable"), margin_change (last usable value
        minus first usable value) and slope. The columns are present even when no SKU qualifies.
    """
    result_columns = ["subsidiary_name", "sku", "trend_direction", "margin_change", "slope"]
    # Slopes this close to zero are floating-point noise of the fit, not a trend
    flat_tolerance = 1e-12
    trend_results = []

    # Iterate over each SKU in each subsidiary
    for (subsidiary, sku), group in df.groupby(["subsidiary_name", "sku"]):
        # "Last 12" must mean the most recent rows, whatever order they arrive in
        if "created_date" in group.columns:
            group = group.sort_values("created_date")
        window = group.iloc[-12:]  # Take the last 12 months (TTM)

        margins = window["ttm_avg_gross_margin_percent"].to_numpy(dtype=float)
        positions = np.arange(len(window), dtype=float)  # Time index as the feature

        # A fit cannot handle NaN/inf; drop those points but keep the others' time positions
        usable = np.isfinite(margins)
        x = positions[usable]
        y = margins[usable]

        if len(y) < 2:  # Skip if there are not enough data points
            continue

        # Least-squares line through the points; the first coefficient is the slope
        slope = float(np.polyfit(x, y, 1)[0])

        if abs(slope) <= flat_tolerance:
            trend_direction = "stable"
        elif slope > 0:
            trend_direction = "upward"
        else:
            trend_direction = "downward"

        # Append the results
        trend_results.append({
            "subsidiary_name": subsidiary,
            "sku": sku,
            "trend_direction": trend_direction,
            "margin_change": float(y[-1] - y[0]),  # Change in margin over the window
            "slope": slope
        })

    # Convert results to a DataFrame
    return pd.DataFrame(trend_results, columns=result_columns)

In [None]:
# Fit a margin trend per (subsidiary, sku) over the most recent rows of the month-filled table.
trend_results = identify_trend(monthly_top_sku_sales_by_subsidiary)

In [None]:
trend_results