In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules before each cell
# runs, so edits to imported project code take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
### Import Libraries
# Azure Data Lake libraries
import common.utils.azure_data_lake_interface as adl

# data cleansing libraries
from common.utils.data_cleansing import clean_illegal_chars_in_column

# config libraries
import common.config
from common.utils.configuration_management import load_config

# Data analysis libraries
import numpy as np
import pandas as pd
from scipy import stats

# Plotting libraries
import matplotlib.pyplot as plt

In [7]:
def find_declining_margins(
    df: pd.DataFrame,
    days: int,
    tolerance: float = 0.0,
    date_col: str = "created_date",
    sku_col: str = "sku",
    gpp_col: str = "gross_profit_percent",
    trans_id_col: str = "tranid",
) -> pd.DataFrame:

    """
    Identifies transactions whose gross profit percent dropped versus the same SKU's
    previous transaction, restricted to a recent time window.

    Rows are ordered by SKU and date, each row is paired with the immediately preceding
    row of the same SKU, and a row is kept when it falls inside the window and
    ``previous gpp - current gpp`` is strictly greater than ``tolerance``. The previous
    transaction itself may be older than the window. The input frame is not modified.

    Args:
        df (pd.DataFrame): Transaction rows containing at least the date, SKU, gross
            profit percent and transaction id columns named by the ``*_col`` arguments.
        days (int): Size of the window. Only rows dated on or after
            ``latest date in df[date_col] - days`` are returned (the bound is inclusive).
        tolerance (float, optional): Minimum drop, in the same units as ``gpp_col``,
            that must be exceeded for a row to count as a decline. Default is 0.0.
        date_col (str, optional): Column holding the transaction date; values are parsed
            with ``pd.to_datetime``. Default is "created_date".
        sku_col (str, optional): Column identifying the SKU. Default is "sku".
        gpp_col (str, optional): Column holding the gross profit percent. Default is
            "gross_profit_percent".
        trans_id_col (str, optional): Column holding the transaction identifier.
            Default is "tranid".

    Returns:
        pd.DataFrame: The qualifying rows (sorted by SKU, then date) with all input
        columns plus three added columns: ``prev_gpp``, ``prev_trans_date`` and
        ``prev_trans_id``. A SKU's first transaction never qualifies because it has no
        previous value. ``prev_trans_id`` is filled with "Not Specified" for pandas
        string-dtype ids, otherwise with 0 and cast to ``int``.

    Raises:
        KeyError: If one of the named columns is missing from ``df``.
        ValueError: If ``date_col`` holds values ``pd.to_datetime`` cannot parse, or if
            a non-string-dtype ``trans_id_col`` cannot be cast to ``int``.
    """

    df = df.copy()

    # 1. Parse and sort so that shift(1) within a SKU means "previous transaction in time"
    df[date_col] = pd.to_datetime(df[date_col], errors="raise")
    df = df.sort_values([sku_col, date_col])

    # 2. Compute the prior-value columns (NaN / NaT for the first row of each SKU)
    by_sku = df.groupby(sku_col)
    df["prev_gpp"] = by_sku[gpp_col].shift(1)
    df["prev_trans_date"] = by_sku[date_col].shift(1)
    prev_trans_id = by_sku[trans_id_col].shift(1)
    if isinstance(df[trans_id_col].dtype, pd.StringDtype):
        df["prev_trans_id"] = prev_trans_id.fillna("Not Specified").astype("string")
    else:
        df["prev_trans_id"] = prev_trans_id.fillna(0).astype(int)

    # 3. Compute cutoff relative to the newest transaction in the caller-selected date
    #    column (previously hard-coded to 'created_date', which broke custom date_col)
    cutoff = df[date_col].max() - pd.Timedelta(days=days)

    # 4. Filter: within window AND declined (comparisons against NaN are False)
    in_window = df[date_col] >= cutoff
    declined = (df["prev_gpp"] - df[gpp_col]) > tolerance

    return df[in_window & declined]

In [3]:
# attach to the data lake
# The blob URL is read from datalake_config.json (looked up through the common.config package);
# "consolidated" is the file system (container) the client is opened on.
config = load_config(common.config, "datalake_config.json")
service_client = adl.get_azure_service_client(config["blob_url"])
file_system_client = adl.get_azure_file_system_client(service_client, "consolidated")

# get data
# data_state and trans_type are combined into both the directory and the parquet file name,
# so a different data state / transaction type can be selected by editing these two values.
data_state = "curated"
trans_type = "CustInvc"
filename = f"transaction/{trans_type}ItemLineItems_{data_state}.parquet"
df = adl.get_parquet_file_from_data_lake(file_system_client, f"{data_state}/netsuite", filename)
# Keep an untouched copy so `df` can be reset later without reading from the data lake again.
saved_df = df.copy()

In [None]:
# Reset the working frame from the copy taken right after loading; re-run to discard changes to `df`.
df = saved_df.copy()

In [8]:
# Clean the free-text description column, then write the full frame to Excel.
# NOTE(review): presumably the cleaning removes characters the Excel writer rejects — confirm
# against common.utils.data_cleansing.
df = clean_illegal_chars_in_column(df, "description")
df.to_excel('../excel_outputs/CustInvcItemLineItems_curated.xlsx', index=False)

In [9]:
# Display the column names available in the loaded line-item frame.
df.columns

Index(['created_date', 'created_from', 'entered_by', 'ai_order_type',
       'commission_or_mfr_direct', 'id', 'tranid', 'sku', 'item_type',
       'vsi_item_category', 'manufacturer', 'item_name', 'description',
       'display_name', 'level_1_category', 'level_2_category',
       'level_3_category', 'level_4_category', 'level_5_category',
       'level_6_category', 'subsidiary_name', 'location', 'customer_id',
       'company_name', 'sales_rep', 'end_market', 'quantity', 'unit_cost',
       'cost_estimate_type', 'highest_quoted_cost', 'highest_recent_cost',
       'highest_cost', 'handling_cost', 'labor_hours', 'unit_price',
       'total_cost', 'total_amount', 'gross_profit', 'gross_profit_percent'],
      dtype='object')

In [None]:
# Flag line items dated within `lookback` days of the newest created_date whose gross profit
# percent is lower than the same SKU's previous transaction (default tolerance of 0.0).
lookback = 10
margin_declines = find_declining_margins(df, days=lookback)
margin_declines

In [None]:
# Write the flagged rows to Excel; the lookback length is embedded in the file name.
margin_declines.to_excel(f'../excel_outputs/invoice_line_item_margin_declines_in_past_{lookback}_days.xlsx', index=False)

In [None]:
# Separate frame for the monthly roll-ups: all of `df` plus a calendar-month Period column.
monthly_data = df.assign(Month=df["created_date"].dt.to_period('M'))

In [None]:
# Unweighted mean of the line-level gross profit percent for each (month, subsidiary) pair.
monthly_margin_by_subsidiary = (
    monthly_data
    .groupby(['Month', 'subsidiary_name'])['gross_profit_percent']
    .mean()
    .to_frame('avg_margin_pct')
)

In [None]:
# Line chart of the unweighted monthly average margin %, one line per subsidiary.

# Reset index to convert Month from index to column
chart_data = monthly_margin_by_subsidiary.reset_index()

# Give every month one fixed x position shared by all subsidiaries. Plotting the month
# strings directly makes matplotlib order the categorical axis by first appearance, so a
# subsidiary missing a month would push that month to the end of the axis.
month_labels = chart_data['Month'].drop_duplicates().sort_values().astype(str).tolist()
month_position = {label: position for position, label in enumerate(month_labels)}

# Create figure and plot lines
fig, ax = plt.subplots(figsize=(12, 6))
for subsidiary in chart_data['subsidiary_name'].unique():
    subsidiary_data = chart_data[chart_data['subsidiary_name'] == subsidiary]
    ax.plot(subsidiary_data['Month'].astype(str).map(month_position),
            subsidiary_data['avg_margin_pct'],
            marker='o',
            label=subsidiary)

ax.set_xticks(range(len(month_labels)))
ax.set_xticklabels(month_labels, rotation=45)
ax.set_xlabel('Month')
ax.set_ylabel('Average Margin %')
ax.set_title('Average Margin % by Subsidiary Over Time')
ax.legend()
ax.grid(True)
ax.set_ylim(bottom=0)  # Set y-axis to start at 0 (values below 0 fall outside the view)
fig.tight_layout();

In [None]:
# Revenue-weighted margin % per (month, subsidiary): (revenue - cost) * 100 / revenue,
# computed from the summed totals and rounded to two decimals.
weighted_margin = (
    monthly_data
    .groupby(['Month', 'subsidiary_name'])
    .agg(
        total_costs=('total_cost', 'sum'),
        total_revenues=('total_amount', 'sum'),
    )
    .assign(
        weighted_margin=lambda totals: (
            (totals['total_revenues'] - totals['total_costs']) * 100 / totals['total_revenues']
        ).round(2)
    )
)

In [None]:
# Line chart of the revenue-weighted monthly margin % per subsidiary, each with a
# least-squares straight-line trend.

# Reset index to convert Month from index to column
chart_data = weighted_margin.reset_index()

# restrict data to specific window
start_date = '2024-01'
chart_data = chart_data[chart_data['Month'] >= start_date]

# One fixed, chronological x position per month shared by every subsidiary. Plotting month
# strings directly orders the categorical axis by first appearance, which scrambles the
# months when subsidiaries do not all cover the same ones.
month_labels = chart_data['Month'].drop_duplicates().sort_values().astype(str).tolist()
month_position = {label: position for position, label in enumerate(month_labels)}

fig, ax = plt.subplots(figsize=(12, 6))
for subsidiary in chart_data['subsidiary_name'].unique():
    subsidiary_data = chart_data[chart_data['subsidiary_name'] == subsidiary]

    # Month positions double as the regressor, so a missing month leaves a gap in x
    # instead of being treated as adjacent to its neighbours.
    x = subsidiary_data['Month'].astype(str).map(month_position).to_numpy(dtype=float)
    y = subsidiary_data['weighted_margin'].astype(float).to_numpy()

    # Plot actual data
    ax.plot(x, y, marker='o', label=subsidiary)

    # Calculate trend line only when it is defined: a degree-1 fit needs at least two
    # finite points (zero-revenue months yield inf/NaN margins).
    usable = np.isfinite(y)
    if usable.sum() >= 2:
        z = np.polyfit(x[usable], y[usable], 1)
        p = np.poly1d(z)
        ax.plot(x, p(x), '--', alpha=0.8, label=f'{subsidiary} trend')

ax.set_xticks(range(len(month_labels)))
ax.set_xticklabels(month_labels, rotation=45)
ax.set_xlabel('Month')
ax.set_ylabel('Weighted Margin %')
ax.set_title('Sales-Weighted Margin % by Subsidiary Over Time')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
ax.set_ylim(bottom=0)  # values below 0 fall outside the view
fig.tight_layout();


In [None]:
# Quantity-weighted average unit cost and unit price per (month, subsidiary):
# summed cost (resp. revenue) divided by summed quantity.
monthly_price_cost = (
    monthly_data
    .groupby(['Month', 'subsidiary_name'])
    .agg(
        total_costs=('total_cost', 'sum'),
        total_revenues=('total_amount', 'sum'),
        total_qty=('quantity', 'sum'),
    )
    .assign(
        weighted_avg_unit_cost=lambda totals: totals['total_costs'] / totals['total_qty'],
        weighted_avg_unit_price=lambda totals: totals['total_revenues'] / totals['total_qty'],
    )
)

In [None]:
# One figure per subsidiary: monthly quantity-weighted average unit cost and unit price,
# each with a linear trend line.

# Reset index to convert Month from index to column
chart_data = monthly_price_cost.reset_index()

# restrict data to specific window
start_date = '2024-01'
chart_data = chart_data[chart_data['Month'] >= start_date]

# Count number of subsidiaries
n_subsidiaries = len(chart_data['subsidiary_name'].unique())

# Create separate figure for each subsidiary. Iterate over chart_data (the frame being
# plotted): the previous loop source, `plot_data`, is not defined by any earlier cell.
for subsidiary in chart_data['subsidiary_name'].unique():
    fig, ax = plt.subplots(figsize=(15, 10))
    subsidiary_data = chart_data[chart_data['subsidiary_name'] == subsidiary]
    month_labels = subsidiary_data['Month'].astype(str)

    # Convert Month to numeric values for trend line
    x = range(len(subsidiary_data))

    # Plot actual data points
    ax.plot(month_labels,
            subsidiary_data['weighted_avg_unit_cost'],
            'bo-', label='Weighted Average Cost')
    ax.plot(month_labels,
            subsidiary_data['weighted_avg_unit_price'],
            'ro-', label='Weighted Average Price')

    # Calculate and plot trend lines; linregress raises with fewer than two points,
    # so a subsidiary with a single month in the window gets no trend.
    if len(subsidiary_data) >= 2:
        cost_fit = stats.linregress(x, subsidiary_data['weighted_avg_unit_cost'])
        price_fit = stats.linregress(x, subsidiary_data['weighted_avg_unit_price'])

        trend_cost = [cost_fit.slope * i + cost_fit.intercept for i in x]
        trend_price = [price_fit.slope * i + price_fit.intercept for i in x]

        ax.plot(month_labels, trend_cost, 'b--', alpha=0.5, label='Cost Trend')
        ax.plot(month_labels, trend_price, 'r--', alpha=0.5, label='Price Trend')

    ax.set_title(f'{subsidiary} - Monthly Unit-Weighted Average Price and Cost')
    ax.set_xlabel('Month')
    ax.set_ylabel('Amount ($)')
    ax.tick_params(axis='x', rotation=45)
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout();

In [None]:
# Line-level cost valued at the highest recorded unit cost (quantity * highest_cost).
# `plot_data` is not created by any earlier cell, so this cell raised NameError on a fresh
# kernel; it is now derived from `monthly_data`, which carries the 'Month' column the
# following aggregation groups on.
# NOTE(review): confirm monthly_data is the intended source frame.
plot_data = monthly_data.assign(
    highest_total_cost=monthly_data['quantity'] * monthly_data['highest_cost']
)

In [None]:
# Same monthly roll-up as before, but with cost taken from highest_total_cost;
# note this replaces the earlier `monthly_price_cost`.
monthly_price_cost = (
    plot_data
    .groupby(['Month', 'subsidiary_name'])
    .agg(
        total_costs=('highest_total_cost', 'sum'),
        total_revenues=('total_amount', 'sum'),
        total_qty=('quantity', 'sum'),
    )
    .assign(
        weighted_avg_unit_cost=lambda totals: totals['total_costs'] / totals['total_qty'],
        weighted_avg_unit_price=lambda totals: totals['total_revenues'] / totals['total_qty'],
    )
)

In [None]:
# One figure per subsidiary: monthly quantity-weighted average unit price against the unit
# cost derived from highest_total_cost, each with a linear trend line.

# Reset index to convert Month from index to column
chart_data = monthly_price_cost.reset_index()

# restrict data to specific window
start_date = '2024-01'
chart_data = chart_data[chart_data['Month'] >= start_date]

# Count number of subsidiaries
n_subsidiaries = len(chart_data['subsidiary_name'].unique())

# Create separate figure for each subsidiary. Iterate over the date-filtered chart_data
# rather than plot_data, so a subsidiary with no rows in the window does not produce an
# empty figure and an empty-input error from linregress.
for subsidiary in chart_data['subsidiary_name'].unique():
    fig, ax = plt.subplots(figsize=(15, 10))
    subsidiary_data = chart_data[chart_data['subsidiary_name'] == subsidiary]
    month_labels = subsidiary_data['Month'].astype(str)

    # Convert Month to numeric values for trend line
    x = range(len(subsidiary_data))

    # Plot actual data points
    ax.plot(month_labels,
            subsidiary_data['weighted_avg_unit_cost'],
            'bo-', label='Weighted Average Cost')
    ax.plot(month_labels,
            subsidiary_data['weighted_avg_unit_price'],
            'ro-', label='Weighted Average Price')

    # Calculate and plot trend lines; linregress raises with fewer than two points,
    # so a subsidiary with a single month in the window gets no trend.
    if len(subsidiary_data) >= 2:
        cost_fit = stats.linregress(x, subsidiary_data['weighted_avg_unit_cost'])
        price_fit = stats.linregress(x, subsidiary_data['weighted_avg_unit_price'])

        trend_cost = [cost_fit.slope * i + cost_fit.intercept for i in x]
        trend_price = [price_fit.slope * i + price_fit.intercept for i in x]

        ax.plot(month_labels, trend_cost, 'b--', alpha=0.5, label='Cost Trend')
        ax.plot(month_labels, trend_price, 'r--', alpha=0.5, label='Price Trend')

    ax.set_title(f'{subsidiary} - Monthly Unit-Weighted Average Price and Cost')
    ax.set_xlabel('Month')
    ax.set_ylabel('Amount ($)')
    ax.tick_params(axis='x', rotation=45)
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout();