In [1]:
# Azure Data Lake libraries
import common.utils.azure_data_lake_interface as adl

# Data analysis libraries
import pandas as pd

# Helper function libraries
from importlib.resources import files
from common.utils.configuration_management import load_config

In [2]:
# Attach to the data lake.
# The packaged "datalake_config.json" resource from the common.config package
# supplies the blob URL used to build the service client.
file_path = files("common.config") / "datalake_config.json"
config = load_config(str(file_path))
service_client = adl.get_azure_service_client(config["blob_url"])
# All reads below go through the "consolidated" file system.
file_system_client = adl.get_azure_file_system_client(service_client, "consolidated")

In [3]:
# Get data: load the customer parquet file from the "cleaned/netsuite" folder.
customers = adl.get_parquet_file_from_data_lake(file_system_client, "cleaned/netsuite", "customer_cleaned.parquet")

In [4]:
# Load the CustInvc transaction parquet file from the "raw/netsuite" folder.
transactions = adl.get_parquet_file_from_data_lake(file_system_client, "raw/netsuite", "transaction/CustInvc_raw.parquet")

In [5]:
# Cast customer_id to int (it is later matched against customers.customer_id).
# NOTE(review): astype(int) raises if the column holds missing values — confirm
# the raw file never has a null customer_id.
transactions['customer_id'] = transactions['customer_id'].astype(int)

In [6]:
# Keep an independent copy of the transactions frame as it stands here,
# before the customer filter and merge in the following cells.
save_transactions = transactions.copy()

In [7]:
# Customers that have no subsidiary assigned.
missing = customers.loc[customers["subsidiary_name"].isna()]
missing

Unnamed: 0,ai_sales_rep,category,company_name,credit_limit,customer_number,customer_priority,datecreated,default_billing_address,email,end_market,...,on_credit_hold,overdue_balance,parent_company,phone,primary_sales_rep,primary_service_location,subsidiary_name,total_balance,unbilled_orders,sales_rep


In [8]:
# Keep only transactions whose customer exists in the customer table.
transactions = transactions.loc[transactions["customer_id"].isin(customers["customer_id"])]

In [9]:
# Attach descriptive customer attributes to every transaction (left join on customer_id).
transactions = pd.merge(
    transactions,
    customers.loc[:, ["customer_id", "company_name", "subsidiary_name", "end_market", "sales_rep"]],
    how="left",
    on="customer_id",
)

In [10]:
# Number of transactions left without a subsidiary after the merge.
transactions["subsidiary_name"].isna().sum()

np.int64(0)

In [None]:
# The 20 manufacturers with the most buy/resale line items
# (value_counts is already sorted in descending order).
top_buy_resale_manufacturers = buy_resales["manufacturer"].value_counts().head(20)
top_buy_resale_manufacturers

In [None]:
# Restrict the buy/resale line items to the top manufacturers found above.
top_buy_resale_mfr_line_items = buy_resales.loc[
    buy_resales["manufacturer"].isin(top_buy_resale_manufacturers.index)
]

In [None]:
# Drop rows where either cost figure is exactly zero.
# Missing costs compare as "not equal to 0" and are therefore kept.
top_buy_resale_mfr_line_items = top_buy_resale_mfr_line_items.loc[
    top_buy_resale_mfr_line_items["highest_recent_cost"].ne(0)
    & top_buy_resale_mfr_line_items["highest_quoted_cost"].ne(0)
]

In [None]:
# highest_cost is the larger of the recent and quoted cost for each line item.
# The row-wise max skips missing values, so a row with only one cost present
# uses that cost, and a row with neither stays missing — the same result as the
# previous per-row lambda, but vectorized and valid on an empty frame.
# assign() returns a new frame, so nothing is written into a filtered slice.
# NOTE(review): assumes both cost columns are numeric — confirm dtypes.
top_buy_resale_mfr_line_items = top_buy_resale_mfr_line_items.assign(
    highest_cost=top_buy_resale_mfr_line_items[
        ["highest_recent_cost", "highest_quoted_cost"]
    ].max(axis=1)
)

In [None]:
# Inspect the line items after highest_cost has been added.
top_buy_resale_mfr_line_items

In [None]:
# Monthly period of each line item's creation date, used for grouping later.
top_buy_resale_mfr_line_items["Month"] = (
    pd.to_datetime(top_buy_resale_mfr_line_items["created_date"]).dt.to_period("M")
)

In [None]:
# Keep only line items sold at a positive unit price (it is the margin denominator).
top_buy_resale_mfr_line_items = top_buy_resale_mfr_line_items.loc[
    top_buy_resale_mfr_line_items["unit_price"].gt(0)
]

In [None]:
# Gross unit margin as a percentage of the unit price, rounded to 2 decimals:
#   (unit_price - highest_cost) / unit_price * 100
top_buy_resale_mfr_line_items.loc[:, "gross_unit_margin_pct"] = (
    top_buy_resale_mfr_line_items["unit_price"]
    .sub(top_buy_resale_mfr_line_items["highest_cost"])
    .div(top_buy_resale_mfr_line_items["unit_price"])
    .mul(100)
    .round(2)
)
top_buy_resale_mfr_line_items

In [None]:
# Remove placeholder items and negative-margin rows in a single pass.
# na=False makes a missing item_name count as "does not start with the prefix".
# Without it the mask contains NaN for missing names, and `~` on such an
# object-dtype mask raises a TypeError.
# NOTE(review): rows with a missing item_name are kept by this filter — confirm
# that is the desired treatment.
excluded_item_mask = (
    top_buy_resale_mfr_line_items["item_name"].str.startswith("Inactivated", na=False)
    | top_buy_resale_mfr_line_items["item_name"].str.startswith("EXPEDITE", na=False)
)
top_buy_resale_mfr_line_items = top_buy_resale_mfr_line_items[
    ~excluded_item_mask
    & (top_buy_resale_mfr_line_items["gross_unit_margin_pct"] >= 0)
]
top_buy_resale_mfr_line_items

In [None]:
# Aggregate line items per (Month, manufacturer, level_2_category).
# Descriptive columns take the first value seen in each group; quantity and
# sales are summed; the margin is the unweighted mean of the line-level margins.
mfg_monthly_margin = (
    top_buy_resale_mfr_line_items
    .groupby(["Month", "manufacturer", "level_2_category"])
    .agg(**{
        "item_name": pd.NamedAgg(column="item_name", aggfunc="first"),
        "item_type": pd.NamedAgg(column="item_type", aggfunc="first"),
        "quantity": pd.NamedAgg(column="quantity", aggfunc="sum"),
        "level_3_category": pd.NamedAgg(column="level_3_category", aggfunc="first"),
        "total_sales": pd.NamedAgg(column="total_amount", aggfunc="sum"),
        "avg_margin_pct": pd.NamedAgg(column="gross_unit_margin_pct", aggfunc="mean"),
    })
    .sort_index()
)

# The plotting cell calls to_timestamp() on the Month level, so it must be periods.
assert isinstance(mfg_monthly_margin.index.levels[0], pd.PeriodIndex)

In [None]:
# Inspect the monthly margin table indexed by (Month, manufacturer, level_2_category).
mfg_monthly_margin

In [None]:
import matplotlib.pyplot as plt
import numpy as np  # needed for np.arange below; not imported anywhere else in the notebook
from matplotlib.backends.backend_pdf import PdfPages

# Write one page per manufacturer, each with one margin line per level-2 category.
with PdfPages('analyses/buy_resale_margins_by_mfr_and_level2_category.pdf') as pdf:
    # Loop through each manufacturer
    for manufacturer in mfg_monthly_margin.index.get_level_values('manufacturer').unique():
        # Rows for the current manufacturer; remaining index is (Month, level_2_category)
        mfg_data = mfg_monthly_margin.xs(manufacturer, level='manufacturer')

        # Explicit figure/axes instead of the implicit pyplot "current figure"
        fig, ax = plt.subplots(figsize=(12, 10))
        for category in mfg_data.index.get_level_values('level_2_category').unique():
            cat_data = mfg_data.xs(category, level='level_2_category')
            # The Month level holds periods; convert to timestamps for the x axis
            ax.plot(cat_data.index.to_timestamp(), cat_data['avg_margin_pct'], marker='o', label=category)

        ax.set_title(f'Monthly Margins by Category for {manufacturer}')
        ax.set_xlabel('Month')
        ax.set_ylabel('Average Margin %')
        ax.tick_params(axis='x', labelrotation=45)
        ax.grid(True)

        # Upper y limit: next multiple of 5 strictly above the highest margin
        highest_margin = mfg_data['avg_margin_pct'].max()
        y_top = highest_margin + (5 - highest_margin % 5)
        ax.set_ylim(0, y_top)
        ax.set_yticks(np.arange(0, y_top, 5))

        # Legend sits outside the axes, to the right
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        fig.tight_layout()

        # Save this figure as a PDF page, then release it so figures do not pile up
        pdf.savefig(fig)
        plt.close(fig)

In [None]:
# Total amount of line items flagged as neither commission nor manufacturer-direct.
c_sales = line_items.loc[line_items["commission_or_mfr_direct"].eq(False)]
c_sales["total_amount"].sum()

In [None]:
# Total amount of 'Non-inventory Item' line items created in the most recent month.
# created_date is parsed once and reused. Previously the max was taken on the raw
# column while the mask used the parsed one; that fails if the column is not
# already datetime.
line_item_created_dates = pd.to_datetime(line_items['created_date'])
last_month = line_item_created_dates.max().to_period('M')
last_month_line_items = line_items[
    (line_item_created_dates.dt.to_period('M') == last_month) &
    (line_items['item_type'] == 'Non-inventory Item')
    ]
last_month_line_items.total_amount.sum()

In [None]:
# Number of line items per item type.
line_items["item_type"].value_counts()

In [None]:
# Invoices per customer: one row of `transactions` counts as one invoice.
num_invoices = (
    transactions
    .groupby("customer_id")
    .size()
    .reset_index(name="num_invoices")
)

# Line-item count, net sales and gross profit per customer.
customer_totals = (
    line_items
    .groupby("customer_id")
    .agg(**{
        "num_invoice_line_items": pd.NamedAgg(column="customer_id", aggfunc="size"),
        "total_net_sales": pd.NamedAgg(column="total_amount", aggfunc="sum"),
        "total_gross_profit": pd.NamedAgg(column="gross_profit", aggfunc="sum"),
    })
    .reset_index()
)

# Combine: keep every customer with line items, add the invoice count where known.
customer_totals = customer_totals.merge(num_invoices, how="left", on="customer_id")

In [None]:
# Get info from the customer table.
# customer_id must be part of the subset: it is the join key for the merge below,
# and without it the merge raises KeyError.
customer_info_subset = customers[
    ["customer_id", "company_name", "primary_sales_rep", "primary_service_location", "end_market"]
]

# Merge customer info and invoice totals (all customers with totals are kept).
merged_data = pd.merge(customer_totals, customer_info_subset, on='customer_id', how='left')

# Add subsidiary info via the primary location; the helper is given a 'location' column.
merged_data = merged_data.rename(columns={'primary_service_location': 'location'})
# NOTE(review): `hf` is not imported in any visible cell — confirm the helper
# module import exists, otherwise this line raises NameError on a fresh kernel.
merged_data = hf.set_subsidiary_by_location(merged_data)


In [None]:
# Inspect the per-customer totals joined with customer info.
merged_data

In [None]:
# Save the per-customer totals to Excel, without the row index.
# NOTE(review): the path is relative to the working directory; presumably the
# analyses/ folder must already exist — confirm.
output_file = 'analyses/invoice_totals_by_customer.xlsx'
merged_data.to_excel(output_file, index=False)

In [None]:
# Step 1: Extract the month each transaction was created.
# created_date is parsed explicitly (a no-op if it is already datetime) because
# `.dt` raises on a non-datetime column, and other cells in this notebook also
# parse the date columns before using them.
transactions['transaction_month'] = pd.to_datetime(transactions['created_date']).dt.to_period('M')

# Step 2: Group by 'transaction_month' and calculate total net sales,
# total gross profit, and the number of invoices.
monthly_summary = transactions.groupby('transaction_month').agg(
    total_net_sales=('net_amount', 'sum'),
    total_gross_profit=('estimated_gross_profit', 'sum'),
    num_invoices=('created_date', 'size')  # Count of invoices (rows) in the month
).reset_index()

In [None]:
# Improve column names and use the month as the index.
monthly_summary = (
    monthly_summary
    .rename(columns=dict(
        transaction_month='Month Created',
        num_invoices='Num Invoices',
        total_net_sales='Total Net Sales',
        total_gross_profit='Total Gross Profit',
    ))
    .set_index("Month Created")
)

# Rotate the last column (the invoice count) to the front.
cols = list(monthly_summary.columns)
cols = [*cols[-1:], *cols[:-1]]
monthly_summary = monthly_summary[cols]

In [None]:
# Inspect the monthly invoice summary.
monthly_summary

In [None]:
# Save the monthly summary to Excel; the index (month) is written as a column.
# NOTE(review): the path is relative to the working directory; presumably the
# analyses/ folder must already exist — confirm.
output_file = 'analyses/invoices_by_month.xlsx'
monthly_summary.to_excel(output_file, index=True)

In [None]:
# Invoice count, net sales and gross profit per user who created the invoice.
entered_by_totals = (
    transactions
    .groupby("created_by")
    .agg(**{
        "num_invoices": pd.NamedAgg(column="created_by", aggfunc="size"),
        "total_net_sales": pd.NamedAgg(column="net_amount", aggfunc="sum"),
        "total_gross_profit": pd.NamedAgg(column="estimated_gross_profit", aggfunc="sum"),
    })
    .reset_index()
)

In [None]:
# Improve column names for presentation.
entered_by_totals = entered_by_totals.rename(columns=dict(
    num_invoices='Num Invoices',
    total_net_sales='Total Net Sales',
    total_gross_profit='Total Gross Profit',
))

In [None]:
# Inspect the totals per invoice creator.
entered_by_totals

In [None]:
# Save the per-creator totals to Excel, without the row index.
# NOTE(review): the path is relative to the working directory; presumably the
# analyses/ folder must already exist — confirm.
output_file = 'analyses/invoices_by_who_entered_them.xlsx'
entered_by_totals.to_excel(output_file, index=False)

In [None]:
# Merge data from customers to get the primary sales rep (and primary service
# location) for each transaction.
# The columns are taken straight from `customers` together with the customer_id
# join key. company_name and end_market are already on `transactions` from the
# earlier merge, so they are not merged again (that would create _x/_y duplicates).
sales_rep_columns = ["primary_sales_rep", "primary_service_location"]
transactions = (
    transactions
    # Drop results of a previous run so the cell can be re-executed safely
    .drop(columns=sales_rep_columns, errors="ignore")
    .merge(customers[["customer_id", *sales_rep_columns]], on="customer_id", how="left")
)

# Replace missing primary sales reps with the label "Not Specified"
transactions['primary_sales_rep'] = transactions['primary_sales_rep'].fillna("Not Specified")

In [None]:
# Invoice count, net sales and gross profit per primary sales rep.
sales_rep_totals = (
    transactions
    .groupby("primary_sales_rep")
    .agg(**{
        "num_invoices": pd.NamedAgg(column="primary_sales_rep", aggfunc="size"),
        "total_net_sales": pd.NamedAgg(column="net_amount", aggfunc="sum"),
        "total_gross_profit": pd.NamedAgg(column="estimated_gross_profit", aggfunc="sum"),
    })
    .reset_index()
)

In [None]:
# Improve column names for presentation.
sales_rep_totals = sales_rep_totals.rename(columns=dict(
    num_invoices='Num Invoices',
    total_net_sales='Total Net Sales',
    total_gross_profit='Total Gross Profit',
))

In [None]:
# Inspect the totals per primary sales rep.
sales_rep_totals

In [None]:
# Save the per-sales-rep totals to Excel, without the row index.
# NOTE(review): the path is relative to the working directory; presumably the
# analyses/ folder must already exist — confirm.
output_file = 'analyses/invoices_by_sales_rep.xlsx'
sales_rep_totals.to_excel(output_file, index=False)

In [None]:
# Independent copy of just the date columns, so they can be parsed without
# touching `transactions`.
date_fields = transactions.loc[
    :,
    ['actual_ship_date', 'created_date', 'deliver_by_date', 'promised_date', 'ship_date'],
].copy()

In [None]:
# Row count of the date-only frame (one row per transaction copied above).
len(date_fields)

In [None]:
# Parse every date column; values that cannot be parsed become NaT.
date_fields = date_fields.assign(**{
    column_name: pd.to_datetime(date_fields[column_name], errors="coerce")
    for column_name in date_fields.columns
})

In [None]:
# Frequency of each deliver-by date (value_counts omits missing values by default).
date_fields["deliver_by_date"].value_counts()

In [None]:
# Elapsed time between invoice creation and the actual ship date.
date_fields["created_to_shipped"] = date_fields["actual_ship_date"].sub(date_fields["created_date"])

In [None]:
# Transactions created during calendar year 2024.
# A half-open interval [2024-01-01, 2025-01-01) is used because an inclusive
# upper bound of '2024-12-31' means midnight and silently drops anything
# created later on December 31.
created_dates = pd.to_datetime(transactions["created_date"])
trans_2024 = transactions[(created_dates >= '2024-01-01') & (created_dates < '2025-01-01')]

In [None]:
# Total net amount across the 2024 transactions.
trans_2024["net_amount"].sum()

In [None]:
# Number of 2024 transactions (rows).
len(trans_2024)