In [1]:
# suppress warnings
# NOTE(review): this blanket filter silences every warning category for the
# rest of the session, including pandas deprecation notices raised by later cells.
import warnings
warnings.filterwarnings("ignore")

# autoload modules
# Mode 2 re-imports modules before each cell executes, so edits to local
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Data analysis libraries
import pandas as pd
import margin_analysis as ma

In [3]:
# Load the active-customer table.
# customer_id is parsed as text at read time instead of being cast afterwards:
# casting after type inference loses leading zeros, renders ids as "123.0" when
# the column contains gaps, and turns missing ids into the literal string "nan".
# With dtype=str the ids keep their on-disk form and missing ids stay NaN.
customers = pd.read_csv(
    "data/active_customers.csv",
    dtype={"customer_id": str},
)

In [4]:
# Load the augmented invoice line items and parse created_date as a datetime;
# values that cannot be parsed become NaT (errors="coerce").
augmented_line_items = pd.read_csv("data/augmented_invoice_line_items.csv").assign(
    created_date=lambda frame: pd.to_datetime(frame["created_date"], errors="coerce")
)

# Baseline for the later checks: row count and total invoiced amount.
(len(augmented_line_items), augmented_line_items["total_amount"].sum())

(389512, np.float64(595103483.3361996))

In [5]:
# List the columns available on the loaded line items.
augmented_line_items.columns

Index(['customer_id', 'description', 'display_name', 'est_extended_cost',
       'item_name', 'item_type', 'labor_hours', 'level_1_category',
       'level_2_category', 'level_3_category', 'manufacturer', 'quantity',
       'sku', 'tranid', 'unit_price', 'valve_spec_size',
       'vendor_commission_percent', 'custbody_nx_customer', 'created_date',
       'location', 'commission_only', 'job_type', 'total_amount',
       'company_name', 'subsidiary_name', 'end_market', 'sales_rep',
       'total_cost', 'gross_profit', 'gross_profit_percent'],
      dtype='object')

In [6]:
# Distinct subsidiary names present in the loaded line items.
augmented_line_items["subsidiary_name"].unique()

array(['Valve Sales Inc.', 'Automation Service', 'Allied Valve',
       'Allied Instrumentation'], dtype=object)

In [7]:
# Work on a copy so the filters applied below leave augmented_line_items untouched.
line_itmes_for_analysis = augmented_line_items.copy()

In [8]:
# Sanity check: distinct subsidiary names in the working copy.
line_itmes_for_analysis["subsidiary_name"].unique()

array(['Valve Sales Inc.', 'Automation Service', 'Allied Valve',
       'Allied Instrumentation'], dtype=object)

In [9]:
# Key columns used for grouping; count how many values are missing in each.
grouping_columns = [
    "subsidiary_name",
    "location",
    "sku",
    "item_name",
    "display_name",
    "item_type",
    "manufacturer",
    "job_type",
]
missing_counts = line_itmes_for_analysis[grouping_columns].isnull().sum()
missing_counts

subsidiary_name         0
location                0
sku                     0
item_name               0
display_name       186296
item_type               0
manufacturer            0
job_type           111823
dtype: int64

In [10]:
# Replace missing key values with an explicit "Not Assigned" label
# (presumably so those rows survive the grouped aggregations — confirm in margin_analysis).
# The columns are listed here rather than read from the `grouping_columns` global:
# that name is reassigned by later cells, so re-running this cell after them would
# fill a different set of columns.
key_columns_to_fill = [
    "subsidiary_name",
    "location",
    "sku",
    "item_name",
    "display_name",
    "item_type",
    "manufacturer",
    "job_type",
]
line_itmes_for_analysis[key_columns_to_fill] = (
    line_itmes_for_analysis[key_columns_to_fill].fillna("Not Assigned")
)

In [11]:
# Aggregate booked sales per SKU and display the grand total.
grouping_columns = [
    "sku",
    "item_name",
    "display_name",
    "item_type",
    "manufacturer",
]
total_invoiced_sales_by_sku = ma.calculate_total_booked_sales(
    line_itmes_for_analysis,
    grouping_columns=grouping_columns,
)
total_invoiced_sales_by_sku["total_booked_sales"].sum()

np.float64(595103483.3361995)

In [12]:
# Keep only the rows whose quantity is non-zero, then re-check the total amount.
has_quantity = line_itmes_for_analysis["quantity"].ne(0)
line_itmes_for_analysis = line_itmes_for_analysis.loc[has_quantity]
line_itmes_for_analysis["total_amount"].sum()

np.float64(595103483.3361996)

In [13]:
# Drop rows where total_amount and total_cost are both zero, i.e. keep every
# row where at least one of the two is non-zero; then re-check the total amount.
has_amount_or_cost = (
    (line_itmes_for_analysis["total_amount"] != 0)
    | (line_itmes_for_analysis["total_cost"] != 0)
)
line_itmes_for_analysis = line_itmes_for_analysis[has_amount_or_cost]
line_itmes_for_analysis["total_amount"].sum()

np.float64(595103483.3361995)

In [14]:
# Drop extreme negative-margin rows: keep only gross_profit_percent >= -1.5.
# NOTE(review): an earlier note described this as a "-100%" cut-off, but the
# threshold applied is -1.5 (presumably -150% if the column is stored as a
# fraction) — confirm which cut-off is intended.
# Rows whose gross_profit_percent is NaN fail the comparison and are dropped too.
MIN_GROSS_PROFIT_PERCENT = -1.5
line_itmes_for_analysis = line_itmes_for_analysis[
    line_itmes_for_analysis["gross_profit_percent"] >= MIN_GROSS_PROFIT_PERCENT
]
line_itmes_for_analysis["total_amount"].sum()

np.float64(595009781.4733797)

In [15]:
# Aggregate booked sales of the filtered line items per
# subsidiary / location / SKU / job type and display the grand total.
grouping_columns = [
    "subsidiary_name",
    "location",
    "sku",
    "item_name",
    "display_name",
    "item_type",
    "manufacturer",
    "job_type",
]
top_booked_material_skus = ma.calculate_total_booked_sales(
    line_itmes_for_analysis,
    grouping_columns,
)
top_booked_material_skus["total_booked_sales"].sum()

np.float64(595009781.4733796)

In [18]:
# Optionally restrict the analysis to the highest-selling rows of each subsidiary.
# NOTE: `filter` shadows the Python builtin; the name is kept because the export
# cell reads it when building the output file name.
filter = False
TOP_N_PER_SUBSIDIARY = 5000

if filter:
    # Rank rows inside each subsidiary by booked sales and keep the first N.
    # sort + groupby().head() picks the same rows as groupby().apply(nlargest)
    # (ties aside) without relying on `include_groups=True`, which is deprecated
    # and rejected by newer pandas releases.
    top_skus_by_subsidiary = (
        top_booked_material_skus
        .sort_values(
            ["subsidiary_name", "total_booked_sales"],
            ascending=[True, False],
            kind="stable",
        )
        .groupby("subsidiary_name", sort=False)
        .head(TOP_N_PER_SUBSIDIARY)
        .reset_index(drop=True)
    )
    print(top_skus_by_subsidiary["total_booked_sales"].sum())

    # Share of all booked sales covered by the selected rows.
    print(f"{top_skus_by_subsidiary['total_booked_sales'].sum() / top_booked_material_skus['total_booked_sales'].sum():.2%}")

    # Keep line items whose (subsidiary, sku) pair was selected. Matching on sku
    # alone would also pull in that SKU's rows from subsidiaries where it did
    # not make the cut.
    top_keys = top_skus_by_subsidiary[["subsidiary_name", "sku"]].drop_duplicates()
    top_sku_line_items = line_itmes_for_analysis.merge(
        top_keys,
        on=["subsidiary_name", "sku"],
        how="inner",
    )
else:
    top_sku_line_items = line_itmes_for_analysis

print(top_sku_line_items["total_amount"].sum())

595009781.4733797


In [19]:
# Calculate monthly sales by SKU by subsidiary.
# NOTE(review): the grouping key is the raw created_date column; presumably
# calculate_total_booked_sales buckets it by month — confirm in margin_analysis.
grouping_columns = [
    "created_date",
    "subsidiary_name",
    "location",
    "company_name",
    "end_market",
    "sku",
    "item_name",
    "display_name",
    "item_type",
    "manufacturer",
    "job_type",
]
monthly_booked_sales_by_sku_by_subsidiary = ma.calculate_total_booked_sales(
    top_sku_line_items,
    grouping_columns,
)
monthly_booked_sales_by_sku_by_subsidiary["total_booked_sales"].sum()

np.float64(595009781.4733796)

In [20]:
from datetime import datetime

# Write the aggregated table to data/ as
# <DDMonYYYY>_<Top_5000|ALL>_monthly_invoices_by_sku_by_subsidiary.csv.
date = datetime.today()

# The scope label is computed outside the f-string: reusing double quotes inside
# a replacement field is only valid syntax from Python 3.12 onwards.
scope_label = "Top_5000" if filter else "ALL"
file_namme = f"{date.strftime('%d%b%Y')}_{scope_label}_monthly_invoices_by_sku_by_subsidiary.csv"
monthly_booked_sales_by_sku_by_subsidiary.to_csv(f"data/{file_namme}", index=False)