In [1]:
# Azure Data Lake libraries
import azure_data_lake_interface as adl

# Data analysis libraries
import pandas as pd

# Data visualization libraries
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Helper function libraries
from helper_functions import load_config

In [2]:
# Connect to the data lake: read the connection settings, build a service
# client from the configured blob URL, then get a file-system client for
# "consolidated".
config = load_config(
    "config/datalake_config.json",
    flush_cache=True,
)
service_client = adl.get_azure_service_client(
    config["blob_url"],
)
file_system_client = adl.get_azure_file_system_client(
    service_client,
    "consolidated",
)

In [3]:
# Load the raw invoice and invoice line-item parquet files from the
# "raw/netsuite" folder of the data lake.
netsuite_raw_dir = "raw/netsuite"
invoices = adl.get_parquet_file_from_data_lake(
    file_system_client,
    netsuite_raw_dir,
    "transaction/CustInvc_raw.parquet",
)
invoice_line_items = adl.get_parquet_file_from_data_lake(
    file_system_client,
    netsuite_raw_dir,
    "transaction/CustInvcItemLineItems_raw.parquet",
)

In [4]:
# Row counts as loaded: invoices first, then invoice line items.
print(invoices.shape[0], invoice_line_items.shape[0])

158412 561391


In [5]:
# Parse invoice created_date values; anything unparseable becomes NaT.
invoices["created_date"] = pd.to_datetime(invoices["created_date"], errors="coerce")

# Force net_amount to a numeric dtype; anything unparseable becomes NaN.
invoices["net_amount"] = pd.to_numeric(invoices["net_amount"], errors="coerce")

# Apply the same numeric coercion to the line-item measures.
for numeric_column in ("est_gross_profit", "quantity", "unit_price"):
    invoice_line_items[numeric_column] = pd.to_numeric(
        invoice_line_items[numeric_column], errors="coerce"
    )

In [6]:
# count invoices whose created_date is missing or could not be parsed (NaT)
missing_created_date = invoices["created_date"].isna()
int(missing_created_date.sum())

0

In [7]:
# Attach each invoice's created_date to its line items, matching on tranid.
# Line items with no matching invoice end up with a NaT created_date (left join).
# NOTE(review): assumes tranid is unique in invoices; duplicates would multiply
# line-item rows -- confirm.
invoice_dates = invoices[["tranid", "created_date"]]
invoice_line_items = pd.merge(invoice_line_items, invoice_dates, on="tranid", how="left")

In [8]:
# Keep only records created on or after Jan 1, 2022.
cutoff_date = "2022-01-01"

# Capture line items with NaT created_date BEFORE the cutoff filter runs.
# A `>=` comparison against NaT is always False, so filtering first would
# silently discard these rows and leave this frame empty.
# Assumption: a NaT here means the left merge found no invoice for the line item.
missing_date_mask = invoice_line_items["created_date"].isna()
line_items_with_missing_dates = invoice_line_items[missing_date_mask]

# Remove all rows with created_date before the cutoff. On the line items this
# also drops the NaT rows captured above, so no separate drop is required.
invoices = invoices[invoices["created_date"] >= cutoff_date]
invoice_line_items = invoice_line_items[invoice_line_items["created_date"] >= cutoff_date]

In [9]:
# Row counts after filtering: invoices first, then invoice line items.
print(invoices.shape[0], invoice_line_items.shape[0])

154960 544610


In [10]:
# Sanity check: line items with a missing created_date should have no matching
# invoice, so this is expected to display an empty frame.
invoice_tranids = line_items_with_missing_dates["tranid"].unique()
has_orphan_tranid = invoices["tranid"].isin(invoice_tranids)
invoices.loc[has_orphan_tranid]

Unnamed: 0,links,actual_ship_date,ai_order_type,amount_paid,amount_unpaid,billing_address,close_date,commission_only,company_email,company_name,...,ship_date,shipping_address,start_date,status,total_cost_estimate,tran_date,tranid,transaction_number,type,voided


In [19]:
# Total net_amount across the invoices currently in the frame.
net_amounts = invoices["net_amount"]
net_amounts.sum()

np.float64(613003785.7199999)

In [12]:
# Spot-check: show the line items that belong to a single invoice tranid.
cur_tranid = "328700"
is_current_invoice = invoice_line_items["tranid"] == cur_tranid
invoice_line_items.loc[is_current_invoice]

Unnamed: 0,links,assembly_component,cost_estimate_type,created_from,custom_manufacturer,customer_id,description,display_name,est_extended_cost,est_gross_profit,...,quantity,sku,special_order,tax_line,tranid,transaction_table_id,unit_price,valve_spec_size,vendor_commission_percent,created_date
15343,[],F,ITEMDEFINED,,,201489,,,0,13494.81,...,-1.0,976230,F,F,328700,3715281,0.0,,0,2022-01-03


In [13]:
# Distribution of line items by item_type.
item_type_counts = invoice_line_items["item_type"].value_counts()
item_type_counts

item_type
Inventory Item                235577
Description                   104809
Service                        88344
Non-inventory Item             50727
Other Charge                   49621
Item Group                     12047
Assembly/Bill of Materials      3343
Payment                          119
Kit/Package                       11
Markup                             7
Discount                           5
Name: count, dtype: int64

In [14]:
# Exclude item types that do not represent products or services.
drop_list = ["Description", "Markup", "Item Group", "Other Charge", "Discount", "Payment"]
is_dropped_type = invoice_line_items["item_type"].isin(drop_list)
invoice_line_items = invoice_line_items.loc[~is_dropped_type]

In [15]:
# Re-check the item_type distribution on the current line-item frame.
remaining_item_type_counts = invoice_line_items["item_type"].value_counts()
remaining_item_type_counts

item_type
Inventory Item                235577
Service                        88344
Non-inventory Item             50727
Assembly/Bill of Materials      3343
Kit/Package                       11
Name: count, dtype: int64

In [18]:
# Line total = quantity * unit_price, with the sign flipped.
# NOTE(review): the -1 presumably offsets quantities stored as negatives on
# invoice lines (a sample line shows quantity -1.0) -- confirm.
# .assign() builds a new frame instead of writing a column into a frame that
# came from boolean filtering, which avoids pandas' SettingWithCopyWarning.
invoice_line_items = invoice_line_items.assign(
    total_amount=-1 * invoice_line_items["quantity"] * invoice_line_items["unit_price"]
)
invoice_line_items["total_amount"].sum()

np.float64(575514846.0806015)

In [21]:
# Find invoice line items whose created_from is the literal string "null",
# then total their amounts.
# NOTE(review): genuinely missing values (NaN/None/empty string) are not
# matched by this test -- confirm that is intended.
created_from_is_null = invoice_line_items["created_from"].eq("null")
line_items_with_no_sales_order = invoice_line_items.loc[created_from_is_null]
line_items_with_no_sales_order["total_amount"].sum()

np.float64(17667687.982)

In [18]:
# Sum est_gross_profit per sku, with item_name and display_name carried along
# as additional grouping keys.
# NOTE(review): the result is named "total sales" but the summed measure is
# est_gross_profit -- confirm this is the intended metric.
sku_keys = ["sku", "item_name", "display_name"]
profit_by_sku = invoice_line_items.groupby(sku_keys)["est_gross_profit"].sum()
total_sales_by_sku = profit_by_sku.reset_index()

In [19]:
# Display the per-sku totals (the rendered output is truncated for long frames).
total_sales_by_sku

Unnamed: 0,sku,item_name,display_name,est_gross_profit
0,100,912BDCM01 .5 x .75 KUNKLE SV,,29127.04
1,1000002,1000002,"71046508-010 8"" UniCup Pig in 65 Durometer Max...",2075.37
2,1000004,1000004,"71046506-010 6"" UniCup Pig in 65 Durometer Max...",7772.27
3,10002,4946001,MODULAR PISTON BOTTOM,17.01
4,10003,14393014299,,0.00
...,...,...,...,...
64972,999191,Rosemount® 3051S1TG3A2E11A1AB4,,2195.56
64973,999393,SS-8CP4-KZ-10,Swagelok,3304.96
64974,9995,123839,Teflon Seat,46.00
64975,9996,14355001365,,290.00


In [20]:
# Export the per-sku totals to an Excel workbook, leaving out the index column.
excel_path = "data/total_sales_by_sku.xlsx"
total_sales_by_sku.to_excel(excel_path, index=False)

In [None]:
# Sum est_gross_profit per calendar month of created_date, per sku and item_name.
invoice_month = invoice_line_items["created_date"].dt.to_period("M")
monthly_groups = invoice_line_items.groupby([invoice_month, "sku", "item_name"])
invoiced_sales_by_sku_by_month = monthly_groups["est_gross_profit"].sum().reset_index()

In [None]:
# Display the monthly per-sku totals computed in the previous cell.
invoiced_sales_by_sku_by_month