# Create Initial Dataset from Koenig Data

The goal is to build a first dataset that ties together Accounts, Dealer Stock Units, Customer Equipment, Stores, Users, and Tasks. This dataset will allow us to identify customer buying cycles.

In [4]:
%load_ext dotenv
%dotenv
%load_ext autoreload
%autoreload 2

In [62]:
import json

import polars as pl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

from src.transformation.translate import (
    translate_csv_to_common_model,
    translate_koenig_account_columns,
    translate_keonig_customer_equipment,
    translate_koenig_stock_unit,
    translate_koenig_purchase_orders,
    translate_koenig_service_requests,
    translate_koenig_user,
    translate_koenig_store
)

## Pull in Data and Translate

In [328]:
# Arguments shared by every Koenig extract; defined once so a path or dealer
# change only has to be made in one place.
KOENIG_DATA_DIR = "data/dealers/koenig"
KOENIG_DEALER = "koenig"
SEMANTIC_LAYER_PATH = "./src/transformation/semantic_layer.json"


def load_koenig_table(csv_name: str, entity: str):
    """Load one Koenig CSV extract through the common-model translation.

    Args:
        csv_name: File name inside ``KOENIG_DATA_DIR`` (e.g. ``"account.csv"``).
        entity: Entity key passed to ``translate_csv_to_common_model``
            (e.g. ``"account"``).

    Returns:
        Whatever ``translate_csv_to_common_model`` returns for that extract.
    """
    return translate_csv_to_common_model(
        f"{KOENIG_DATA_DIR}/{csv_name}",
        KOENIG_DEALER,
        SEMANTIC_LAYER_PATH,
        entity,
    )


# Raw extracts translated to the common model.
account_df = load_koenig_table("account.csv", "account")
customer_equipment_df = load_koenig_table("customer-equipment.csv", "customer_equipment")
stock_unit_df = load_koenig_table("dealer-stock-unit.csv", "dealer_stock_unit")
purchase_orders_df = load_koenig_table("purchase-order.csv", "purchase_order")
service_requests_df = load_koenig_table("service-requests.csv", "service_requests")
user_df = load_koenig_table("user.csv", "user")
store_df = load_koenig_table("store.csv", "store")
task_df = load_koenig_table("task.csv", "task")

# Koenig-specific column translations (there is no task-specific translator imported).
account_df = translate_koenig_account_columns(account_df)
customer_equipment_df = translate_keonig_customer_equipment(customer_equipment_df)
stock_unit_df = translate_koenig_stock_unit(stock_unit_df)
purchase_orders_df = translate_koenig_purchase_orders(purchase_orders_df)
service_requests_df = translate_koenig_service_requests(service_requests_df)
user_df = translate_koenig_user(user_df)
store_df = translate_koenig_store(store_df)

In [None]:
task_df["task_subtype"].value_counts()

In [None]:
task_df.filter(pl.col("task_subtype") == "Task")

In [331]:
# Trim each table to the columns used by the joins and analysis below.
account_df = account_df.select(
    [
        "account_id",
        "account_number",
        "account_owner_id",
        "customer_segment",
        "type_of_equipment",
        "customer_loyalty",
        "customer_business_class",
        "engagement_level",
    ]
)
customer_equipment_df = customer_equipment_df.select(
    [
        "customer_equipment_id",
        "account_id",
        "dealer_stock_number",
        "dealer_stock_unit",
        "ce_model_year",
        "ce_make",
        "ce_model",
        "ce_serial_number",
        "ce_status",
        "ce_group",
        "ce_hours",
        "ce_sale_date",
        "ce_hours_date_updated",
        "ce_sale_amount",
        "ce_last_service_date",
    ]
)
stock_unit_df = stock_unit_df.select(
    [
        "dealer_stock_unit_id",
        "dealer_stock_number",
        "dsu_account_id",
        "dsu_description",
        "dsu_status",
        "dsu_group",
        "dsu_model_year",
        "dsu_make",
        "dsu_model",
        "dsu_serial_number",
        "dsu_new_used",
        "dsu_sub_group",
        "dsu_hours_or_units",
        "dsu_store",
        "dsu_user_traded_by",
        "dsu_sale_price",
        "dsu_sales_date",
        "dsu_invoice_number",
        "dsu_sold_by",
    ]
)
user_df = user_df.select(
    [
        "user_id",
        "user_branch_location",
        "user_primary_store_location_id",
        "user_manager",
        "user_region_manager",
        "user_title",
        "user_active",
    ]
)
store_df = store_df.select(
    [
        "store_id",
        "store_branch",
        "store_postal_code",
        "store_state",
        "store_city",
        "store_county",
    ]
)
task_df = task_df.select(
    [
        "task_id",
        "task_account_id",
        "task_owner_id",
        "task_created_date",
        "task_activity_date",
        "task_type",
        "task_subtype",
        "task_status",
        "task_priority",
        "task_subject",
    ]
)

In [None]:
service_requests_df

In [None]:
service_requests_df.filter(
    pl.col("service_dealer_stock_unit_id") == "a065f000001RXNZAA4"
)

In [None]:
service_requests_df.filter(
    pl.col("service_dealer_stock_unit_id").is_in(
        stock_unit_df.filter(pl.col("dsu_account_id") == "0015f000005XwrwAAC")[
            "dealer_stock_unit_id"
        ].to_list()
    )
)

In [None]:
stock_unit_df.filter(pl.col("dsu_account_id") == "0015f000005XwrwAAC")

In [None]:
# Keep only the customer_equipment_df rows whose dealer_stock_number also appears in
# stock_unit_df; rows without a matching stock number are dropped.
# The two prints show the row count before and after the filter.
print(len(customer_equipment_df))
customer_equipment_df = customer_equipment_df.filter(pl.col("dealer_stock_number").is_in(stock_unit_df["dealer_stock_number"].unique().to_list()))
print(len(customer_equipment_df))

In [None]:
# Per-account summary of completed outreach: tasks whose subtype is Email or Call,
# or whose subject contains "Call Report".
# The summary gets its own name instead of overwriting task_df, so task_df keeps
# its task-level columns and this cell can be re-run without failing.
task_summary_df = (
    task_df.filter(pl.col("task_status") == "Completed")
    .filter((pl.col("task_subtype").is_in(["Email", "Call"])) | (pl.col("task_subject").str.contains("Call Report")))
    .group_by("task_account_id")
    .agg(
        pl.count("task_id").alias("task_count"),
        pl.max("task_activity_date").alias("last_task_date"),
        pl.min("task_activity_date").alias("first_task_date"),
    )
)

# Left joins throughout, so every account row is retained even without equipment,
# stock units, an owner record, a store or any tasks.
data_df = account_df.join(customer_equipment_df, on="account_id", how="left")
data_df = data_df.join(stock_unit_df, on="dealer_stock_number", how="left")
data_df = data_df.join(user_df, left_on="account_owner_id", right_on="user_id", how="left")
# NOTE(review): this matches user_branch_location against store_id — confirm the two
# columns hold the same identifier (user_df also has user_primary_store_location_id).
data_df = data_df.join(store_df, left_on="user_branch_location", right_on="store_id", how="left")
data_df = data_df.join(task_summary_df, left_on="account_id", right_on="task_account_id", how="left")
data_df.head()

In [None]:
print(data_df["dsu_group"].unique().to_list())

In [None]:
data_df["dsu_make"].value_counts().sort(by="count", descending=True).head(10)

In [None]:
def get_majority_value(primary_keys: list, value: str, alias: str = "majority_value") -> pl.Expr:
    """Build an expression giving the most frequent non-null ``value`` per group.

    Args:
        primary_keys: Column names that define the groups (used as the window keys).
        value: Name of the column whose most frequent value is wanted.
        alias: Name of the resulting column. Defaults to ``"majority_value"``.

    Returns:
        A polars expression to use inside ``with_columns`` / ``select``; each row
        receives the mode of ``value`` within its group.
    """
    return (
        pl.col(value)
        .drop_nulls()  # nulls are excluded so that "missing" can never be the mode
        .mode()
        # mode() can return several values when there is a tie; only the first is kept
        .first()
        .over(primary_keys)
        .alias(alias)
    )


# Most frequent description for each make, reduced to the distinct (make, value) pairs.
majority_description = get_majority_value(primary_keys=["dsu_make"], value="dsu_description")
(
    data_df.select(["dsu_make", "dsu_description"])
    .with_columns(majority_description)
    .select(["dsu_make", "majority_value"])
    .unique()
)

In [None]:
data_df["dsu_make"].value_counts().sort(by="count", descending=True).head(15)

In [None]:
data_df.group_by("dsu_make").agg(pl.sum("dsu_sale_price")).sort(by="dsu_sale_price", descending=True).head(20)

In [None]:
data_df.filter(pl.col("dsu_make") == "CT")["dsu_description"].value_counts().sort(
    by="count", descending=True
).head(10)

In [None]:
# For a given make, what are the most common descriptions?
# Count each (make, description) pair, most frequent first.
make_desc_df = (
    data_df.select(["dsu_make", "dsu_description"])
    .group_by(["dsu_make", "dsu_description"])
    .agg(pl.count("dsu_description").alias("count"))
    .sort(by="count", descending=True)
)
# The original last line was cut off mid-expression and did not parse. This completion
# is one reading of the stated goal: rank descriptions within each make and show the
# top three per make. NOTE(review): adjust if a different view was intended.
make_desc_df.with_columns(
    pl.col("count").rank(method="ordinal", descending=True).over("dsu_make").alias("rank_in_make")
).filter(pl.col("rank_in_make") <= 3).sort(by=["dsu_make", "rank_in_make"])

In [None]:
data_df.group_by("dsu_group").agg(
    [pl.sum("dsu_sale_price"), pl.count("dealer_stock_unit_id")]
).sort("dealer_stock_unit_id", descending=True).head(10)

In [36]:
# Exclude rows invoiced as NOTAVA / EQPADD and rows in the AMS COMPONENTS group.
# NOTE(review): rows where dsu_invoice_number or dsu_group is null (e.g. accounts with
# no joined stock unit) presumably do not pass these predicates either — confirm that
# dropping them is intended.
invoice_is_kept = ~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD"])
group_is_kept = pl.col("dsu_group") != "AMS COMPONENTS"
data_df = data_df.filter(invoice_is_kept).filter(group_is_kept)

In [None]:
data_df.describe()

In [None]:
data_df["customer_segment"].value_counts().sort(by="count", descending=True)

In [None]:
data_df["customer_segment"].unique().to_list()

In [None]:
data_df["type_of_equipment"].value_counts().sort(by="count", descending=True)

## Create Aggregate Statistics for Accounts
- Calculate total equipment spend
- Most common category
- Frequency in purchases per year
- Monetary spend per year
- Age of equipment (if year > 1900)

In [326]:
# Account-level roll-up: one row per account_id.
invoice_is_kept = ~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD"])
group_is_kept = pl.col("dsu_group") != "AMS COMPONENTS"

# Age at sale = sale year minus model year; model years of 1900 or earlier count as missing.
age_at_sale = (
    pl.when(pl.col("dsu_model_year") > 1900)
    .then(pl.col("dsu_sales_date").dt.year() - pl.col("dsu_model_year"))
    .otherwise(None)
)

# Columns carried through unchanged, taken from the first row of each account group.
carried_columns = [
    pl.first(name).alias(name)
    for name in (
        "customer_segment",
        "type_of_equipment",
        "customer_loyalty",
        "customer_business_class",
        "engagement_level",
        "user_branch_location",
    )
] + [
    pl.first("account_owner_id").alias("owner_id"),
    pl.first("user_active").alias("user_active"),
]

# NOTE(review): total_sale_amount sums every joined row while total_units counts
# distinct dealer_stock_number values; if the joins produce more than one row per
# stock number the two measures will disagree — confirm.
account_level_df = (
    data_df.filter(invoice_is_kept)
    .filter(group_is_kept)
    .group_by("account_id")
    .agg(
        [
            pl.sum("dsu_sale_price").alias("total_sale_amount"),
            pl.n_unique("dealer_stock_number").alias("total_units"),
            pl.col("dsu_group").mode().first().alias("most_common_group"),
            age_at_sale.mean().alias("average_equipment_age"),
            pl.max("dsu_sales_date").alias("last_sale_date"),
            pl.mean("task_count").alias("task_count"),
            pl.max("last_task_date").alias("last_task_date"),
            pl.min("first_task_date").alias("first_task_date"),
        ]
        + carried_columns
    )
    # Accounts without any matching task rows get a task_count of 0 instead of null.
    .with_columns(pl.col("task_count").fill_null(0))
)

In [None]:
account_df.shape

In [None]:
account_level_df.head()

In [None]:
account_level_df.filter(pl.col("first_task_date").is_not_null()).sort("first_task_date", descending=False).head(10)

In [None]:
account_level_df.group_by("owner_id").agg(
    [pl.sum("total_sale_amount"), pl.sum("total_units")]
).sort(by="total_sale_amount", descending=True)

In [None]:
account_level_df.group_by("owner_id").agg(
    [pl.sum("total_sale_amount"), pl.sum("total_units")]
).sort(by="total_sale_amount", descending=True)

In [None]:
account_level_df.group_by("owner_id").agg(
    [pl.sum("total_sale_amount"), pl.sum("total_units")]
).sort(by="total_units", descending=True)

In [None]:
account_level_df.group_by("owner_id").agg(
    [pl.sum("total_sale_amount"), pl.sum("total_units")]
).quantile(.9)

In [None]:
account_level_df.filter(pl.col("user_active") == False)["total_sale_amount"].sum() / account_level_df["total_sale_amount"].sum()

In [None]:
# Share of the total sale amount that belongs to the five owners with the largest totals.
sale_amount_by_owner = account_level_df.group_by("owner_id").agg(
    pl.sum("total_sale_amount").alias("total_sale_amount")
)
top_5_sales_people = sale_amount_by_owner.sort(by="total_sale_amount", descending=True).head(5)
top_5_sales_people["total_sale_amount"].sum() / account_level_df["total_sale_amount"].sum()

In [None]:
n_top_1_perc = int(account_level_df.shape[0] * 0.01)
top_1_perc_accounts = account_level_df.sort(by="total_sale_amount", descending=True).head(n_top_1_perc)
top_1_perc_accounts["total_sale_amount"].sum() / account_level_df["total_sale_amount"].sum()

In [None]:
top_1_perc_accounts["total_units"].sum() / account_level_df["total_units"].sum()

In [None]:
# What share of all units belongs to the top 5 sales people, ranked by units?
# (This measures unit counts, not sale amounts; it reuses the top_5_sales_people name.)
top_5_sales_people = (
    account_level_df.group_by("owner_id")
    .agg(pl.sum("total_units").alias("total_units"))
    .sort(by="total_units", descending=True)
    .head(5)
)
top_5_sales_people["total_units"].sum() / account_level_df["total_units"].sum()

In [None]:
# percentage of accounts with non-active owner_id
len(account_level_df.filter(pl.col("user_active") == False)) / len(account_level_df)

In [None]:
account_level_df.filter(pl.col("account_id") == "0015f000005X2stAAC").head()

In [None]:
account_level_df.filter(pl.col("customer_segment") == "Cash Grain").sort(by="total_sale_amount", descending=True).head()

In [None]:
len(account_level_df.filter(pl.col("total_sale_amount") < 10)) / len(account_level_df)

In [None]:
len(account_level_df.filter(pl.col("customer_segment") == "Cash Grain"))

In [None]:
account_level_df.filter(pl.col("customer_segment") == "Cash Grain").group_by(
    "customer_business_class"
).agg(pl.count("account_id"))

In [None]:
account_level_df.filter(pl.col("customer_segment") == "Cash Grain").group_by(
    "customer_business_class"
).agg(pl.sum("total_sale_amount"))

In [None]:
account_level_df.filter(pl.col("customer_business_class") == "A - Strategic Account").group_by("customer_loyalty").agg(
    pl.count("account_id")
)

In [None]:
# Units vs. sale amount for Cash Grain accounts in business classes A-D, coloured by class.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=account_level_df.filter(pl.col("customer_segment") == "Cash Grain").filter(pl.col("customer_business_class").is_in(["A - Strategic Account", "B - Key Account", "C - Relationship Account", "D - Transaction Account"])),
    x="total_sale_amount",
    y="total_units",
    hue="customer_business_class",
    palette="colorblind",
)
plt.ylabel("Total Units")
plt.xlabel("Total Sale Amount")
# format x-axis ticks in millions of dollars (e.g. $2M)
ax.xaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f"${x / 1000000:,.0f}M"))
plt.title("Cash Grain Customers")

In [None]:
# Units vs. sale amount for Governmental accounts, coloured by business class.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=account_level_df.filter(pl.col("customer_segment") == "Governmental"),
    x="total_sale_amount",
    y="total_units",
    hue="customer_business_class",
    palette="colorblind",
)
plt.ylabel("Total Units")
plt.xlabel("Total Sale Amount")

# format x-axis ticks in millions of dollars (e.g. $2M)
ax.xaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f"${x / 1000000:,.0f}M"))
plt.title("Governmental Customers")

In [None]:
# Units vs. sale amount for the grain/livestock and dairy segments, coloured by business class.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=account_level_df.filter(
        pl.col("customer_segment").is_in(
            ["Grain Beef", "Grain Hogs", "Grain Livestock", "Grain Dairy", "Dairy"]
        )
    ),
    x="total_sale_amount",
    y="total_units",
    hue="customer_business_class",
    palette="colorblind",
)
plt.ylabel("Total Units")
plt.xlabel("Total Sale Amount")

# format x-axis ticks in millions of dollars (e.g. $2M)
ax.xaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f"${x / 1000000:,.0f}M"))
plt.title("Livestock Customers")

In [None]:
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=account_level_df.filter(pl.col("customer_segment") == "Cash Grain").filter(pl.col("customer_business_class").is_null()),
    x="total_sale_amount",
    y="total_units",
)
plt.ylabel("Total Units")
plt.xlabel("Total Sale Amount")
# set x-axis to 1,000s of $
ax.xaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f"${x / 1000:,.0f}K"))
plt.title("Cash Grain Customers")

In [None]:
account_level_df[["customer_business_class", "task_count"]].group_by("customer_business_class").mean()

In [None]:
# Strategic accounts whose task_count is below 1, largest total sale amount first.
(
    account_level_df.filter(pl.col("customer_business_class") == "A - Strategic Account")
    .filter(pl.col("task_count") < 1)
    .sort(by="total_sale_amount", descending=True)
    .head()
)

In [None]:
len(account_level_df.filter(
    pl.col("customer_business_class") == "A - Strategic Account"
).filter((pl.col("task_count") < 2))) / len(account_level_df.filter(pl.col("customer_business_class") == "A - Strategic Account"))

In [None]:
len(
    account_level_df.filter(
        pl.col("customer_business_class") == "B - Key Account"
    ).filter((pl.col("task_count") < 2) | (pl.col("task_count").is_null())) 
) / len(account_level_df.filter(pl.col("customer_business_class") == "B - Key Account"))

In [None]:
sns.boxplot(
    data=account_level_df.filter(pl.col("customer_segment") == "Cash Grain"),
    x="customer_business_class",
    y="task_count",
)
plt.xticks(rotation=90)

In [None]:
sns.kdeplot(
    data=account_level_df.filter(pl.col("customer_segment") == "Cash Grain").filter(pl.col("customer_business_class").is_in(["A - Strategic Account", "B - Key Account"])),
    x="task_count",
    hue="customer_business_class",
    fill=True,
)
plt.xticks(rotation=90)

In [None]:
len(
    account_level_df.filter(pl.col("total_sale_amount") > 100000).filter(
        pl.col("customer_segment").is_null()
    )
) / len(
    account_level_df.filter(pl.col("total_sale_amount") > 100000)
)

In [None]:
len(
    account_level_df.filter(pl.col("total_sale_amount") > 10000).filter(
        pl.col("customer_business_class").is_null()
    )
) / len(account_level_df.filter(pl.col("total_sale_amount") > 10000))

In [None]:
len(
    account_level_df.filter(pl.col("total_sale_amount") > 10000).filter(
        pl.col("customer_segment").is_null()
    )
) / len(account_level_df.filter(pl.col("total_sale_amount") > 10000))

In [None]:
len(
    account_level_df.filter(pl.col("total_sale_amount") > 100000).filter(
        pl.col("customer_segment").is_null()
    )
) / len(account_level_df.filter(pl.col("total_sale_amount") > 100000))

## Dealer Stock Unit

In [None]:
len(stock_unit_df.filter(pl.col("dsu_model_year") == 0)) / len(stock_unit_df)

In [None]:
stock_unit_df["dsu_group"].value_counts().sort(by="count", descending=True)

In [None]:
stock_unit_df.filter(~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"])).filter(pl.col("dsu_group") != "AMS COMPONENTS").filter(pl.col("dsu_sale_price") < 1)

In [None]:
len(stock_unit_df.filter(~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"]))) /  len(stock_unit_df)

In [None]:
len(
    stock_unit_df.filter(
        ~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"])
    ).filter(
        pl.col("dsu_group") == "COMBINES")
) / len(stock_unit_df.filter(
        pl.col("dsu_group") == "COMBINES"))

In [None]:
len(
    stock_unit_df.filter(
        ~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"])
    ).filter(pl.col("dsu_group") == "ROW CROP TRACTOR")
) / len(stock_unit_df.filter(pl.col("dsu_group") == "ROW CROP TRACTOR"))

In [None]:
len(
    stock_unit_df.filter(pl.col("dsu_model_year") == 0).filter(
        pl.col("dsu_group") == "COMBINES"
    )
) / len(stock_unit_df.filter(pl.col("dsu_group") == "COMBINES"))

In [None]:
len(stock_unit_df.filter(pl.col("dsu_model_year") == 0)) / len(stock_unit_df)

In [None]:
len(
    stock_unit_df.filter(pl.col("dsu_model_year") == 0).filter(
        pl.col("dsu_group") == "ROW CROP TRACTOR"
    )
) / len(stock_unit_df.filter(pl.col("dsu_group") == "ROW CROP TRACTOR"))

In [None]:
len(
    stock_unit_df.filter(pl.col("dsu_model_year") == 0)
    .filter(~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"]))
) / len(
    stock_unit_df.filter(
        ~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"])
    ))

In [None]:
len(
    stock_unit_df.filter(pl.col("dsu_model_year") == 0)
    .filter(~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"]))
    .filter(pl.col("dsu_group") == "ROW CROP TRACTOR")
) / len(
    stock_unit_df.filter(~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"]))
    .filter(pl.col("dsu_group") == "ROW CROP TRACTOR")
)

In [None]:
len(
    stock_unit_df.filter(pl.col("dsu_model_year") == 0)
    .filter(~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"]))
    .filter(pl.col("dsu_group") == "COMBINES")
) / len(
    stock_unit_df.filter(
        ~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"])
    ).filter(pl.col("dsu_group") == "COMBINES")
)

In [None]:
len(
    stock_unit_df.filter(pl.col("dsu_hours_or_units") == 0)
    .filter(pl.col("dsu_new_used") == "U")
    .filter(pl.col("dsu_group") == "COMBINES")
) / len(
    stock_unit_df.filter(pl.col("dsu_group") == "COMBINES").filter(
        pl.col("dsu_new_used") == "U"
    )
)

In [None]:
len(
    stock_unit_df.filter(pl.col("dsu_hours_or_units") == 0)
    .filter(pl.col("dsu_new_used") == "U")
    .filter(pl.col("dsu_group") == "ROW CROP TRACTOR")
) / len(
    stock_unit_df.filter(pl.col("dsu_group") == "ROW CROP TRACTOR").filter(
        pl.col("dsu_new_used") == "U"
    )
)

In [None]:
len(
    stock_unit_df.filter(pl.col("dsu_hours_or_units") == 0).filter(
        pl.col("dsu_new_used") == "U"
    )
) / len(stock_unit_df.filter(pl.col("dsu_new_used") == "U"))

In [None]:
len(
    stock_unit_df.filter(pl.col("dsu_hours_or_units") == 0)
    .filter(pl.col("dsu_new_used") == "U")
    .filter(~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"]))
) / len(
    stock_unit_df.filter(pl.col("dsu_new_used") == "U").filter(
        ~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"])
    )
)

In [None]:
len(
    stock_unit_df.filter(pl.col("dsu_hours_or_units") == 0)
    .filter(pl.col("dsu_new_used") == "U")
    .filter(~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"]))
    .filter(pl.col("dsu_group") == "ROW CROP TRACTOR")
) / len(
    stock_unit_df.filter(pl.col("dsu_new_used") == "U")
    .filter(~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"]))
    .filter(pl.col("dsu_group") == "ROW CROP TRACTOR")
)

In [None]:
# Share of COMBINES rows with dsu_new_used == "U" and an invoice number other than
# NOTAVA / EQPADD / N/A that report 0 hours or units.
# The numerator used to filter on "ROW CROP TRACTOR" while the denominator used
# "COMBINES", so the ratio mixed two equipment groups; both sides now use COMBINES
# (the ROW CROP TRACTOR ratio is computed in the previous cell).
used_invoiced_combines = (
    stock_unit_df.filter(pl.col("dsu_new_used") == "U")
    .filter(~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"]))
    .filter(pl.col("dsu_group") == "COMBINES")
)
len(used_invoiced_combines.filter(pl.col("dsu_hours_or_units") == 0)) / len(used_invoiced_combines)

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(x="dsu_sales_date", data=stock_unit_df.to_pandas())
plt.ylabel("Count of Units Sold")
plt.xlabel("Sales Date")

In [None]:
# Kernel density estimate of stock-unit sale prices.
plt.figure(figsize=(10, 6))
ax = sns.kdeplot(x="dsu_sale_price", data=stock_unit_df.to_pandas())
# A KDE plots an estimated density, not a count, so the axis is labelled accordingly.
plt.ylabel("Density")
plt.xlabel("Sale Price")
# format x-axis ticks in thousands of dollars (e.g. $250K)
ax.xaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f"${x / 1000:,.0f}K"))

In [None]:
stock_unit_df.filter(pl.col("dsu_sale_price") < 1).shape[0] / stock_unit_df.shape[0]

In [None]:
stock_unit_df.filter(pl.col("dsu_sale_price") < 1).filter(
    pl.col("dsu_group") == "ROW CROP TRACTOR"
).shape[0] / stock_unit_df.filter(pl.col("dsu_group") == "ROW CROP TRACTOR").shape[0]

In [None]:
stock_unit_df.filter(pl.col("dsu_sale_price") < 1).filter(
    pl.col("dsu_group") == "COMBINES"
).shape[0] / stock_unit_df.filter(pl.col("dsu_group") == "COMBINES").shape[0]

In [None]:
stock_unit_df.filter(pl.col("dsu_sale_price") < 1).filter(
    pl.col("dsu_group") == "ROW CROP TRACTOR"
).filter(~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"])).shape[
    0
] / stock_unit_df.filter(
    pl.col("dsu_group") == "ROW CROP TRACTOR"
).filter(
    ~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"])
).shape[
    0
]

In [None]:
stock_unit_df.filter(pl.col("dsu_sale_price") < 1).filter(
    pl.col("dsu_group") == "COMBINES"
).filter(~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"])).shape[
    0
] / stock_unit_df.filter(
    pl.col("dsu_group") == "COMBINES"
).filter(
    ~pl.col("dsu_invoice_number").is_in(["NOTAVA", "EQPADD", "N/A"])
).shape[
    0
]

## Service Work Orders

In [None]:
service_requests_df = translate_csv_to_common_model(
    "data/dealers/koenig/service-requests.csv",
    "koenig",
    "./src/transformation/semantic_layer.json",
    "service_requests",
)
service_requests_df = translate_koenig_service_requests(service_requests_df)
service_requests_df.head()

In [None]:
len(service_requests_df)

In [None]:
# group by year and count
service_requests_df["service_close_date"].dt.year().value_counts().sort(
    by="service_close_date", descending=False
)

In [None]:
1 / 30

In [None]:
service_requests_df["service_close_date"].dt.year().value_counts().mean()

In [None]:
service_requests_df["service_invoice_value"].sum()

In [None]:
service_requests_df["service_invoice_value"].mean()

In [None]:
sns.histplot(x="service_close_date", data=service_requests_df.to_pandas())
plt.ylabel("Count of Service Requests")
plt.xlabel("Service Close Date")

In [None]:
import matplotlib.dates as mdates

# Monthly service-request count and invoice total, keyed by the first day of the
# close month. The month key is derived with dt.truncate instead of a
# strftime/to_date string round-trip, and the rows are sorted by month because
# group_by does not return groups in a guaranteed order and the cells below treat
# this frame as a time series.
service_requests_monthly_df = (
    service_requests_df.group_by(
        pl.col("service_close_date").dt.truncate("1mo").cast(pl.Date).alias("service_close_month")
    )
    .agg(
        pl.count("service_order_id").alias("count"),
        pl.sum("service_invoice_value").alias("sum"),
    )
    .sort("service_close_month")
)

# plt.figure (lower case) registers the figure with pyplot so the size applies to the
# plot below; plt.Figure only constructs a detached Figure object.
plt.figure(figsize=(10, 6))
ax = sns.lineplot(
    x="service_close_month", y="count", data=service_requests_monthly_df.to_pandas()
)
plt.ylabel("Count of Service Requests")
plt.xlabel("Service Close Date")

# Set x-ticks to every 6 months
plt.xticks(rotation=45)
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
plt.show()

In [None]:
service_requests_monthly_df.head()

In [322]:
from statsmodels.tsa.seasonal import seasonal_decompose

# seasonal_decompose treats its input as an ordered series with a 12-step season,
# so the monthly counts are put in chronological order first (the group_by that
# built the frame does not guarantee row order). Rows without a close month cannot
# be placed on the time axis and are left out.
monthly_counts = (
    service_requests_monthly_df.filter(pl.col("service_close_month").is_not_null())
    .sort("service_close_month")["count"]
)
result = seasonal_decompose(monthly_counts, model="multiplicative", period=12)

In [None]:
result.plot()
plt.show()

In [None]:
sns.kdeplot(x="service_invoice_value", data=service_requests_df.to_pandas())

In [None]:
service_requests_df.filter(pl.col("service_invoice_value") < 0).shape[0] / service_requests_df.shape[0]

In [None]:
# Attach service requests to accounts: account -> its stock units -> service requests
# raised against those stock units.
# The second join is how="right", so every service request row is kept, including
# requests whose stock unit has no matching account.
# NOTE(review): confirm a right join (rather than left, keeping every account) is intended.
service_requests_account_df = account_df.join(
    stock_unit_df[["dealer_stock_unit_id", "dsu_account_id"]],
    left_on="account_id",
    right_on="dsu_account_id",
    how="left",
).join(
    service_requests_df[["service_order_id", "service_dealer_stock_unit_id", "service_invoice_value"]],
    left_on="dealer_stock_unit_id",
    right_on="service_dealer_stock_unit_id",
    how="right",
)
# Roll up to one row per account_id: invoice total, request count, and the account
# attributes taken from the first row of each group.
service_requests_account_df = service_requests_account_df.group_by("account_id").agg(
    [
        pl.sum("service_invoice_value").alias("total_invoice_value"),
        pl.count("service_order_id").alias("total_service_requests"),
        pl.first("customer_segment").alias("customer_segment"),
        pl.first("type_of_equipment").alias("type_of_equipment"),
        pl.first("customer_loyalty").alias("customer_loyalty"),
        pl.first("customer_business_class").alias("customer_business_class"),
        pl.first("engagement_level").alias("engagement_level"),
    ]
)

# NOTE(review): both columns come from sum/count aggregations, which presumably never
# yield null, so these fill_null calls look like no-ops — verify.
service_requests_account_df = service_requests_account_df.with_columns(
    pl.col("total_invoice_value").fill_null(0),
    pl.col("total_service_requests").fill_null(0),
)
service_requests_account_df.head()

In [None]:
service_requests_account_df.sort(by="total_invoice_value", descending=True).head()

In [None]:
service_requests_account_df.filter(pl.col("customer_business_class") == "A - Strategic Account")["total_invoice_value"].sum()

In [None]:
service_requests_account_df.group_by("customer_segment").agg(
    pl.sum("total_invoice_value"), pl.sum("total_service_requests"), pl.mean("total_invoice_value").alias("average_invoice_value")
).sort(by="total_invoice_value", descending=True)

In [None]:
5.9003e7 / 120385000

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Prepare the data: Cash Grain accounts that already carry a business class label.
# The filter is applied once so features and target are guaranteed to be row-aligned.
labelled_accounts = account_level_df.filter(pl.col("customer_segment") == "Cash Grain").filter(
    pl.col("customer_business_class").is_not_null()
)
features = labelled_accounts.select(
    ["total_units", "total_sale_amount", "average_equipment_age"]
).to_pandas()
# Wrap the class labels in a pandas Categorical (the label strings are kept as-is).
target = pd.Categorical(labelled_accounts["customer_business_class"].to_pandas())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=69)

# Handle missing values with the training-set means only; imputing before the split
# would let the held-out rows influence the values the model is trained on.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Standardize the features (scaler is fitted on the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
len(
    account_level_df.filter(pl.col("customer_segment") == "Cash Grain").filter(
        pl.col("customer_business_class").is_not_null()
    )
)

In [None]:
len(
    account_df.filter(pl.col("customer_segment") == "Cash Grain").filter(
        pl.col("customer_business_class").is_not_null()
    )
)

In [None]:
# Score every Cash Grain account (with or without an existing label) using the
# scaler and KNN model fitted in the earlier cell.
model_inputs = ["total_units", "total_sale_amount", "average_equipment_age"]
features = (
    account_level_df.filter(pl.col("customer_segment") == "Cash Grain")
    .select(["account_id", *model_inputs, "customer_business_class"])
    .to_pandas()
)
# Mean-impute the numeric model inputs only; account_id and customer_business_class
# are left untouched.
features[model_inputs] = features[model_inputs].fillna(features[model_inputs].mean())

preds = knn.predict(scaler.transform(features[model_inputs]))
features["predicted_business_class"] = preds
features.head()

In [None]:
features.loc[features["customer_business_class"] != features["predicted_business_class"]].sort_values(by="total_sale_amount", ascending=False)

In [None]:
# Accounts labelled as Relationship that the model predicts as Strategic, largest spend first.
# NOTE(review): the label here is "R - Relationship Account", but the scatter plot
# earlier in this notebook filters on "C - Relationship Account" — confirm which
# spelling exists in the data, otherwise this selection is empty.
features.loc[
    (features["customer_business_class"].isin(["R - Relationship Account"]))
    & (features["predicted_business_class"].isin(["A - Strategic Account"]))
].sort_values(by="total_sale_amount", ascending=False)

In [None]:
# Accounts labelled as Strategic that the model predicts as Relationship, largest spend first.
# NOTE(review): the label here is "R - Relationship Account", but the scatter plot
# earlier in this notebook filters on "C - Relationship Account" — confirm which
# spelling exists in the data, otherwise this selection is empty.
features.loc[
    (features["customer_business_class"].isin(["A - Strategic Account"]))
    & (features["predicted_business_class"].isin(["R - Relationship Account"]))
].sort_values(by="total_sale_amount", ascending=False)

In [None]:
import numpy as np

# Six trailing monthly values, in the order they were entered.
trailing_6_months = [280.31, 269.30, 276.29, 277.64, 268.65, 277.77]

# Summary statistics: mean, smallest and largest value.
average = sum(trailing_6_months) / len(trailing_6_months)
min_value, max_value = min(trailing_6_months), max(trailing_6_months)

# Step-to-step change: each value minus the one before it in the list.
monthly_increase_trend = [
    current - previous
    for previous, current in zip(trailing_6_months, trailing_6_months[1:])
]

print(f"Average: {average}, Min: {min_value}, Max: {max_value}, Monthly Increase Trend: {np.mean(monthly_increase_trend)}")

In [None]:
import numpy as np

# A second set of six trailing monthly values, in the order they were entered.
trailing_6_months = [10.49, 82.89, 411.01, 99.92, 93.46, 1063.70]

# Summary statistics: mean, smallest and largest value.
n_months = len(trailing_6_months)
average = sum(trailing_6_months) / n_months
min_value = min(trailing_6_months)
max_value = max(trailing_6_months)

# Step-to-step change: each value minus the one before it in the list.
monthly_increase_trend = [
    later - earlier
    for earlier, later in zip(trailing_6_months[:-1], trailing_6_months[1:])
]

summary = f"Average: {average}, Min: {min_value}, Max: {max_value}, Monthly Increase Trend: {np.mean(monthly_increase_trend)}"
print(summary)

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Fit a straight line through the trailing values to estimate a trend.
# This cell reads trailing_6_months from whichever earlier cell was run last.
# NOTE(review): the values are reversed ([::-1]) before fitting, whereas the
# "monthly increase trend" cells above take differences in list order — confirm
# which ordering is chronological, since it flips the sign of the slope.

# Prepare the data
X = np.arange(len(trailing_6_months)).reshape(-1, 1)
y = np.array(trailing_6_months[::-1])

# Create and train the model
model = LinearRegression()
model.fit(X, y)

# Make predictions
trend = model.predict(X)

# Print the coefficients
print(f"Intercept: {model.intercept_}, Slope: {model.coef_[0]}")

# Plot the results
plt.figure(figsize=(10, 6))
plt.plot(X, y, 'o', label='Original data')
plt.plot(X, trend, 'r', label='Fitted line')
plt.xlabel('Month')
plt.ylabel('Value')
plt.title('Trend in Trailing 6 Months')
plt.legend()
plt.show()

In [None]:
171211 / 373298     