# Koenig Data Exploration

Reviewing the data in Koenig's instance for data-quality issues, missing data, and viability for analytics/modeling.

In [None]:
%load_ext dotenv
%dotenv
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
import seaborn as sns

from src.transformation.translate import (
    translate_csv_to_common_model,
    translate_keonig_customer_equipment,
    translate_koenig_account_columns,
    translate_koenig_purchase_orders,
    translate_koenig_service_requests,
    translate_koenig_stock_unit,
    translate_koenig_store,
    translate_koenig_user,
)
from src.transformation.category import CleanMakeModelData

## Account

In [None]:
account_df = translate_csv_to_common_model("data/dealers/koenig/account.csv", "koenig", "./src/transformation/semantic_layer.json", "account")
account_df = translate_koenig_account_columns(account_df)
account_df.head()

In [None]:
len(account_df)

In [None]:
account_df["county"].value_counts().sort("count", descending=True)

In [None]:
account_df["aor_customer"].value_counts().sort("count", descending=True)

In [None]:
account_df["primary_store_location"].value_counts().sort("count", descending=True)

In [None]:
account_df["primary_store_location"].n_unique()

There are 17 unique values, one of which is null, leaving 16 distinct locations. Koenig lists 15 locations, so the 16th is probably corporate.

In [None]:
len(account_df.filter(pl.col("parent_id").is_not_null())) / len(account_df)

In [None]:
# Describe engagement_level: count of accounts per value, most frequent first
account_df["engagement_level"].value_counts().sort("count", descending=True)

In [None]:
account_df["technology_rep"].value_counts().sort("count", descending=True)

In [None]:
account_df["customer_segment"].value_counts().sort("count", descending=True)

In [None]:
len(account_df.filter(pl.col("customer_segment").is_not_null())) / len(account_df)

Only 22% of accounts have a segment associated with them

In [None]:
sns.histplot(x="customer_segment", data=account_df.to_pandas())
plt.title("Customer Segment Distribution")
plt.xticks(rotation=90)
plt.ylabel("")
plt.xlabel("")

In [None]:
sns.histplot(x="type_of_equipment", data=account_df.to_pandas())
plt.title("Type of Equipment")
plt.ylabel("")
plt.xlabel("")

In [None]:
account_df["type_of_equipment"].value_counts().sort("count", descending=True)

In [None]:
account_df["customer_loyalty"].value_counts().sort("count", descending=True)

In [None]:
customer_loyalty_counts = account_df["customer_loyalty"].value_counts().sort("count", descending=True).to_pandas()
sns.barplot(x="customer_loyalty", y="count", data=customer_loyalty_counts)
plt.title("Customer Loyalty")
plt.xticks(rotation=90)
plt.ylabel("")
plt.xlabel("")

In [None]:
len(account_df.filter(pl.col("customer_loyalty").is_not_null())) / len(account_df)

In [None]:
len(account_df.filter(pl.col("call_frequency").is_not_null())) / len(account_df)

In [None]:
customer_call_frequency_df = (
    account_df["call_frequency"]
    .value_counts()
    .sort("count", descending=True)
    .to_pandas()
)
sns.barplot(x="call_frequency", y="count", data=customer_call_frequency_df)
plt.title("Customer Call Frequency")
plt.xticks(rotation=90)
plt.ylabel("")
plt.xlabel("")

In [None]:
account_df["customer_business_class"].value_counts().sort("count", descending=True)

In [None]:
customer_business_class_df = (
    account_df["customer_business_class"]
    .value_counts()
    .sort("count", descending=True)
    .to_pandas()
)
sns.barplot(x="customer_business_class", y="count", data=customer_business_class_df)
plt.title("Customer Business Class")
plt.xticks(rotation=90)
plt.ylabel("")
plt.xlabel("")

In [None]:
len(account_df.filter(pl.col("customer_business_class").is_null())) / len(
    account_df
)

In [None]:
customer_engagement_level_df = (
    account_df["engagement_level"]
    .value_counts()
    .sort("count", descending=True)
    .to_pandas()
)
sns.barplot(x="engagement_level", y="count", data=customer_engagement_level_df)
plt.title("Customer Engagement")
plt.xticks(rotation=90)
plt.ylabel("")
plt.xlabel("")

In [None]:
len(account_df.filter(pl.col("engagement_level").is_null())) / len(account_df)

In [None]:
len(account_df.filter(pl.col("billing_postal_code").is_null())) / len(account_df)

In [None]:
# Distribution of billing postal code lengths (nulls excluded), most frequent first.
# `.str.len()` is the vectorised pandas equivalent of applying `len` to every value.
(
    account_df["billing_postal_code"]
    .drop_nulls()
    .to_pandas()
    .str.len()
    .value_counts()
    .sort_values(ascending=False)
)

5-digit ZIP codes are the typical case, and 9 digits is the complete ZIP+4 without the hyphen. The 10-character values are presumably ZIP+4 codes that include the hyphen.

## Dealer Stock Unit

In [None]:
stock_unit_df = translate_csv_to_common_model(
    "data/dealers/koenig/dealer-stock-unit.csv",
    "koenig",
    "./src/transformation/semantic_layer.json",
    "dealer_stock_unit",
)
stock_unit_df = translate_koenig_stock_unit(stock_unit_df)
stock_unit_df.head()

In [None]:
stock_unit_df.filter((pl.col("dsu_make") == "JD") & (pl.col("dsu_model") == "DT75"))

In [None]:
stock_unit_df.filter((pl.col("dsu_make") == "JD") & (pl.col("dsu_model") == "1775"))["dsu_group"].value_counts().sort("count", descending=True)

In [None]:
# Look up the cleaned make/model rows for John Deere model "4010".
# NOTE(review): `clean_make_model` is first assigned in the following cell
# (`CleanMakeModelData()`), so on a fresh top-to-bottom run this cell raises
# NameError — run that cell first.
clean_make_model.make_model_data.filter(pl.col("make") == "John Deere").filter(pl.col("model") == "4010")

In [None]:
clean_make_model = CleanMakeModelData()
clean_make_model.create_aggregated_data(stock_unit_df, make_col="dsu_make", model_col="dsu_model", group_col="dsu_group")

In [None]:
clean_make_model.make_model_data.filter(pl.col("make") == "John Deere").filter(
    pl.col("model").str.to_lowercase().str.contains("75".lower())
)

In [None]:
clean_make_model.aggregated_data.filter(pl.col("category") == "Unknown")

In [None]:
clean_make_model.aggregated_data.filter(pl.col("model") == "1775")

In [None]:
clean_make_model.clean_make_model_data("JD", "15GA", "GRAIN HAULING")

In [31]:
# Parse the string date columns (format YYYY-MM-DD) into Date values via `str.to_date`.
# NOTE(review): `updated_df` and `date_cols` are not assigned in any earlier cell of this
# notebook — presumably left over from a removed cell; define them before running this.
# NOTE(review): `strict=False` presumably turns unparseable strings into nulls instead of
# raising — TODO confirm against the installed polars version.
updated_df = updated_df.with_columns(
    **{col: pl.col(col).str.to_date(format="%Y-%m-%d", strict=False) for col in date_cols},
)

In [None]:
updated_df.head()

In [None]:
# Apply the Koenig stock-unit translation and preview the result.
# NOTE(review): `translate_koenig_stock_unit` was already applied to `stock_unit_df`
# in the cell that loaded it; running it again here assumes the translation is
# idempotent — TODO confirm, or reload the CSV before this cell.
stock_unit_df = translate_koenig_stock_unit(stock_unit_df)
stock_unit_df.head()

In [None]:
stock_unit_df.filter(pl.col("dsu_sales_date").is_null()).shape[0] / stock_unit_df.shape[0]

In [None]:
stock_unit_df.filter(pl.col("dealer_stock_unit_id") == "a065f000003fI2WAAU")

In [None]:
stock_unit_df.filter(pl.col("dsu_sales_date").is_not_null()).sort(
    by="dsu_sales_date", descending=False,
).head()

In [None]:
sns.histplot(x="dsu_sales_date", data=stock_unit_df.to_pandas())

In [None]:
most_common_sold_date = stock_unit_df.to_pandas()["dsu_sales_date"].value_counts().idxmax()
print(f"The most common dsu_sales_date is: {most_common_sold_date}")

In [None]:
filtered_df = stock_unit_df.filter(pl.col("dsu_sales_date") == most_common_sold_date)
filtered_df.head()

In [None]:
sns.histplot(x="dsu_model_year", data=stock_unit_df.to_pandas())

In [None]:
sns.histplot(x="dsu_model_year", data=stock_unit_df.filter(pl.col("dsu_model_year") != 0).to_pandas())

In [None]:
len(stock_unit_df.filter(pl.col("dsu_model_year").is_not_null())) / stock_unit_df.shape[0]

In [None]:
len(stock_unit_df.filter(pl.col("dsu_model_year") == 0)) / stock_unit_df.shape[0]

In [None]:
len(
    stock_unit_df.filter(pl.col("dsu_serial_number").is_not_null()),
) / stock_unit_df.shape[0]

In [None]:
len(stock_unit_df.filter(pl.col("dsu_new_used").is_not_null())) / stock_unit_df.shape[0]

In [None]:
sns.histplot(x="dsu_new_used", data=stock_unit_df.to_pandas())

In [None]:
len(stock_unit_df.filter(pl.col("dsu_sub_group").is_not_null())) / stock_unit_df.shape[
    0
]

In [None]:
sns.histplot(x="dsu_sub_group", data=stock_unit_df.to_pandas())
plt.xticks(rotation=90)

In [None]:
len(
    stock_unit_df.filter(pl.col("dsu_hours_or_units").is_not_null()),
) / stock_unit_df.shape[0]

In [None]:
len(
    stock_unit_df.filter(pl.col("dsu_hours_or_units") == 0),
) / stock_unit_df.shape[0]

In [None]:
sns.histplot(
    x="dsu_hours_or_units",
    data=stock_unit_df.filter(pl.col("dsu_hours_or_units") != 0).to_pandas(),
)

In [None]:
stock_unit_df.filter(pl.col("dsu_hours_or_units").is_not_null()).sort(
    by="dsu_hours_or_units", descending=True,
).head()

In [None]:
sns.histplot(
    x="dsu_hours_or_units",
    data=stock_unit_df.filter(pl.col("dsu_hours_or_units") != 0).filter(pl.col("dsu_hours_or_units") < 15000).to_pandas(),
)

In [None]:
len(stock_unit_df.filter(pl.col("dsu_hours_or_units") < 15000).filter(pl.col("dsu_hours_or_units") != 0)) / stock_unit_df.shape[0]

In [None]:
# Print every stock-unit column next to its polars dtype.
types = stock_unit_df.dtypes
columns = stock_unit_df.columns
# Pair names and dtypes directly instead of indexing both lists by position.
print([f"{name}: {dtype}" for name, dtype in zip(columns, types)])

In [245]:
def _most_common_non_null(series: pl.Series):
    """Return the most frequent non-null value of ``series``, or ``None`` if it has none."""
    modes = series.drop_nulls().mode()
    if len(modes) == 0:
        return None
    # `mode()` can return several tied values; take the smallest so that
    # repeated runs report the same value.
    return modes.sort().to_list()[0]


def eda_polars(df: pl.DataFrame) -> pl.DataFrame:
    """Build a per-column exploratory summary of ``df``.

    One output row is produced for every column except a column literally
    named ``"_"``. Each row always carries ``field_name`` and
    ``missing_data_rate``; further fields depend on the column dtype:

    * integer / float columns: ``mean``, ``std``, ``min``, ``max``, ``median``,
      ``25th_percentile``, ``75th_percentile``, ``zero_count``,
      ``non_null_count``
    * ``Date`` / ``Datetime`` columns: ``most_common_non_null_date``,
      ``count_of_most_common_date``, ``unique_dates``
    * string columns: ``most_common_non_null_value``,
      ``count_of_most_common_value``, ``unique_values``
    * ``Boolean`` columns: ``true_count``, ``false_count``, ``missing_count``

    Args:
        df: Frame to profile.

    Returns:
        A frame with one row per profiled column. Fields that do not apply to
        a column's dtype are null in that row.
    """
    numeric_dtypes = (
        pl.Int8, pl.Int16, pl.Int32, pl.Int64,
        pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
        pl.Float32, pl.Float64,
    )
    n_rows = df.height
    results = []

    for col in df.columns:
        if col == "_":
            continue

        series = df[col]
        dtype = series.dtype
        null_count = series.null_count()

        col_data = {
            "field_name": col,
            # A zero-row frame has no meaningful rate; report null instead of
            # dividing by zero.
            "missing_data_rate": null_count / n_rows if n_rows else None,
        }

        if dtype in numeric_dtypes:
            col_data.update({
                "mean": series.mean(),
                "std": series.std(),
                "min": series.min(),
                "max": series.max(),
                "median": series.median(),
                "25th_percentile": series.quantile(0.25),
                "75th_percentile": series.quantile(0.75),
                "zero_count": len(series.filter(series == 0)),
                "non_null_count": len(series) - null_count,
            })

        if dtype == pl.Date or dtype == pl.Datetime:
            most_common_date = _most_common_non_null(series)
            if most_common_date is not None:
                most_common_value_count = len(series.filter(series == most_common_date))
            else:
                # No non-null value exists: keep the report's existing
                # convention of recording the number of nulls in this field.
                most_common_value_count = null_count
            col_data.update({
                "most_common_non_null_date": most_common_date,
                "count_of_most_common_date": most_common_value_count,
                "unique_dates": series.n_unique(),
            })

        if dtype == pl.Utf8:  # string columns
            most_common_value = _most_common_non_null(series)
            # Avoid comparing the series against None when the column holds
            # no non-null value; the count is simply zero.
            most_common_value_count = (
                len(series.filter(series == most_common_value))
                if most_common_value is not None
                else 0
            )
            col_data.update({
                "most_common_non_null_value": most_common_value,
                "count_of_most_common_value": most_common_value_count,
                "unique_values": series.n_unique(),
            })

        if dtype == pl.Boolean:
            col_data.update({
                # Filtering by the boolean mask itself drops nulls.
                "true_count": len(series.filter(series)),
                "false_count": len(series.filter(~series)),
                "missing_count": null_count,
            })

        results.append(col_data)

    # Rows carry different keys depending on dtype; infer the schema from
    # every row rather than only the leading ones.
    eda_df = pl.DataFrame(results, infer_schema_length=None)

    return eda_df

In [None]:
stock_unit_df["dsu_model_year"]

In [None]:
eda_results = eda_polars(stock_unit_df)
eda_results

In [None]:
stock_unit_df["dsu_serial_number"].value_counts().filter(pl.col("count") > 5).sum()["count"][0] / stock_unit_df.shape[0]

In [None]:
stock_unit_df["dsu_serial_number"].value_counts().filter(pl.col("count") > 5)

In [None]:
stock_unit_df.filter(pl.col("dsu_sale_price") < 1)["dsu_model"].value_counts().sort(by="count", descending=True).head(20)

In [None]:
stock_unit_df["dsu_make"].value_counts().sort(
    by="count", descending=True
).head(5)

In [None]:
stock_unit_df["dsu_serial_number"].value_counts().sort(by="count", descending=True).head(5)

In [None]:
stock_unit_df.filter(pl.col("dsu_sale_price") < 1)

In [None]:
stock_unit_df.filter(pl.col("dsu_sale_price") < 1).filter(pl.col("dsu_group").is_in(["4WD TRACTOR", "ROW CROP TRACTOR"]))

In [None]:
stock_unit_df.filter(pl.col("dsu_sale_price") < 1).filter(pl.col("dsu_group").is_in(["4WD TRACTOR", "ROW CROP TRACTOR"])).shape[0] / stock_unit_df.filter(pl.col("dsu_group").is_in(["4WD TRACTOR", "ROW CROP TRACTOR"])).shape[0]

In [None]:
sns.histplot(
    x="dsu_sales_date",
    data=stock_unit_df.filter(pl.col("dsu_sale_price") < 1)
    .filter(pl.col("dsu_group").is_in(["4WD TRACTOR", "ROW CROP TRACTOR"]))
    .to_pandas(),
)

In [None]:
stock_unit_df.filter(pl.col("dsu_hours_or_units") < 1).filter(
    pl.col("dsu_group").is_in(["4WD TRACTOR", "ROW CROP TRACTOR"]),
).shape[0] / stock_unit_df.filter(
    pl.col("dsu_group").is_in(["4WD TRACTOR", "ROW CROP TRACTOR"]),
).shape[
    0
]

In [None]:
stock_unit_df.filter(pl.col("dsu_hours_or_units") < 1).filter(
    pl.col("dsu_group").is_in(["4WD TRACTOR", "ROW CROP TRACTOR"]),
)["dsu_new_used"].value_counts()

In [None]:
stock_unit_df.filter(pl.col("dsu_invoice_number") == "NOTAVA")["dsu_sales_date"].value_counts().sort(by="count", descending=True)

In [None]:
stock_unit_df.filter(pl.col("dsu_invoice_number") == "EQPADD")[
    "dsu_sale_price"
].value_counts().sort(by="count", descending=True)

In [None]:
stock_unit_df.filter(pl.col("dsu_invoice_number") == "EQPADD")["dsu_sale_price"].describe()

In [None]:
stock_unit_df.filter(pl.col("dsu_invoice_number") == "NOTAVA")[
    "dsu_sales_date"
].count()

In [None]:
stock_unit_df.filter(pl.col("dsu_invoice_number") == "NOTAVA").filter(
    pl.col("dsu_sale_price") < 1,
)

In [None]:
stock_unit_df.filter(pl.col("dsu_sales_date") >= pd.to_datetime("2014-01-01"))

In [None]:
# Share of 4WD / row-crop tractors sold on or after 2014-01-01 whose recorded
# sale price is below 1.
tractors_since_2014 = stock_unit_df.filter(
    pl.col("dsu_sales_date") >= pd.to_datetime("2014-01-01"),
).filter(
    pl.col("dsu_group").is_in(["4WD TRACTOR", "ROW CROP TRACTOR"]),
)
zero_price_tractors = tractors_since_2014.filter(pl.col("dsu_sale_price") < 1)
zero_price_tractors.shape[0] / tractors_since_2014.shape[0]

In [None]:
len(
    stock_unit_df.filter(pl.col("dsu_sales_date") >= pd.to_datetime("2014-01-01"))
    .filter(pl.col("dsu_sale_price") < 1)
    .filter(pl.col("dsu_group").is_in(["4WD TRACTOR", "ROW CROP TRACTOR"])),
)

In [None]:
stock_unit_df.filter(pl.col("dsu_sales_date") >= pd.to_datetime("2014-01-01")).filter(pl.col("dsu_sale_price") < 1).filter(pl.col("dsu_group").is_in(["4WD TRACTOR", "ROW CROP TRACTOR"]))["dsu_account_id"].n_unique()

In [247]:
def _load_koenig_csv(csv_path, entity):
    """Translate one Koenig CSV export into the common model for ``entity``."""
    return translate_csv_to_common_model(
        csv_path,
        "koenig",
        "./src/transformation/semantic_layer.json",
        entity,
    )


# Step 1: read every Koenig extract through the shared semantic layer.
account_df = _load_koenig_csv("data/dealers/koenig/account.csv", "account")
customer_equipment_df = _load_koenig_csv(
    "data/dealers/koenig/customer-equipment.csv", "customer_equipment",
)
stock_unit_df = _load_koenig_csv(
    "data/dealers/koenig/dealer-stock-unit.csv", "dealer_stock_unit",
)
purchase_orders_df = _load_koenig_csv(
    "data/dealers/koenig/purchase-order.csv", "purchase_order",
)
service_requests_df = _load_koenig_csv(
    "data/dealers/koenig/service-requests.csv", "service_requests",
)
user_df = _load_koenig_csv("data/dealers/koenig/user.csv", "user")
store_df = _load_koenig_csv("data/dealers/koenig/store.csv", "store")

# Step 2: apply the Koenig-specific translation for each table.
account_df = translate_koenig_account_columns(account_df)
customer_equipment_df = translate_keonig_customer_equipment(customer_equipment_df)
stock_unit_df = translate_koenig_stock_unit(stock_unit_df)
purchase_orders_df = translate_koenig_purchase_orders(purchase_orders_df)
service_requests_df = translate_koenig_service_requests(service_requests_df)
user_df = translate_koenig_user(user_df)
store_df = translate_koenig_store(store_df)

In [248]:
# Run the column-level EDA for every Koenig table and write one CSV report per table.
from pathlib import Path

eda_output_dir = Path("data/dealers/koenig/eda")
# pandas' to_csv does not create missing parent directories, so make sure the
# report directory exists before writing.
eda_output_dir.mkdir(parents=True, exist_ok=True)

# Report name -> frame; the name becomes the "<name>_eda.csv" file stem.
eda_tables = {
    "stock_unit": stock_unit_df,
    "account": account_df,
    "customer_equipment": customer_equipment_df,
    "purchase_orders": purchase_orders_df,
    "service_requests": service_requests_df,
    "user": user_df,
    "store": store_df,
}
for table_name, table_df in eda_tables.items():
    eda_results = eda_polars(table_df)
    eda_results.to_pandas().to_csv(eda_output_dir / f"{table_name}_eda.csv", index=False)

In [None]:
user_df.dtypes