# Cohort Analysis

Customers are grouped into cohorts by the date of their first purchase, and the cohorts are then compared across product categories to identify differences in behavior: purchase frequency, purchase amount, age of equipment, etc.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Read the source CSV, then convert the invoice date column to datetimes.
csv_path = "../data/cycle-21st-century-data.csv"
invoice_date_col = "Customer Invoice Date"

df = pd.read_csv(csv_path)
df[invoice_date_col] = pd.to_datetime(df[invoice_date_col])

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Number of rows per distinct invoice date, plotted in chronological order.
rows_per_invoice_date = df["Customer Invoice Date"].value_counts().sort_index()
rows_per_invoice_date.plot()

In [None]:
# Build a month-resolution purchase_date from the "Buying Year" and "Month Number" columns.
# The month is zero-padded and the format is passed explicitly so every row is parsed
# as year-month instead of relying on per-value format inference of strings like "2015-3".
purchase_year = df["Buying Year"].astype(str)
purchase_month = df["Month Number"].astype(str).str.zfill(2)
df["purchase_date"] = pd.to_datetime(purchase_year + "-" + purchase_month, format="%Y-%m")

# Earliest purchase_date of each account_number, repeated on every row of that account.
df["first_purchase_date"] = df.groupby("account_number")["purchase_date"].transform("min")

# Assign customers to cohorts based on the year and quarter of their first purchase
df["cohort_year"] = df["first_purchase_date"].dt.year
df["cohort_quarter"] = df["first_purchase_date"].dt.to_period("Q")

# Create a cohort_id column ("<year>_Q<quarter>") to identify customers' first purchase cohort
df["cohort_id"] = df["cohort_year"].astype(str) + "_Q" + df["first_purchase_date"].dt.quarter.astype(str)

# Deliverable: A table with customer IDs, cohort assignment, and their corresponding first purchase date
cohort_table = df[["account_number", "cohort_id", "first_purchase_date"]].drop_duplicates()

# Bare expression (rather than print) so the notebook renders the table with its rich display.
cohort_table.head()

In [None]:
# Preview the first five rows of df.
df.head(5)

### Looking at Tractor Cohorts

Comparing the different temporal (first-purchase-year) cohorts of customers that purchase Large Tractors.

In [None]:
# Rows whose product group is "LARGE TRACTOR", copied so the columns added below do not touch df.
large_tractor_df = df[df["combined_product_group"] == "LARGE TRACTOR"].copy()

# must have at least 2 purchases to be considered a repeat customer
purchases_per_account = large_tractor_df["account_number"].value_counts()
repeat_accounts = purchases_per_account[purchases_per_account > 1].index

# Keep repeat accounts and order each account's rows by invoice date.
# sort_values returns a new frame, so no in-place mutation of a filtered slice is needed.
large_tractor_df = (
    large_tractor_df[large_tractor_df["account_number"].isin(repeat_accounts)]
    .sort_values(by=["account_number", "Customer Invoice Date"])
)

# Gap in buying years to the same account's previous row (NaN on each account's first row).
# Accounts are keyed by account_number here, the same key used for the repeat-customer
# filter above and for first_purchase_date, so the gap never mixes different accounts.
# The frame holds a single product group, so grouping by the account alone is sufficient.
large_tractor_df["previous_purchase_year"] = (
    large_tractor_df.groupby("account_number")["Buying Year"].shift(1)
)
large_tractor_df["year_from_previous_purchase"] = (
    large_tractor_df["Buying Year"] - large_tractor_df["previous_purchase_year"]
)
large_tractor_df.head()

In [None]:
# Histogram of the year gap between consecutive purchases, with a KDE overlay.
sns.histplot(data=large_tractor_df, x="year_from_previous_purchase", kde=True)

In [None]:
# Rows with a gap of 0 years to the previous purchase, counted per account_name;
# show the 15 names with the highest non-null account_number count.
same_year_repeat = large_tractor_df["year_from_previous_purchase"] == 0
(
    large_tractor_df.loc[same_year_repeat]
    .groupby("account_name")
    .count()
    .sort_values(by="account_number", ascending=False)
    .head(15)
)

In [None]:
# Number of distinct account_number values per cohort year.
cohort_counts = (
    large_tractor_df
    .groupby(["cohort_year"])["account_number"]
    .nunique()
    .reset_index()
)
cohort_counts.head()

In [None]:
# Row count per buying year, in year order, as a bar chart.
large_tractor_rows_by_year = large_tractor_df["Buying Year"].value_counts().sort_index()
large_tractor_rows_by_year.plot(kind="bar", title="Large Tractor Sales by Year")

In [None]:
# Mean year gap between consecutive purchases per cohort year.
# Rows without a previous purchase (NaN gap) are left out before averaging.
has_previous_purchase = large_tractor_df["year_from_previous_purchase"].notna()
average_year_from_previous_purchase = (
    large_tractor_df.loc[has_previous_purchase]
    .groupby(["cohort_year"])["year_from_previous_purchase"]
    .mean()
    .reset_index()
)

sns.lineplot(
    data=average_year_from_previous_purchase,
    x="cohort_year",
    y="year_from_previous_purchase",
)
plt.xlabel("Cohort Year")
plt.ylabel("Average Time Between Purchases (Years)")
plt.title("Average Time Between Purchases for Large Tractor Cohorts")

Within the year cohorts, is there a difference in behavior when looking only at Used equipment purchases?

In [None]:
# Number of distinct account_number values per cohort year, restricted to rows
# whose "New/Used" value is "Used".
is_used = large_tractor_df["New/Used"] == "Used"
cohort_counts = (
    large_tractor_df.loc[is_used]
    .groupby(["cohort_year"])["account_number"]
    .nunique()
    .reset_index()
)
cohort_counts.head()

In [None]:
# Mean year gap between consecutive purchases per cohort year, for "Used" rows only.
# Rows without a previous purchase (NaN gap) are left out before averaging.
has_previous_purchase = large_tractor_df["year_from_previous_purchase"].notna()
is_used = large_tractor_df["New/Used"] == "Used"
average_year_from_previous_purchase = (
    large_tractor_df.loc[has_previous_purchase & is_used]
    .groupby(["cohort_year"])["year_from_previous_purchase"]
    .mean()
    .reset_index()
)

sns.lineplot(
    data=average_year_from_previous_purchase,
    x="cohort_year",
    y="year_from_previous_purchase",
)
plt.xlabel("Cohort Year")
plt.ylabel("Average Time Between Purchases (Years)")
plt.title("Average Time Between Purchases for Large Tractor Cohorts - Used")

In [None]:
# Mean "Machine Age" per cohort year for "Used" rows that have a previous purchase
# (rows with a NaN year_from_previous_purchase are excluded, as in the gap analysis).
# The result gets its own name: it holds machine ages, not purchase gaps.
has_previous_purchase = large_tractor_df["year_from_previous_purchase"].notna()
is_used = large_tractor_df["New/Used"] == "Used"
average_machine_age = (
    large_tractor_df.loc[has_previous_purchase & is_used]
    .groupby(["cohort_year"])["Machine Age"]
    .mean()
    .reset_index()
)

sns.lineplot(data=average_machine_age, x="cohort_year", y="Machine Age")
plt.xlabel("Cohort Year")
plt.ylabel("Average Age of Equipment (Years)")
plt.title("Average Age of Equipment for Large Tractor Cohorts - Used")

### Combines

In [None]:
# Rows whose product group is "COMBINES", copied so the columns added below do not touch df.
combines_df = df[df["combined_product_group"] == "COMBINES"].copy()

# The cohort cells further down read combines_df.year_from_previous_purchase, which df
# does not carry, so the gap columns are computed here. Rows are ordered by invoice date
# within each account_number (the key used for first_purchase_date); the gap is the
# difference in "Buying Year" to that account's previous row. An account's first row has
# no previous purchase and gets NaN. No rows are dropped.
combines_df = combines_df.sort_values(by=["account_number", "Customer Invoice Date"])
combines_df["previous_purchase_year"] = (
    combines_df.groupby("account_number")["Buying Year"].shift(1)
)
combines_df["year_from_previous_purchase"] = (
    combines_df["Buying Year"] - combines_df["previous_purchase_year"]
)
combines_df.head()

In [None]:
# Row count per "Machine Age" value, in ascending age order, as a bar chart.
combine_rows_by_age = combines_df["Machine Age"].value_counts().sort_index()
combine_rows_by_age.plot(kind="bar", title="Combine Sales by Equipment Age")

In [None]:
# Number of distinct account_number values per cohort year among combine rows.
cohort_counts = (
    combines_df
    .groupby(["cohort_year"])["account_number"]
    .nunique()
    .reset_index()
)
cohort_counts.head()

In [None]:
# Mean year gap between consecutive combine purchases per cohort year.
# Rows with a NaN gap are left out before averaging.
has_previous_purchase = combines_df.year_from_previous_purchase.notna()
average_year_from_previous_purchase = (
    combines_df.loc[has_previous_purchase]
    .groupby(["cohort_year"])["year_from_previous_purchase"]
    .mean()
    .reset_index()
)

sns.lineplot(
    data=average_year_from_previous_purchase,
    x="cohort_year",
    y="year_from_previous_purchase",
)
plt.xlabel("Cohort Year")
plt.ylabel("Average Time Between Purchases (Years)")
plt.title("Average Time Between Purchases for Combine Cohorts")

In [None]:
# Mean year gap between consecutive combine purchases per cohort year, for "Used" rows only.
# Rows with a NaN gap are left out before averaging.
has_previous_purchase = combines_df.year_from_previous_purchase.notna()
is_used = combines_df["New/Used"] == "Used"
average_year_from_previous_purchase = (
    combines_df.loc[has_previous_purchase & is_used]
    .groupby(["cohort_year"])["year_from_previous_purchase"]
    .mean()
    .reset_index()
)

sns.lineplot(
    data=average_year_from_previous_purchase,
    x="cohort_year",
    y="year_from_previous_purchase",
)
plt.xlabel("Cohort Year")
plt.ylabel("Average Time Between Purchases (Years)")
plt.title("Average Time Between Purchases for Combine Cohorts - Used")

In [None]:
# Mean "Machine Age" per cohort year for "Used" combine rows that have a previous purchase
# (rows with a NaN year_from_previous_purchase are excluded, as in the gap analysis).
# The result gets its own name: it holds machine ages, not purchase gaps.
has_previous_purchase = combines_df["year_from_previous_purchase"].notna()
is_used = combines_df["New/Used"] == "Used"
average_machine_age = (
    combines_df.loc[has_previous_purchase & is_used]
    .groupby(["cohort_year"])["Machine Age"]
    .mean()
    .reset_index()
)

sns.lineplot(data=average_machine_age, x="cohort_year", y="Machine Age")
plt.xlabel("Cohort Year")
plt.ylabel("Average Age of Equipment (Years)")
plt.title("Average Age of Equipment for Combine Cohorts - Used")