# Cluster Segmentation & Analysis

In [1]:
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from fuzzywuzzy import fuzz
from sklearn.preprocessing import StandardScaler
from utilities.sql import SQL

In [2]:
# Load variables from a local .env file into the process environment.
load_dotenv()
# Project-local SQL helper (utilities.sql); presumably picks up its connection
# settings from the environment loaded above — TODO confirm.
sql = SQL()

In [None]:
# Read the cleaned invoice export and convert the invoice date column to datetimes.
df = pd.read_csv("../data/clean-21st-century-data.csv").assign(
    **{"Customer Invoice Date": lambda frame: pd.to_datetime(frame["Customer Invoice Date"])}
)
df.head()

In [None]:
# Reference table of every make/model pair with its lot category and subcategory names.
# All joins are inner joins, so models lacking a category or subcategory row are excluded.
query = """SELECT m.name AS make, mm.model AS model, m.id AS make_id, mm.id AS model_id, lc.name AS category, ls.name AS subcategory
FROM make m 
JOIN make_model mm ON m.id = mm.make_id 
JOIN lot_category lc ON lc.id = mm.category 
JOIN lot_subcategory ls ON ls.id = mm.subcategory """
# Run the query through the project SQL helper; the result is used below as a DataFrame.
mm_df = sql.pandas_execute(query)
mm_df.head()

Fuzzy-match each invoice row's make and model against the equipment reference table so we can pull in our category and subcategory.

In [None]:
# Unique reference makes, in order of first appearance in mm_df.
distinct_makes = pd.unique(mm_df["make"])

def find_closest_match(row, mm_df, scorer=None, verbose=True):
    """Find the reference make/model that best matches one invoice row.

    The match is done in two stages: first the reference make with the highest
    similarity to ``row["Make"]`` is selected, then only that make's models are
    scored against ``row["Model"]``. Comparisons are case-insensitive, and on a
    tie the first candidate encountered wins.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``"Make"`` and ``"Model"``.
    mm_df : pandas.DataFrame
        Reference table with columns ``make``, ``model``, ``category`` and
        ``subcategory``.
    scorer : callable, optional
        ``scorer(a, b) -> number`` similarity function. Defaults to
        ``fuzz.ratio``.
    verbose : bool, default True
        Print the source row and its closest match.

    Returns
    -------
    pandas.Series
        Keys ``matched_make``, ``matched_model``, ``matched_category``,
        ``matched_subcategory``, ``make_score`` and ``model_score``. When no
        model can be matched (missing input, empty reference table, or no
        candidate scoring above 0) the model/category/subcategory fields are
        ``None`` and ``model_score`` is 0, instead of raising.
    """
    # Resolve the default lazily so a custom scorer can be used on its own.
    score = fuzz.ratio if scorer is None else scorer

    closest_make = None
    closest_match = None
    highest_make_score = 0
    highest_model_score = 0

    source_make, source_model = row["Make"], row["Model"]
    if not (pd.isna(source_make) or pd.isna(source_model)):
        # str() guards against values pandas parsed as numbers (e.g. a model "4020").
        source_make = str(source_make).lower()
        source_model = str(source_model).lower()

        # Stage 1: best make. Candidates come from the mm_df argument rather
        # than a notebook-level global, so the function is self-contained.
        for make in mm_df["make"].dropna().unique():
            make_score = score(source_make, str(make).lower())
            if make_score > highest_make_score:
                highest_make_score = make_score
                closest_make = make

        # Stage 2: best model among the rows of the chosen make.
        if closest_make is not None:
            for _, mm_row in mm_df[mm_df["make"] == closest_make].iterrows():
                if pd.isna(mm_row["model"]):
                    continue
                model_score = score(source_model, str(mm_row["model"]).lower())
                if model_score > highest_model_score:
                    highest_model_score = model_score
                    closest_match = mm_row

    found = closest_match is not None
    # dtype=object keeps None as None instead of coercing it to NaN.
    match = pd.Series({"matched_make": closest_match["make"] if found else closest_make,
                      "matched_model": closest_match["model"] if found else None,
                      "matched_category": closest_match["category"] if found else None,
                      "matched_subcategory": closest_match["subcategory"] if found else None,
                      "make_score": highest_make_score,
                      "model_score": highest_model_score}, dtype=object)
    if verbose:
        print(f"From Dataset: {row['Make']} {row['Model']}, Closest match: {match['matched_make']} {match['matched_model']}, Score: {highest_make_score} {highest_model_score}")
    return match

def parallel_match(df, mm_df, max_workers=8, match_fn=None):
    """Match every row of ``df`` against ``mm_df`` using a thread pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to match; each row is passed to the matcher as a Series.
    mm_df : pandas.DataFrame
        Reference make/model table, forwarded unchanged to the matcher.
    max_workers : int, default 8
        Size of the thread pool.
    match_fn : callable, optional
        ``match_fn(row, mm_df) -> pandas.Series`` with the keys
        ``matched_make``, ``matched_model``, ``matched_category``,
        ``matched_subcategory``, ``make_score`` and ``model_score``.
        Defaults to ``find_closest_match``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with a fresh 0..n-1 index plus the matched columns, both
        individual scores, and ``match_score`` (the lower of ``make_score``
        and ``model_score``) so one threshold can filter on overall quality.
    """
    matcher = find_closest_match if match_fn is None else match_fn

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(matcher, row, mm_df) for _, row in df.iterrows()]
        # Reading results in submission order keeps them aligned with df's rows;
        # .result() re-raises any exception raised inside a worker.
        results = [future.result() for future in futures]

    # The matcher emits make_score/model_score, not a single "match_score";
    # requesting a non-existent column here would yield an all-NaN column.
    result_columns = ["matched_make", "matched_model", "matched_category",
                      "matched_subcategory", "make_score", "model_score"]
    results_df = pd.DataFrame(results, columns=result_columns).reset_index(drop=True)

    # A match is only as good as its weaker component.
    scores = results_df[["make_score", "model_score"]].apply(pd.to_numeric, errors="coerce")
    results_df["match_score"] = scores.min(axis=1)

    # Positional concat: both sides carry a 0..n-1 index in the same row order.
    return pd.concat([df.reset_index(drop=True), results_df], axis=1)

# Apply the matching function to each row of df
matched_df = parallel_match(df, mm_df)

# Keep only confident matches (score of at least 80).
# The filter must run on matched_df: the raw df has no "match_score" column,
# so indexing it raised a KeyError and discarded the matching work.
threshold = 80
matched_df = matched_df[matched_df["match_score"] >= threshold]

## Feature Engineering

In [None]:
# Account names excluded from segmentation.
# NOTE(review): these look like internal/placeholder accounts — confirm with the data owner.
bad_account_names = ["CONVERSION CUSTOMER", "MV EQUIP CONVERSION CUST", "21ST CENTURY LEASING LLC", "BIG IRON", "RANDALL BROTHERS"]

# Keep used-equipment invoices from accounts that are not on the exclusion list.
is_used = df["New/Used"] == "Used"
is_excluded_account = df["account_name"].isin(bad_account_names)
segment_df = df.loc[is_used & ~is_excluded_account].copy()
segment_df.head()

The train/test dataset needs to be split temporally. If the prediction window is "buys within the next 12 months", the dataset must be cut off at the max date minus 12 months, and features such as recency, frequency, and monetary value are calculated only from purchases before that cutoff.

In [45]:
PREDICTION_WINDOW = 12 # months
MAX_DATE = segment_df["Customer Invoice Date"].max()

# Everything strictly before the cutoff feeds the features; the rest is the label window.
window_start = MAX_DATE - pd.DateOffset(months=PREDICTION_WINDOW)
invoice_dates = segment_df["Customer Invoice Date"]
segment_pre_df = segment_df.loc[invoice_dates < window_start].copy()

# Label-window purchases only count for accounts that also appear in the feature period.
known_accounts = segment_pre_df["account_name"]
in_label_window = invoice_dates >= window_start
segment_post_df = segment_df.loc[in_label_window & segment_df["account_name"].isin(known_accounts)].copy()

In [None]:
# Number of distinct accounts in the feature period.
segment_pre_df["account_name"].nunique()

In [None]:
# Number of distinct accounts that purchased again in the label window.
segment_post_df["account_name"].nunique()

Only 663 of the 7804 accounts in the training dataset bought again during the prediction window, so the classes are heavily imbalanced. This will have to be addressed.

### Frequency
Count the number of purchases per account and product group in the pre-cutoff (feature) period.

In [None]:
# Frequency: number of dated invoices per (account, product group) in the feature period.
# groupby sorts the group keys itself, so no explicit pre-sort is required.
segment_frequency_df = (
    segment_pre_df
    .groupby(["account_name", "Product Group"])["Customer Invoice Date"]
    .count()
    .reset_index(name="Frequency")
)
segment_frequency_df.head()

### Recency

Get most recent purchase by account + category

In [None]:
# Most recent invoice date per (account, product group) in the feature period.
last_purchase = (
    segment_pre_df
    .groupby(["account_name", "Product Group"], as_index=False)["Customer Invoice Date"]
    .max()
)

# Recency: whole 30-day "months" between the last purchase and MAX_DATE (truncated).
elapsed_days = (MAX_DATE - last_purchase["Customer Invoice Date"]).dt.days
segment_recency_df = last_purchase[["account_name", "Product Group"]].assign(
    Recency=(elapsed_days / 30).astype(int)
)
segment_recency_df.head()

### Monetary

Overall total by account + category from min to max date

In [None]:
# Monetary: total invoiced amount per (account, product group) in the feature period.
segment_monetary_df = (
    segment_pre_df
    .groupby(["account_name", "Product Group"])["Customer Invoice Amount"]
    .sum()
    .reset_index(name="Monetary")
)
segment_monetary_df.head()

In [None]:
# Combine the three feature tables on their shared (account, product group) key.
rfm_keys = ["account_name", "Product Group"]
segment_training_df = (
    segment_frequency_df
    .merge(segment_recency_df, on=rfm_keys, how="inner")
    .merge(segment_monetary_df, on=rfm_keys, how="inner")
)
segment_training_df.head()

Finally add in the flag for whether they purchased. If the account_name is in the post dataframe then they bought (1) otherwise they didn't.

In [None]:
# Label: 1 when the account appears anywhere in the label window, else 0.
# NOTE(review): the label is per account, not per (account, product group) —
# a purchase in any product group marks every row of that account as Bought.
post_accounts = segment_post_df["account_name"].unique()
bought_mask = segment_training_df["account_name"].isin(post_accounts)
segment_training_df = segment_training_df.assign(Bought=bought_mask.astype(int))
segment_training_df.head()

## Row Crop

Testing this on row crop tractors

In [None]:
# Restrict the training table to the row-crop tractor product group.
is_row_crop = segment_training_df["Product Group"] == "TRACTOR-ROW CROP"
segment_training_row_crop_df = segment_training_df.loc[is_row_crop].copy()
segment_training_row_crop_df.head()

In [None]:
# Summary statistics of the numeric columns for the row-crop segment.
segment_training_row_crop_df.describe()

In [None]:
# Class balance of the label within the row-crop segment.
segment_training_row_crop_df.Bought.value_counts()

In [None]:
# Summary statistics for rows labelled Bought == 1.
segment_training_row_crop_df.query("Bought == 1").describe()

In [None]:
# Summary statistics for rows labelled Bought == 0.
segment_training_row_crop_df.query("Bought == 0").describe()

In [None]:
# Balance the classes by down-sampling the negatives to the number of positives.
positives = segment_training_row_crop_df.loc[segment_training_row_crop_df["Bought"] == 1]
negatives = segment_training_row_crop_df.loc[segment_training_row_crop_df["Bought"] == 0]
num_positive_samples = positives.shape[0]

# sample(n=...) raises if n exceeds the population, so cap it at the negatives available.
num_negative_samples = min(num_positive_samples, negatives.shape[0])

# Fixed seeds (same value the train/test split uses) make Restart & Run All reproducible.
train_df = pd.concat([positives, negatives.sample(n=num_negative_samples, random_state=69)])
#shuffle the dataframe
train_df = train_df.sample(frac=1, random_state=69).reset_index(drop=True)
train_df.head()

In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import train_test_split

In [68]:
# Drop identifier columns. errors="ignore" keeps the cell re-runnable (the columns
# are already gone on a second run), and "predicted" is removed as well so a
# previously stored prediction can never leak in as a feature.
train_df = train_df.drop(columns=["account_name", "Product Group", "predicted"], errors="ignore")
X = train_df.drop(columns="Bought")
y = train_df["Bought"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Fit the scaler on the training split only and reuse it for the test split.
# Re-fitting on the test data standardises it with its own statistics, which
# leaks test information and puts the two splits on different scales.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fit a logistic regression on the scaled training split and predict the held-out split.
log_reg = LogisticRegression(random_state=69).fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

# Held-out metrics.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
for heading, payload in (("Confusion Matrix:", conf_matrix), ("Classification Report:", class_report)):
    print(heading)
    print(payload)

What if we did this as an ensemble: downsample the majority class several times and train one model per sample, giving N models?

In [None]:
# Train an ensemble of logistic regressions, each on a different balanced
# down-sample of the row-crop segment.
positives = segment_training_row_crop_df.loc[segment_training_row_crop_df["Bought"] == 1]
negatives = segment_training_row_crop_df.loc[segment_training_row_crop_df["Bought"] == 0]
num_positive_samples = positives.shape[0]
# sample(n=...) raises if n exceeds the population, so cap it at the negatives available.
num_negative_samples = min(num_positive_samples, negatives.shape[0])
num_models = 10
model_list = []
f1_scores = []
accuracies = []

for i in range(num_models):
    # Seeding with the model index gives every model a different, reproducible sample.
    train_df = pd.concat([positives, negatives.sample(n=num_negative_samples, random_state=i)])
    #shuffle the dataframe
    train_df = train_df.sample(frac=1, random_state=i).reset_index(drop=True)

    # "predicted" only exists after the prediction cell has run; dropping it here
    # keeps a re-run from training on the ensemble's own earlier output.
    train_df = train_df.drop(columns=["account_name", "Product Group", "predicted"], errors="ignore")
    X = train_df.drop(columns="Bought")
    y = train_df["Bought"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)
    # Fit the scaler on the training split only; the test split reuses those statistics.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    log_reg = LogisticRegression(random_state=69)
    log_reg.fit(X_train, y_train)
    y_pred = log_reg.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracies.append(accuracy)
    f1_scores.append(f1)
    model_list.append(log_reg)

print(f"Mean Accuracy: {np.mean(accuracies)}")
print(f"Mean F1 Score: {np.mean(f1_scores)}")

In [None]:
# Show the five row-crop rows with the largest Monetary value.
segment_training_row_crop_df.sort_values(by="Monetary", ascending=False).head()

In [76]:
# Build the feature matrix for the whole row-crop segment.
# "predicted" is dropped too (when present): once the predictions have been stored
# on the frame, re-running this cell would otherwise hand the models an extra
# feature column and model.predict would fail on the shape mismatch.
pred_df = segment_training_row_crop_df.drop(columns=["account_name", "Product Group", "predicted"], errors="ignore")
X = pred_df.drop(columns="Bought")
y = pred_df["Bought"]

# NOTE(review): this scaler is fit on the full (imbalanced) segment, whereas each
# model in model_list was trained on data scaled by its own balanced sample —
# the feature scales seen at prediction time differ from those seen in training.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Majority vote: a row is predicted positive when more than half of the models say so.
preds = np.array([model.predict(X) for model in model_list])
preds = (preds.mean(axis=0) > 0.5).astype(int)

In [77]:
# Attach the ensemble's vote to the row-crop frame for the error analysis below.
segment_training_row_crop_df = segment_training_row_crop_df.assign(predicted=preds)

In [None]:
# True positives: bought and predicted to buy.
segment_training_row_crop_df.query("Bought == 1 and predicted == 1").describe()

In [None]:
# False negatives: bought but predicted not to buy.
segment_training_row_crop_df.query("Bought == 1 and predicted == 0").describe()

In the false-negative case, frequency and monetary tend to be smaller while recency is higher.

In [None]:
# True negatives: did not buy and predicted not to buy.
segment_training_row_crop_df.query("Bought == 0 and predicted == 0").describe()

In [None]:
# False positives: did not buy but predicted to buy.
segment_training_row_crop_df.query("Bought == 0 and predicted == 1").describe()

## Combines

In [None]:
# Restrict the training table to the combines product group.
is_combine = segment_training_df["Product Group"] == "COMBINES"
segment_combines_df = segment_training_df.loc[is_combine].copy()
segment_combines_df.head()

In [None]:
# Class balance of the label within the combines segment.
segment_combines_df.Bought.value_counts()

In [None]:
# Train an ensemble of logistic regressions, each on a different balanced
# down-sample of the combines segment.
positives = segment_combines_df.loc[segment_combines_df["Bought"] == 1]
negatives = segment_combines_df.loc[segment_combines_df["Bought"] == 0]
num_positive_samples = positives.shape[0]
# sample(n=...) raises if n exceeds the population, so cap it at the negatives available.
num_negative_samples = min(num_positive_samples, negatives.shape[0])
num_models = 10
model_list = []
f1_scores = []
accuracies = []

for i in range(num_models):
    # Seeding with the model index gives every model a different, reproducible sample.
    train_df = pd.concat([positives, negatives.sample(n=num_negative_samples, random_state=i)])
    #shuffle the dataframe
    train_df = train_df.sample(frac=1, random_state=i).reset_index(drop=True)

    train_df = train_df.drop(["account_name", "Product Group"], axis=1)
    X = train_df.drop("Bought", axis=1)
    y = train_df["Bought"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)
    # Fit the scaler on the training split only; re-fitting on the test split would
    # standardise it with its own statistics and leak test information.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    log_reg = LogisticRegression(random_state=69)
    log_reg.fit(X_train, y_train)
    y_pred = log_reg.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracies.append(accuracy)
    f1_scores.append(f1)
    model_list.append(log_reg)

print(f"Mean Accuracy: {np.mean(accuracies)}")
print(f"Mean F1 Score: {np.mean(f1_scores)}")