In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [5]:
input_path = '/Users/charisoneyemi/Downloads/THESISFOLDER/FinalDataset/clustering_work.csv'
df = pd.read_csv(input_path, parse_dates=['First_Purchase', 'Last_Purchase'])

In [9]:
# Just defining features (the same as before)
numeric_features = [
    'Unique_Products', 'Unique_Categories',
    'Recency_Days', 'Frequency', 'Avg_Order_Value', 'Avg_Quantity_Per_Order'
]

categorical_features = [
    'Age Group', 'Hispanic Origin', 'Race', 'Education', 'Household Income',
    'Gender Identity', 'Sexual Orientation', 'State Reported',
    'Account Shared With', 'Household Size', 'Amazon Order Frequency',
    'Smoke Cigarettes', 'Smoke Marijuana', 'Drink Alcohol', 'Diabetes', 'Wheelchair',
    'Opinion: Amazon Sell Data', 'Opinion: Companies Sell Data',
    'Opinion: SMB Data Access', 'Opinion: Census Use Data', 'Opinion: Research Use Data',
    'Behavior_Segment'
]

behavioral_features = [
    'Avg_Quantity_Per_Order',
    'Avg_Order_Value',
    'Frequency',
    'Recency_Days',
    'Unique_Categories',
    'Unique_Products',
    'Total_Spent'
]

In [11]:
X = df[numeric_features + categorical_features]
y = df['Total_Spent']

# Revenue concentration by segment

In [14]:
total_revenue = df['Total_Spent'].sum()
for segment in [0, 1]:
    subset = df[df['Behavior_Segment'] == segment]
    seg_revenue = subset['Total_Spent'].sum()
    percentage = (seg_revenue / total_revenue) * 100
    print(f"\nSegment {segment}:")
    print(f"  Total Revenue: ${seg_revenue:,.2f}")
    print(f"  Revenue Share: {percentage:.1f}% of total")
    print(f"  Number of Customers: {len(subset):,}")
    print(f"  Average CLV: ${subset['Total_Spent'].mean():,.2f}")


Segment 0:
  Total Revenue: $18,643,616.81
  Revenue Share: 42.3% of total
  Number of Customers: 3,897
  Average CLV: $4,784.09

Segment 1:
  Total Revenue: $25,410,016.31
  Revenue Share: 57.7% of total
  Number of Customers: 1,130
  Average CLV: $22,486.74


# Demographic breakdown by segment

In [19]:
demo_features = ['Age Group', 'Education', 'Household Income', 'Gender Identity']
for segment in [0, 1]:
    print(f"\nSEGMENT {segment}")
    for feature in demo_features:
        if feature in df.columns:
            mode_value = df[df['Behavior_Segment'] == segment][feature].mode()
            if len(mode_value) > 0:
                count = df[df['Behavior_Segment'] == segment][feature].value_counts()
                print(f"{feature}: {mode_value[0]} (n={count.iloc[0]})")


SEGMENT 0
Age Group: 25 - 34 years (n=1473)
Education: Bachelor's degree (n=1704)
Household Income: $25,000 - $49,999 (n=1016)
Gender Identity: Female (n=1920)

SEGMENT 1
Age Group: 35 - 44 years (n=368)
Education: Bachelor's degree (n=515)
Household Income: $100,000 - $149,999 (n=284)
Gender Identity: Female (n=669)


# Segment-specific prediction accuracy

In [22]:
for segment in [0, 1]:
    segment_mask = df['Behavior_Segment'] == segment
    
    y_true_seg = df[segment_mask]['Total_Spent']
    y_pred_seg = df[segment_mask]['CLV_Predicted']
    
    rmse_seg = mean_squared_error(y_true_seg, y_pred_seg, squared=False)
    mae_seg = mean_absolute_error(y_true_seg, y_pred_seg)
    r2_seg = r2_score(y_true_seg, y_pred_seg)
    
    print(f"\nSegment {segment} Prediction Performance:")
    print(f"  RMSE: ${rmse_seg:.2f}")
    print(f"  MAE: ${mae_seg:.2f}")
    print(f"  R²: {r2_seg:.4f}")


Segment 0 Prediction Performance:
  RMSE: $250.59
  MAE: $81.91
  R²: 0.9955

Segment 1 Prediction Performance:
  RMSE: $2620.66
  MAE: $844.62
  R²: 0.9490




# Summary statistics by segment

In [25]:
segment_stats = df.groupby('Behavior_Segment')[behavioral_features].agg(['mean', 'std', 'min', 'max'])
print("\n", segment_stats)


                  Avg_Quantity_Per_Order                                 \
                                   mean       std       min        max   
Behavior_Segment                                                         
0                              2.223512  1.026641  1.000000  16.791667   
1                              2.828038  1.426449  1.377778  17.900000   

                 Avg_Order_Value                                    \
                            mean        std        min         max   
Behavior_Segment                                                     
0                      52.873373  26.983699   1.840000  368.875600   
1                      60.458654  25.873551  20.134879  321.314412   

                   Frequency              ... Unique_Categories       \
                        mean         std  ...               min  max   
Behavior_Segment                          ...                          
0                  97.257634   74.882984  ...               

# Overall dataset summary

In [28]:
print(f"Total Customers: {len(df):,}")
print(f"Total Revenue: ${total_revenue:,.2f}")
print(f"Average CLV: ${df['Total_Spent'].mean():,.2f}")
print(f"Median CLV: ${df['Total_Spent'].median():,.2f}")
print(f"CLV Standard Deviation: ${df['Total_Spent'].std():,.2f}")
print(f"\nDate Range: {df['First_Purchase'].min().date()} to {df['Last_Purchase'].max().date()}")

Total Customers: 5,027
Total Revenue: $44,053,633.12
Average CLV: $8,763.40
Median CLV: $5,706.99
CLV Standard Deviation: $9,782.76

Date Range: 2018-01-01 to 2024-08-15
