In [1]:
import pandas as pd
import numpy as np
from google.colab import files


# Load the pre-processed sales table. The path is relative, so the CSV must be
# present in the runtime's current working directory before this cell runs.
sales_csv = "processed_sales_data.csv"
df = pd.read_csv(sales_csv)

# A bare trailing expression makes the notebook render the frame as cell output.
df

Unnamed: 0,sale_id,product_id,customer_id,sale_price,quantity,sale_date,total_revenue,sentiment_score,review_date
0,1001,P101,C_001,150.0,2,2023-01-15,300.0,4.5,2023-01-16
1,1002,P102,C_002,75.0,3,2023-01-20,225.0,4.5,2023-02-05
2,1004,P101,C_003,150.0,4,2023-01-02,600.0,3.7,2023-02-18
3,1005,P104,C_004,30.0,1,2023-05-02,30.0,3.9,2023-02-01
4,1006,P102,C_005,75.0,2,2023-10-02,150.0,4.0,2023-02-11
...,...,...,...,...,...,...,...,...,...
591,1992,P104,C_062,120.0,1,2023-03-02,120.0,4.4,2023-03-10
592,1993,P104,C_075,150.0,5,2023-10-01,750.0,4.0,2023-03-05
593,1994,P103,C_054,150.0,5,2023-02-02,750.0,2.8,2023-02-17
594,1998,P104,C_092,75.0,1,2023-03-31,75.0,1.7,2023-03-31


In [2]:
# Build one row per customer with three spending metrics, using named
# aggregation so each output column is declared together with its source
# column and reducer (no separate rename step required).
#
# NOTE(review): purchase_frequency is the SUM of `quantity` (units bought),
# not the number of distinct sales -- confirm this is the intended meaning.
per_customer = df.groupby('customer_id')
customer_stats = per_customer.agg(
    total_purchase_amount=('total_revenue', 'sum'),
    purchase_frequency=('quantity', 'sum'),
    average_transaction_value=('sale_price', 'mean'),
).reset_index()

# Preview the first few aggregated rows.
display(customer_stats.head())

Unnamed: 0,customer_id,total_purchase_amount,purchase_frequency,average_transaction_value
0,C_001,660.0,5,135.0
1,C_002,225.0,3,75.0
2,C_003,600.0,4,150.0
3,C_004,30.0,1,30.0
4,C_005,150.0,2,75.0


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Segment customers into two groups with k-means and label the higher-spending
# group as "VIP". Adds two columns to customer_stats: `cluster` and `VIP_status`.

# Put the three per-customer metrics on a common scale (zero mean, unit
# variance) before clustering, so no metric dominates because of its units.
feature_columns = ['total_purchase_amount', 'purchase_frequency', 'average_transaction_value']
scaler = StandardScaler()
features_scaled = scaler.fit_transform(customer_stats[feature_columns])

# n_init is pinned explicitly: its default differs between scikit-learn
# releases (10 in older versions, 'auto' in newer ones), so leaving it implicit
# makes the segmentation depend on the installed version and triggers a
# FutureWarning on some releases. random_state keeps the run repeatable.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
customer_stats['cluster'] = kmeans.fit_predict(features_scaled)

# Cluster ids (0/1) are arbitrary, so pick the VIP cluster by behaviour: the
# one whose members have the higher mean total purchase amount.
vip_cluster = customer_stats.groupby('cluster')['total_purchase_amount'].mean().idxmax()

# Vectorized labelling instead of a per-row Python lambda.
customer_stats['VIP_status'] = np.where(customer_stats['cluster'] == vip_cluster, "VIP", "Non-VIP")

In [5]:
# Attach each customer's VIP label to every sale row. A left join keeps all
# rows of `df`; only the key and the label are taken from customer_stats so no
# other aggregate columns leak into the export.
vip_labels = customer_stats[['customer_id', 'VIP_status']]
df_enriched = df.merge(vip_labels, how='left', on='customer_id')

# Write the enriched table without the pandas index, then hand the file to the
# browser through the Colab download helper.
output_file = "enriched_processed_sales_data.csv"
df_enriched.to_csv(output_file, index=False)
files.download(output_file)

print("Enriched dataset saved and downloaded.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Enriched dataset saved and downloaded.
