In [None]:
import pandas as pd


# Load the raw sales export into a DataFrame.
file_path = 'HITRUST+Hair.csv'
sales_data = pd.read_csv(file_path)

# info() prints its report and returns None, so it is called as a statement of
# its own rather than being placed in the displayed tuple (where it would only
# contribute a stray None).
sales_data.info()

# First rows plus summary statistics for every column (numeric and non-numeric).
sales_data.head(), sales_data.describe(include='all')

In [None]:

# Convert the price columns to numbers; anything unparseable becomes NaN.
# The values are routed through astype(str) first because the .str accessor
# raises AttributeError on a column that is already numeric -- which is exactly
# the state the column is in after this cell has run once, so without the cast
# the cell could not be re-executed.
# NOTE(review): values with thousands separators (e.g. "1,234.00") are still
# coerced to NaN here -- confirm whether the export contains any.
for price_col in ['Sales Price', 'Amount']:
    sales_data[price_col] = pd.to_numeric(
        sales_data[price_col].astype(str).str.strip(),
        errors='coerce',
    )

# Drop a leading "VIP:" prefix from product names and trim surrounding whitespace.
sales_data['Product/Service'] = sales_data['Product/Service'].replace(r'^VIP:', '', regex=True).str.strip()

sales_data.head(), sales_data.dtypes

In [None]:

# Flag rows whose memo text contains "Shipping" (1 = yes, 0 = no).
# Rows with a missing memo are treated as "no" via na=False.
sales_data['Has_Shipping'] = (
    sales_data['Memo/Description']
    .str.contains('Shipping', na=False)
    .astype(int)
)

# Discard rows that have no Product/Service value and record how many went.
# With a single column in `subset`, the default how='any' and how='all' select
# the same rows.
rows_before = len(sales_data)
sales_data = sales_data.dropna(subset=['Product/Service'])
rows_after = len(sales_data)
rows_dropped = rows_before - rows_after

sales_data.head(), rows_dropped

In [None]:

# Parse the Date column so it can be aggregated and subtracted below.
sales_data['Date'] = pd.to_datetime(sales_data['Date'])

# Reference point for recency: the newest date present in the data.
latest_date = sales_data['Date'].max()

# customer metrics -- one row per customer, built with named aggregation
# (output_column=(source_column, aggregation)).
customer_metrics = sales_data.groupby('Customer').agg(
    Total_Spending=('Amount', 'sum'),
    Purchase_Frequency=('Date', 'count'),  # number of rows with a non-null Date
    Average_Purchase_Value=('Amount', 'mean'),
    Most_Recent_Purchase=('Date', 'max'),
)

# Whole days between each customer's latest purchase and the newest date overall.
customer_metrics['Recency'] = (
    latest_date - customer_metrics['Most_Recent_Purchase']
).dt.days

customer_metrics.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


# Standardise the clustering features so that no single metric dominates the
# Euclidean distances K-means relies on.
feature_columns = ['Total_Spending', 'Purchase_Frequency', 'Average_Purchase_Value', 'Recency']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_metrics[feature_columns])

# Elbow Method
# K-means cannot be fitted with more clusters than samples, so the sweep is
# capped at the number of customers (it stays 1..10 whenever there are at
# least 10 customers).
max_k = min(10, scaled_features.shape[0])
k_values = list(range(1, max_k + 1))

inertia = []
for k in k_values:
    # n_init is pinned explicitly: its default changed between scikit-learn
    # releases (10 -> 'auto'), which would otherwise make the curve depend on
    # the installed version and trigger a FutureWarning on older ones.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeans.fit(scaled_features)
    inertia.append(kmeans.inertia_)


# Plot inertia against k; the same k_values list drives both the loop and the
# x-axis so the two cannot drift apart.
plt.figure(figsize=(10, 6))
plt.plot(k_values, inertia, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.grid(True)
plt.show()

In [None]:
# K-means
# Final segmentation with 3 clusters.
# NOTE(review): 3 is presumably read off the elbow plot -- confirm.
# n_init is set explicitly because its default differs between scikit-learn
# releases (10 vs. 'auto'); pinning it keeps cluster assignments reproducible
# across versions and avoids the FutureWarning on older ones.
kmeans_final = KMeans(n_clusters=3, n_init=10, random_state=0)
customer_metrics['Cluster'] = kmeans_final.fit_predict(scaled_features)


# Per-cluster mean of every metric, plus the number of customers in each cluster
# (value_counts is aligned to the summary by cluster label).
cluster_summary = customer_metrics.groupby('Cluster').mean()
cluster_summary['Count'] = customer_metrics['Cluster'].value_counts()
cluster_summary

In [None]:
from lifetimes.utils import summary_data_from_transaction_data
from lifetimes import BetaGeoFitter, GammaGammaFitter


# Build the per-customer summary table used by the lifetimes models below
# (the later cells read its 'frequency', 'recency', 'T' and 'monetary_value'
# columns). The transaction frame in this notebook is `sales_data`; the
# previous reference to an undefined `data` raised NameError on a fresh kernel.
summary = summary_data_from_transaction_data(
    sales_data,
    'Customer',
    'Date',
    'Amount',
    observation_period_end=sales_data['Date'].max(),
)

In [None]:
# BG/NBD model
# Fitted on the frequency / recency / T columns of `summary`, with a
# penalizer coefficient of 0.0.
bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(
    frequency=summary['frequency'],
    recency=summary['recency'],
    T=summary['T'],
)

In [None]:
# Gamma-Gamma model

# Expected number of purchases per customer over the next `t` days. This needs
# only the fitted BG/NBD model, so it is computed unconditionally instead of
# being skipped whenever the Gamma-Gamma fit below is not possible.
t = 30  # for next 30 days
summary['predicted_purchases'] = bgf.predict(t, summary['frequency'], summary['recency'], summary['T'])

# The Gamma-Gamma model is fitted only on customers with repeat purchases and a
# positive average value.
positive_monetary_data = summary[(summary['monetary_value'] > 0) & (summary['frequency'] > 0)]
if not positive_monetary_data.empty:
    ggf = GammaGammaFitter(penalizer_coef=0.01)
    ggf.fit(positive_monetary_data['frequency'], positive_monetary_data['monetary_value'])

    # Predict monetary values for every customer in `summary`. Predicting only
    # on the fitted subset and assigning the result back aligned on the index
    # and silently left NaN (and therefore NaN CLV) for all other customers.
    # NOTE(review): customers outside the fitted subset now receive a
    # model-based estimate instead of NaN -- confirm this is the desired output.
    summary['predicted_monetary_value'] = ggf.conditional_expected_average_profit(
        summary['frequency'], summary['monetary_value']
    )
else:
    # No customer qualifies for the fit: keep the output schema stable and mark
    # the value as unknown rather than omitting the column.
    summary['predicted_monetary_value'] = float('nan')

# CLV here is expected purchases in the next `t` days times the expected value
# per purchase.
summary['CLV'] = summary['predicted_purchases'] * summary['predicted_monetary_value']

# Save processed data to CSV for GitHub upload
summary.to_csv('path_to_save_processed_data.csv')  # Change to your desired path

# Output the first few rows to verify
summary.head()