In [1]:
import pandas as pd
import seaborn.objects as so
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import silhouette_score

In [2]:
Sales_data=pd.read_csv("./Data/Sales Report.csv", sep=';',header=1)

In [3]:
Sales_data.columns = ['Company Code', 'Order Number', 'Employee', 'Product', 'Product Category', 'Client', 'Client City', 'Sales Date Time', 'Product Cost', 'Discount Amount', 'Amount', 'Total', 'Form of Payment']

In [4]:
null_values = Sales_data.isnull().sum()
print("Null values in each column:\n", null_values)
print("\n Total null values in the dataset:", null_values.sum())

Null values in each column:
 Company Code        0
Order Number        0
Employee            0
Product             0
Product Category    0
Client              0
Client City         0
Sales Date Time     0
Product Cost        0
Discount Amount     0
Amount              0
Total               0
Form of Payment     0
dtype: int64

 Total null values in the dataset: 0


In [5]:
filtered_data = Sales_data[Sales_data['Client City'] != 'No City']

In [6]:
filtered_data['Row Count'] = filtered_data.groupby('Client City')['Client City'].transform('count')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Row Count'] = filtered_data.groupby('Client City')['Client City'].transform('count')


In [7]:
##Main DF which we are using everwhere
filtered_customer_distributaion_main = filtered_data[filtered_data['Row Count'] > 10000]

In [None]:
# filtered_customer_distributaion_main.to_csv(filename.csv)

### Question number 2) a)

In [None]:
product_frequency = filtered_customer_distribution_main.groupby(['Client', 'Product Category']).size().reset_index(name='Frequency')

In [None]:
product_frequency_sorted = product_frequency.sort_values(by = 'Frequency', ascending=False)

In [None]:
top_n = 15
top_frequent_products = product_frequency_sorted.head(top_n)
plt.figure(figsize = (10,6))
plt.bar(top_frequent_products['Product Category'], top_frequent_products['Frequency'])
plt.xlabel('Product Category')
plt.ylabel('Frequency')
plt.title('Top {} Most frequently Purchased product categories by individual by Individual Customers'.format(top_n))
plt.xticks(rotation =45)
plt.show()

In [None]:
product_frequency = filtered_customer_distribution_main.groupby(['Client','Product Category']).size().reset_index(name='Frequency')

In [None]:
product_pivot = product_frequency.pivot_table(index='Client', columns='Product Category', fill_value=0)

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(product_pivot)
X_scaled=pd.DataFrame(X_scaled, columns=product_pivot.columns)

In [None]:
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_scaled)

In [None]:
inertias = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertias.append([k, kmeans.inertia_])
(
    pd.DataFrame(inertias, columns=['k','inertias'])
    .pipe(so.Plot, x='k', y='inertias')
    .add(so.Line(marker='o'))
    .label(title="Elbow Method for Optimal Clusters")
)

In [None]:
silhouette_scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    # score = silhouette_score(X_scaled, kmeans.labels_)
    silhouette_scores.append([k,silhouette_score(X_scaled, kmeans.labels_)])

(
    pd.DataFrame(silhouette_scores, columns=['k','silhouette_scores'])
    .pipe(so.Plot, x='k', y='silhouette_scores')
    .add(so.Line(marker='o'))
    .label(title="Elbow Method for Optimal Clusters")
)

In [None]:
# Choose the best number of clusters (e.g., 3) based on the above analysis
best_k = 4
kmeans_final = KMeans(n_clusters=best_k, random_state=42)
kmeans_final.fit(X_scaled)

In [None]:
product_pivot['Cluster'] = kmeans_final.labels_

In [None]:
cluster_summary = product_pivot.groupby('Cluster').mean()
cluster_summary

In [None]:
cluster_summary_transposed = cluster_summary.transpose()

In [None]:
cluster_summary_transposed = cluster_summary_transposed.apply(pd.to_numeric, errors = 'coerce')

In [None]:
if isinstance(cluster_summary_transposed.index,pd.MultiIndex):
    cluster_summary_transposed.index = [''.join(map(str,idx)) for idx in cluster_summary_transposed.index]

In [None]:
plt.figure(figsize=(15,10))
for cluster in cluster_summary_transposed.columns:
    plt.plot(cluster_summary_transposed.index,
            cluster_summary_transposed[cluster],
            label=f'Cluster{cluster}')
plt.xticks(rotation=45)
plt.ylabel('Average Purchase Frequency')
plt.title('Average Purchase Frequency for Each Product Category by Cluster')
plt.legend()
plt.show()

### Question no 2 b)

In [None]:
from matplotlib.ticker import FuncFormatter

In [None]:
def millions_formattor(x, pos):
    return f'{int(x/1e6)}M'

In [None]:
X = filtered_customer_distribution_main
X['Sale Date Time'] = pd.to_datetime(X['Sale Date Time'])

In [None]:
monthly_spending = X.resample('M', on='Sale Date Time').sum()['Total']

In [None]:
plt.figure(figsize=(12,6))
sns.lineplot(x=monthly_spending.index, y=monthly_spending.values)
plt.gca().yaxis.set_major_formatter(FuncFormatter(millions_formattor))
plt.xlabel('Month')
plt.ylabel('Total Spending')
plt.title('Monthly total spending Across All customers')
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.boxplot(x=monthly_spending.index.month, y=monthly_spending.values)
plt.gca().yaxis.set_major_formatter(FuncFormatter(millions_formattor))
plt.xlabel('Month')
plt.ylabel('Total Spending')
plt.title('Distribution of Monthly Spending Across All Customers')
plt.xticks(rotation=45)
plt.show()

In [None]:
X = filtered_customer_distribution_main.copy()

In [None]:
X['Sale Date Time'] = pd.to_datetime(X['Sale Date Time'])

In [None]:
customer_spending = X.groupby('Client').resample('M', on='Sale Date Time').sum(numeric_only=True)['Total']

In [None]:
customer_spending_reset = customer_spending.reset_index()
customer_spending_pivot = customer_spending_reset.pivot(index='Client', columns='Sale Date Time', values='Total').fillna(0)

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(customer_spending_pivot)
X_scaled = pd.DataFrame(X_scaled, columns=customer_spending_pivot.columns)

In [None]:
kmeans = KMeans(n_clusters=5, init='random', n_init=1, random_state=468)
kmeans.fit(X_scaled)

In [None]:
inertias = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertias.append([k, kmeans.inertia_])
(
    pd.DataFrame(inertias, columns=['k','inertias'])
    .pipe(so.Plot, x='k', y='inertias')
    .add(so.Line(marker='o'))
    .label(title="Elbow Method for Optimal Clusters")
)

In [None]:
silhouette_scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    # score = silhouette_score(X_scaled, kmeans.labels_)
    silhouette_scores.append([k,silhouette_score(X_scaled, kmeans.labels_)])

(
    pd.DataFrame(silhouette_scores, columns=['k','silhouette_scores'])
    .pipe(so.Plot, x='k', y='silhouette_scores')
    .add(so.Line(marker='o'))
    .label(title="Elbow Method for Optimal Clusters")
)

In [None]:
best_k = 4
kmeans_final = KMeans(n_clusters=best_k, random_state=42)
kmeans_final.fit(X_scaled)

customer_spending_pivot['Cluster'] = kmeans_final.labels_

cluster_summary = customer_spending_pivot.groupby('Cluster').mean()

cluster_summary_transposed = cluster_summary.transpose()

plt.figure(figsize=(15, 10))
for cluster in cluster_summary_transposed.columns:
    plt.plot(cluster_summary_transposed.index, 
             cluster_summary_transposed[cluster], 
             label=f'Cluster {cluster}')
# plt.gca().yaxis.set_major_formatter(FuncFormatter(millions_formattor))
plt.xticks(rotation=45)
plt.ylabel('Average Spending')
plt.title('Average Spending Over Time for Each Cluster')
plt.legend()
plt.show()

### Question 2) c)

In [None]:
df = filtered_customer_distribution_main

In [None]:
payment_preference = df.groupby(['Client City', 'Form of Payment']).size().reset_index(name='Count')

In [None]:
payment_pivot = payment_preference.pivot_table(index='Client City',columns='Form of Payment',values='Count',fill_value=0)

In [None]:
cities = payment_pivot.index

In [None]:
forms_of_payment = payment_pivot.columns

In [None]:
date_for_plot = [payment_pivot[form_of_payment].values for form_of_payment in forms_of_payment]

In [None]:
plt.figure(figsize=(12,6))
plt.stackplot(cities, data_for_plot, labels=forms_of_payment)
plt.xlabel('Client City')
plt.ylabel('Count')
plt.title('Prefereed forms of payment')
plt.legend(loc='upper left')
plt.xticks(rotation=45)
plt.show()

In [None]:
df = filtered_customer_distribution_main  # Using your DataFrame

In [None]:
payment_frequency = df.groupby(['Client', 'Form of Payment']).size().reset_index(name='Frequency')

In [None]:

payment_pivot = payment_frequency.pivot_table(index='Client', columns='Form of Payment', fill_value=0)


In [None]:

scaler = StandardScaler()
X_scaled = scaler.fit_transform(payment_pivot)
X_scaled = pd.DataFrame(X_scaled, columns=payment_pivot.columns)

In [None]:

kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_scaled)

In [None]:
inertias = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertias.append([k, kmeans.inertia_])
(
    pd.DataFrame(inertias, columns=['k','inertias'])
    .pipe(so.Plot, x='k', y='inertias')
    .add(so.Line(marker='o'))
    .label(title="Elbow Method for Optimal Clusters")
)

In [None]:
silhouette_scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    # score = silhouette_score(X_scaled, kmeans.labels_)
    silhouette_scores.append([k,silhouette_score(X_scaled, kmeans.labels_)])

(
    pd.DataFrame(silhouette_scores, columns=['k','silhouette_scores'])
    .pipe(so.Plot, x='k', y='silhouette_scores')
    .add(so.Line(marker='o'))
    .label(title="Elbow Method for Optimal Clusters")
)

In [None]:
best_k = 3  
kmeans_final = KMeans(n_clusters=best_k, random_state=42)
kmeans_final.fit(X_scaled)

In [None]:
payment_pivot['Cluster'] = kmeans_final.labels_
cluster_summary = payment_pivot.groupby('Cluster').mean()

In [None]:
cluster_summary_transposed = cluster_summary.transpose()

In [None]:
if isinstance(cluster_summary_transposed.index, pd.MultiIndex):
    cluster_summary_transposed.index = [' '.join(map(str, ind)) for ind in cluster_summary_transposed.index]
cluster_summary_transposed.columns = cluster_summary_transposed.columns.astype(str)

In [None]:
plt.figure(figsize=(15, 10))
for cluster in cluster_summary_transposed.columns:
    plt.plot(cluster_summary_transposed.index, 
             cluster_summary_transposed[cluster], 
             label=f'Cluster {cluster}')

plt.xticks(rotation=45)
plt.ylabel('Average Frequency of Payment Type')
plt.title('Average Payment Type Frequency for Each Cluster')
plt.legend()
plt.show()

### Question 2 d)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming df is your DataFrame
df = filtered_customer_distribution_main

# Convert 'Sale Date Time' to datetime
df['Sale Date Time'] = pd.to_datetime(df['Sale Date Time'])

# Extract day of the week and hour of the day
df['Day of Week'] = df['Sale Date Time'].dt.day_name()
df['Hour of Day'] = df['Sale Date Time'].dt.hour

In [None]:
# Aggregate sales by day of the week
sales_by_day = df.groupby('Day of Week')['Total'].sum()

# Order the days for the plot
ordered_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sales_by_day = sales_by_day.reindex(ordered_days)

In [None]:
# Plotting sales by day of the week
plt.figure(figsize=(10, 6))
sns.barplot(x=sales_by_day.index, y=sales_by_day.values)
plt.xlabel('Day of the Week')
plt.ylabel('Total Sales')
plt.title('Total Sales by Day of the Week')
plt.show()

# Aggregate sales by hour of the day
sales_by_hour = df.groupby('Hour of Day')['Total'].sum()

# Plotting sales by hour of the day
plt.figure(figsize=(12, 6))
sns.barplot(x=sales_by_hour.index, y=sales_by_hour.values)
plt.xlabel('Hour of the Day')
plt.ylabel('Total Sales')
plt.title('Total Sales by Hour of the Day')
plt.xticks(range(0, 24))
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Load your data into a DataFrame
# Assuming df is your DataFrame
df = filtered_customer_distribution_main.copy()

# Convert 'Sale Date Time' to datetime and extract time features
df['Sale Date Time'] = pd.to_datetime(df['Sale Date Time'])
df['Day of Week'] = df['Sale Date Time'].dt.dayofweek  # 0: Monday, 6: Sunday
df['Hour of Day'] = df['Sale Date Time'].dt.hour

# Group by 'Client', 'Day of Week', and 'Hour of Day' and count the occurrences
time_frequency = df.groupby(['Client', 'Day of Week', 'Hour of Day']).size().reset_index(name='Frequency')

# Pivot the table to get 'Day of Week' and 'Hour of Day' as features
time_pivot = time_frequency.pivot_table(index='Client', columns=['Day of Week', 'Hour of Day'], fill_value=0)

# Flatten MultiIndex columns for KMeans
time_pivot.columns = ['DOW_' + str(col[0]) + '_HOD_' + str(col[1]) for col in time_pivot.columns.values]

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(time_pivot)
X_scaled = pd.DataFrame(X_scaled, columns=time_pivot.columns)

# K-means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_scaled)

# Elbow Method
inertias = []
for k in range(1, 11):
    kmeans_temp = KMeans(n_clusters=k, random_state=42)
    kmeans_temp.fit(X_scaled)
    inertias.append(kmeans_temp.inertia_)

# Plotting the Elbow Method
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), inertias, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal Clusters')
plt.show()

# Silhouette Analysis
silhouette_scores = []
for k in range(2, 11):
    kmeans_temp = KMeans(n_clusters=k, random_state=42)
    kmeans_temp.fit(X_scaled)
    score = silhouette_score(X_scaled, kmeans_temp.labels_)
    silhouette_scores.append(score)

# Plotting the Silhouette Analysis
plt.figure(figsize=(10, 6))
plt.plot(range(2, 11), silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis for Optimal Clusters')
plt.show()

# Choose the best number of clusters based on the analysis
best_k = 7  # Replace with the actual best_k found from analysis
kmeans_final = KMeans(n_clusters=best_k, random_state=42)
kmeans_final.fit(X_scaled)

# Add the cluster labels to the DataFrame
time_pivot['Cluster'] = kmeans_final.labels_

# Aggregate statistics per cluster
cluster_summary = time_pivot.groupby('Cluster').mean()

# Visualization: Plotting the mean frequency for each time slot in each cluster
cluster_summary_transposed = cluster_summary.transpose()

plt.figure(figsize=(15, 10))
sns.heatmap(cluster_summary_transposed, cmap="YlGnBu")
plt.xlabel('Cluster')
plt.ylabel('Time Slot')
plt.title('Average Purchase Frequency for Each Time Slot by Cluster')
plt.show()

### Question 4) a)

In [None]:
df = filtered_customer_distribution_main
average_discount_by_city = df.groupby('Client City')['Discount Amount'].mean().reset_index()

In [None]:
average_discount_by_city = average_discount_by_city.sort_values(by='Discount Amount', ascending=False)

In [None]:
plt.figure(figsize=(12, 6))
sns.barplot(x='Client City', y='Discount Amount', data=average_discount_by_city)
plt.xlabel('Client City')
plt.ylabel('Average Discount Amount')
plt.title('Average Discount Amount by Client City')
plt.xticks(rotation=45)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Load your data into a DataFrame
# df = pd.read_csv('path_to_your_dataset.csv')  # Replace with your actual data loading code

df = filtered_customer_distribution_main.copy()

# Adding a feature to represent discount utilization
df['Discount Utilized'] = df['Discount Amount'] > 0

# Group by 'Client' and potentially other features like 'Product Category'
discount_frequency = df.groupby(['Client', 'Product Category'])['Discount Utilized'].mean().reset_index()

# Pivot the table
discount_pivot = discount_frequency.pivot_table(index='Client', columns='Product Category', values='Discount Utilized', fill_value=0)

# Flatten the MultiIndex columns if required
discount_pivot.columns = ['Category_' + str(col) for col in discount_pivot.columns.values]

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(discount_pivot)
X_scaled = pd.DataFrame(X_scaled, columns=discount_pivot.columns)

# K-means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_scaled)

# Elbow Method
inertias = []
for k in range(1, 11):
    kmeans_temp = KMeans(n_clusters=k, random_state=42)
    kmeans_temp.fit(X_scaled)
    inertias.append(kmeans_temp.inertia_)

# Plotting the Elbow Method
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), inertias, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal Clusters')
plt.show()

# Silhouette Analysis
silhouette_scores = []
for k in range(2, 11):
    kmeans_temp = KMeans(n_clusters=k, random_state=42)
    kmeans_temp.fit(X_scaled)
    score = silhouette_score(X_scaled, kmeans_temp.labels_)
    silhouette_scores.append(score)

# Plotting the Silhouette Analysis
plt.figure(figsize=(10, 6))
plt.plot(range(2, 11), silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis for Optimal Clusters')
plt.show()

# Choose the best number of clusters based on the analysis
best_k = 3  # Replace with the actual best_k found from analysis
kmeans_final = KMeans(n_clusters=best_k, random_state=42)
kmeans_final.fit(X_scaled)

# Add the cluster labels to the DataFrame
discount_pivot['Cluster'] = kmeans_final.labels_

# Aggregate statistics per cluster
cluster_summary = discount_pivot.groupby('Cluster').mean()

# Visualization: Plotting the mean discount utilization for each category in each cluster
cluster_summary_transposed = cluster_summary.transpose()

plt.figure(figsize=(15, 10))
sns.heatmap(cluster_summary_transposed, cmap="YlGnBu")
plt.xlabel('Cluster')
plt.ylabel('Product Category')
plt.title('Average Discount Utilization for Each Product Category by Cluster')
plt.show()

### Q 4 b)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming df is your DataFrame
# df = pd.read_csv('path_to_your_dataset.csv')  # Replace with your actual data loading code

df = filtered_customer_distribution_main  # Using your DataFrame

# Aggregate data
segment_data = df.groupby('Client City').agg(Total_Discount=('Discount Amount', 'sum'), 
                                             Total_Spending=('Total', 'sum')).reset_index()

# Bin the data
segment_data['Discount_Bin'] = pd.qcut(segment_data['Total_Discount'], q=5, duplicates='drop', precision=0)
segment_data['Spending_Bin'] = pd.qcut(segment_data['Total_Spending'], q=5, duplicates='drop', precision=0)

# Create a pivot table for the heatmap
heatmap_data = segment_data.groupby(['Discount_Bin', 'Spending_Bin']).size().unstack(fill_value=0)

# Plotting the heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_data, annot=True, cmap='viridis', fmt='g')
plt.title('Heatmap of Total Discount vs Total Spending per Client City')
plt.xlabel('Total Spending Bins')
plt.ylabel('Total Discount Bins')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Load your data into a DataFrame 'df'
# df = pd.read_csv('path_to_your_dataset.csv')  # Replace with your actual data loading code

df = filtered_customer_distribution_main.copy()  # Using your DataFrame for demonstration

# Create a feature for average discount utilization and total spending per client
client_data = df.groupby('Client').agg(Average_Discount_Utilized=('Discount Amount', lambda x: (x > 0).mean()),
                                       Total_Spending=('Total', 'sum')).reset_index()

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(client_data[['Average_Discount_Utilized', 'Total_Spending']])
X_scaled = pd.DataFrame(X_scaled, columns=['Average_Discount_Utilized', 'Total_Spending'])

# K-means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_scaled)

# Elbow Method
inertias = []
for k in range(1, 11):
    kmeans_temp = KMeans(n_clusters=k, random_state=42)
    kmeans_temp.fit(X_scaled)
    inertias.append(kmeans_temp.inertia_)

# Plotting the Elbow Method
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), inertias, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal Clusters')
plt.show()

# Silhouette Analysis
silhouette_scores = []
for k in range(2, 11):
    kmeans_temp = KMeans(n_clusters=k, random_state=42)
    kmeans_temp.fit(X_scaled)
    score = silhouette_score(X_scaled, kmeans_temp.labels_)
    silhouette_scores.append(score)

# Plotting the Silhouette Analysis
plt.figure(figsize=(10, 6))
plt.plot(range(2, 11), silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis for Optimal Clusters')
plt.show()

# Choose the best number of clusters based on the analysis
best_k = silhouette_scores.index(max(silhouette_scores)) + 2  # Best k is where silhouette score is max
kmeans_final = KMeans(n_clusters=best_k, random_state=42)
kmeans_final.fit(X_scaled)

# Add the cluster labels to the DataFrame
client_data['Cluster'] = kmeans_final.labels_

# Aggregate statistics per cluster
cluster_summary = client_data.groupby('Cluster').mean()

# Visualization: Heatmap of average discount utilization and total spending by cluster
plt.figure(figsize=(10, 8))
sns.heatmap(cluster_summary, annot=True, cmap="YlGnBu", fmt=".2f")
plt.title('Clusters Average Discount Utilization and Total Spending')
plt.ylabel('Cluster')
plt.show()