# Customer analysis

We have a list of customers with their personal information and their behaviour.

We want to launch a new, expensive luxury product.

Which customers should we target with personalized marketing?

# Task
- Perform basic data analysis and visualization
- Perform data editing (encoding)
- Select appropriate variables by which to segment customers
- Create a k-means model with an appropriate number of segments
- Visualize the model
- Which segment is the target segment?

# Data loading

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns  

In [None]:
# Load the mall-customer dataset into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
csv_path = '../dataset/Mall_Customers.csv'
data = pd.read_csv(csv_path)

# Data description
- Information about the data
- Data preview
- Basic statistics
- View data distribution

In [None]:
# 2x2 overview: three histograms for the numeric columns plus a bar chart
# of the gender counts.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (target axes, column, title, x-label, extra hist() keyword arguments)
histogram_specs = [
    (axes[0, 0], 'Age', 'Age Distribution', 'Age', {}),
    (axes[0, 1], 'Annual Income (k$)', 'Annual Income Distribution',
     'Income (k$)', {'color': 'green'}),
    (axes[1, 0], 'Spending Score (1-100)', 'Spending Score Distribution',
     'Spending Score', {'color': 'orange'}),
]
for ax, column, title, x_label, extra_kwargs in histogram_specs:
    ax.hist(data[column], bins=20, edgecolor='black', **extra_kwargs)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

# Gender is categorical, so it gets a bar chart of value counts instead.
gender_ax = axes[1, 1]
gender_counts = data['Gender'].value_counts()
gender_counts.plot(kind='bar', ax=gender_ax, color=['pink', 'lightblue'])
gender_ax.set_title('Gender Distribution')
gender_ax.set_xlabel('Gender')
gender_ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Number of missing (NaN) values in each column.
data.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Column names, non-null counts, dtypes and memory usage of the DataFrame.
data.info()

In [None]:
# Preview of the first rows (head() returns five rows by default).
data.head()

# Data preparation
- Treatment of NaN values
- Encoding text columns

In [None]:
from sklearn.preprocessing import LabelEncoder

# Step 1 - treat NaN values.
# Incomplete rows are removed *before* encoding so that missing values never
# reach the encoder and no encoded value is computed for rows that are
# discarded anyway.
print("NaN values before:", data.isna().sum().sum())
data = data.dropna()
print("NaN values after:", data.isna().sum().sum())

# Step 2 - encode the text column.
# LabelEncoder maps each distinct 'Gender' value to an integer; the result is
# stored in a new column so the original text column stays available.
label_encoder = LabelEncoder()
data['Gender_encoded'] = label_encoder.fit_transform(data['Gender'])

# classes_ is ordered so that position i is the value encoded as integer i.
print("Gender mapping:")
for i, gender in enumerate(label_encoder.classes_):
    print(f"{gender}: {i}")

# Variable selection
- Displaying relationships between variables (pairplot)
- We are looking for a combination of variables that can be visually divided into groups
- Creating an X with only these columns

In [None]:
# Segment customers on income and spending score only; X is the plain
# NumPy feature matrix (one row per customer) handed to k-means.
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
X = data[feature_columns].to_numpy()
print(f"Selected features shape: {X.shape}")

In [None]:
# Pairwise scatter plots of the numeric columns, used to spot the pair of
# variables that separates visually into groups.
numeric_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
sns.pairplot(data[numeric_columns])
plt.show()

# Finding the k-means model parameter
- Plotting elbow graph
- Finding the ideal number of clusters

In [None]:
from sklearn.cluster import KMeans

# Candidate cluster counts to evaluate, and the count read off the elbow.
k_values = range(1, 11)
optimal_k = 5

# Fit one model per candidate k and record its inertia (the within-cluster
# sum of squared distances). This has to run before the plot below, which
# reads inertia_list.
inertia_list = []
for num_clusters in k_values:
    kmeans_model = KMeans(n_clusters=num_clusters, init="k-means++", n_init=10, random_state=42)
    kmeans_model.fit(X)
    inertia_list.append(kmeans_model.inertia_)

# Elbow plot: inertia against k, with the chosen k highlighted.
plt.figure(figsize=(10, 6))
plt.plot(k_values, inertia_list, marker='o', linewidth=2)
plt.scatter(k_values, inertia_list)
# Look the position up instead of hard-coding it, so the marker stays on the
# right point if k_values or optimal_k is changed.
plt.scatter(optimal_k, inertia_list[k_values.index(optimal_k)],
            marker="X", s=300, c="r", label=f"Optimal k={optimal_k}")
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia Value")
plt.title("Elbow Method - Finding Optimal Number of Clusters")
plt.grid(True)
plt.legend()
plt.show()

# k-means model for the chosen number of clusters
- Training the model
- Data prediction

In [None]:
# Train the final model with the number of clusters chosen from the elbow
# plot. The model must be created and fitted before predict() can be called.
kmeans_final = KMeans(n_clusters=5, init="k-means++", n_init=10, random_state=42)
kmeans_final.fit(X)

# Assign every customer to its cluster and keep the label next to the
# original data for the later per-segment analysis.
Y_pred = kmeans_final.predict(X)
data['Cluster'] = Y_pred

# One row per cluster; columns follow the column order of X.
print("Cluster centers:")
print(kmeans_final.cluster_centers_)
print("\nCluster distribution:")
print(data['Cluster'].value_counts().sort_index())

# Visualization of the model
- Plotting the customers as a scatter plot, colour-coded by cluster

In [None]:
# Scatter plot of the two clustering features: one colour per cluster, with
# the fitted centroids drawn on top as stars.
plt.figure(figsize=(12, 8))

palette = ['purple', 'blue', 'green', 'orange', 'red']
for cluster_id, colour in enumerate(palette):
    # Boolean mask selects the rows of X assigned to this cluster.
    members = X[Y_pred == cluster_id]
    income, spending = members[:, 0], members[:, 1]
    plt.scatter(income, spending, s=100, c=colour,
                label=f'Cluster {cluster_id}', alpha=0.6, edgecolors='black')

centroids = kmeans_final.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            s=300, c='yellow', marker='*', edgecolors='black', linewidths=2, label='Centroids')

plt.title('Customer Segments - K-Means Clustering')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Interpretation of the model
- Describe each group
- Which customers to target?

In [None]:
# The integer ids k-means assigns to clusters are arbitrary labels, so the
# recommendation is derived from the per-cluster averages instead of
# hard-coding which id is the "high income, high spending" segment.
income_col = 'Annual Income (k$)'
spending_col = 'Spending Score (1-100)'
profile_columns = [income_col, spending_col]

# Average income and spending score of each cluster.
segment_means = data.groupby('Cluster')[profile_columns].mean()
# The same averages expressed in standard deviations from the mean of all
# customers, which makes income and spending comparable.
segment_z = (segment_means - data[profile_columns].mean()) / data[profile_columns].std()


def _level(z_score):
    """Return 'High', 'Low' or 'Moderate' for a z-score, using +/-0.5 as cut-offs."""
    if z_score > 0.5:
        return "High"
    if z_score < -0.5:
        return "Low"
    return "Moderate"


segment_labels = {
    cluster_id: f"{_level(row[income_col])} Income + {_level(row[spending_col])} Spending"
    for cluster_id, row in segment_z.iterrows()
}

# Target segment: the cluster whose weaker dimension is still the furthest
# above average, i.e. the one that is high on income AND spending.
target_cluster = segment_z.min(axis=1).idxmax()

print("="*70)
print("TARGET SEGMENT RECOMMENDATION FOR LUXURY PRODUCT")
print("="*70)
print("\nBased on the clustering analysis, the ideal target segment is:")
print(f"\nCLUSTER {target_cluster} ({segment_labels[target_cluster]})")
print("- Characteristics:")
print(f"  * Average annual income: {segment_means.loc[target_cluster, income_col]:.1f}k$")
print(f"  * Average spending score: {segment_means.loc[target_cluster, spending_col]:.1f}")
print("  * Most likely to purchase luxury/expensive products")
print("  * Strong purchasing power and willingness to spend")
print("\nOther segments (lower priority):")
for cluster_id in segment_means.index:
    if cluster_id == target_cluster:
        continue
    print(f"- Cluster {cluster_id}: {segment_labels[cluster_id]} "
          f"(income {segment_means.loc[cluster_id, income_col]:.1f}k$, "
          f"spending {segment_means.loc[cluster_id, spending_col]:.1f})")
print("="*70)

In [None]:
# Print a short profile (size, average income / spending / age, gender
# split) for each of the five clusters.
separator = '=' * 60
for cluster_id in range(5):
    members = data.loc[data['Cluster'] == cluster_id]
    mean_income = members['Annual Income (k$)'].mean()
    mean_spending = members['Spending Score (1-100)'].mean()
    mean_age = members['Age'].mean()
    gender_split = members['Gender'].value_counts().to_dict()

    print(f"\n{separator}")
    print(f"CLUSTER {cluster_id} - {len(members)} customers")
    print(f"{separator}")
    print(f"Average Annual Income: ${mean_income:.2f}k")
    print(f"Average Spending Score: {mean_spending:.2f}")
    print(f"Average Age: {mean_age:.2f} years")
    print(f"Gender distribution: {gender_split}")