In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Apply seaborn's theme with the 'whitegrid' axes style; this is a global
# setting, so it affects every figure drawn after this point.
sns.set(style="whitegrid")

In [None]:
# Read the customer table from a CSV resolved relative to the working
# directory, then preview the first rows as the cell output.
csv_path = 'ifood_df.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Report the number of missing values per column before any cleaning.
# NOTE(review): the counts are only printed; nothing below imputes or drops
# missing values, so confirm they are all zero for the clustering features.
print(df.isnull().sum())

# Remove exact duplicate rows and two columns that are not used below
# (presumably constant bookkeeping columns -- confirm against the data).
# The frame is rebound instead of mutated in place, and already-missing
# columns are tolerated, so re-running this cell does not raise KeyError.
df = (
    df.drop_duplicates()
      .drop(columns=['Z_CostContact', 'Z_Revenue'], errors='ignore')
)

In [None]:
# Columns fed to the clustering model, grouped by theme.
features = [
    # income and recency
    'Income',
    'Recency',
    # amount spent per product category
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
    # purchase counts per channel, plus monthly web visits
    'NumDealsPurchases',
    'NumWebPurchases',
    'NumCatalogPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
]

# Feature matrix: every row of df, restricted to the columns listed above.
X = df.loc[:, features]

In [None]:
# Standardize each feature (zero mean, unit variance) so that columns with
# large magnitudes do not dominate the distance computations in K-Means.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Elbow method: fit K-Means for every candidate cluster count and record the
# within-cluster sum of squared distances (inertia) of each fit.
k_values = range(1, 11)
sse = []
for k in k_values:
    # n_init is pinned explicitly: the library default has changed between
    # scikit-learn releases, so relying on it makes the curve depend on the
    # installed version (and triggers a FutureWarning on some of them).
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    sse.append(kmeans.inertia_)

# Plot SSE against k; the "elbow" is where adding clusters stops paying off.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, sse, marker='o')
ax.set_xticks(list(k_values))  # cluster counts are integers; tick each one
ax.set_xlabel('Number of clusters')
ax.set_ylabel('SSE')
ax.set_title('Elbow Method for Optimal k')
plt.show()

In [None]:
# Fit the final model with 4 clusters (presumably the value read off the
# elbow plot -- confirm) and store each customer's cluster label on df.
# n_init is pinned explicitly because the library default differs between
# scikit-learn releases; with a fixed seed and a fixed n_init the labels are
# reproducible across versions.
# NOTE(review): K-Means label numbering is arbitrary. After any change to the
# model settings, re-check the hard-coded segment descriptions printed at the
# end of the notebook against the per-segment feature means.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
df['Segment'] = kmeans.fit_predict(X_scaled)

# Number of customers assigned to each segment.
df['Segment'].value_counts()

In [None]:
# Project the standardized features onto their first two principal
# components so the segments can be drawn on a plane.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(X_scaled)

# Transposing yields one row per component, which unpacks into two columns.
df['PCA1'], df['PCA2'] = pca_components.T

# Scatter of the 2-D projection, one colour per segment.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x='PCA1',
    y='PCA2',
    hue='Segment',
    data=df,
    palette='Set2',
    s=100,
    ax=ax,
)
ax.set_title('Customer Segmentation Visualized')
plt.show()

In [None]:
# Mean of every clustering feature within each segment (one row per segment).
segment_summary = df[features].groupby(df['Segment']).mean()
print(segment_summary)

In [None]:
# Hand-written interpretation of each segment, printed one per line.
# NOTE(review): these descriptions are hard-coded, not derived from the data;
# verify them against the per-segment means whenever the clustering is re-run.
segment_descriptions = (
    "Segment 0: High spenders, likely loyal customers.",
    "Segment 1: Low activity, may need engagement.",
    "Segment 2: Frequent small purchases, target with combos.",
    "Segment 3: Occasional big spenders, offer retention discounts.",
)
print(*segment_descriptions, sep="\n")