In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (the file is tab-separated, hence the explicit delimiter)
file_path = '/Users/saileshkumarm/Downloads/customer_data.csv'
data = pd.read_csv(file_path, delimiter='\t')

# Display basic information about the dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
data.info()
print(data.head())

# Handle missing values: impute Income with the column median.
# The result is assigned back to the column rather than using
# `data['Income'].fillna(..., inplace=True)`: that chained in-place call
# operates on a column selection, is deprecated in recent pandas, and under
# copy-on-write leaves `data` unchanged (NaNs would then reach the model).
data['Income'] = data['Income'].fillna(data['Income'].median())

# Convert Dt_Customer to datetime (date strings are parsed day-first)
data['Dt_Customer'] = pd.to_datetime(data['Dt_Customer'], dayfirst=True)

# Drop columns not needed for clustering
data_clustering = data.drop(columns=['ID', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue'])

# Encode categorical variables as dummy columns; drop_first removes one
# level per variable so the dummies are not perfectly collinear
data_clustering = pd.get_dummies(data_clustering, drop_first=True)

# Scale the data so every feature has zero mean and unit variance
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_clustering)

# Apply K-Means clustering.
# n_init is set explicitly: its default changed between scikit-learn
# releases (10 -> 'auto'), so relying on it triggers a FutureWarning on some
# versions and yields a different number of restarts on others. random_state
# fixes the centroid initialisation so the labels are reproducible.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
clusters = kmeans.fit_predict(data_scaled)

# Add the cluster labels to the original data (one label per row of data_scaled)
data['Cluster'] = clusters

# Visualize the clusters: Income vs. MntWines, one colour per cluster label
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=data,
    x='Income',
    y='MntWines',
    hue='Cluster',
    palette='viridis',
    ax=ax,
)
ax.set_title('Customer Segments')
ax.set_xlabel('Income')
ax.set_ylabel('Wine Purchases')
ax.legend(title='Cluster')
plt.show()
