<a href="https://colab.research.google.com/github/Sara-Esm/Customer-Segmentation/blob/main/Customer_Segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import numpy as np #linear algebra
import pandas as pd # data processing in a 2D array format
import matplotlib.pyplot as plt #visualizations
import seaborn as sb #visualizations

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings('ignore')

In [40]:
df = pd.read_csv('/Train.csv')
df.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [41]:
#To check the shape of the dataset
df.shape

(8068, 11)

In [42]:
#To get the information of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8068 non-null   int64  
 1   Gender           8068 non-null   object 
 2   Ever_Married     7928 non-null   object 
 3   Age              8068 non-null   int64  
 4   Graduated        7990 non-null   object 
 5   Profession       7944 non-null   object 
 6   Work_Experience  7239 non-null   float64
 7   Spending_Score   8068 non-null   object 
 8   Family_Size      7733 non-null   float64
 9   Var_1            7992 non-null   object 
 10  Segmentation     8068 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB


In [43]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,8068.0,463479.214551,2595.381232,458982.0,461240.75,463472.5,465744.25,467974.0
Age,8068.0,43.466906,16.711696,18.0,30.0,40.0,53.0,89.0
Work_Experience,7239.0,2.641663,3.406763,0.0,0.0,1.0,4.0,14.0
Family_Size,7733.0,2.850123,1.531413,1.0,2.0,3.0,4.0,9.0


In [44]:
#Improving the values in the Accepted column
df['Accepted'] = df['Accepted'].str.replace('Accepted', '')

KeyError: 'Accepted'

In [None]:
#To check the null values in the dataset
for col in df.columns:
	temp = df[col].isnull().sum()
	if temp > 0:
		print(f'Column {col} contains {temp} null values.')

In [None]:
df = df.dropna()
print("Total missing values are:", len(df))

In [None]:
#To find the total number of unique values in each column
df.nunique()

In [None]:
parts = df["Dt_Customer"].str.split("-", n=3, expand=True)
df["day"] = parts[0].astype('int')
df["month"] = parts[1].astype('int')
df["year"] = parts[2].astype('int')

In [None]:
df.drop(['Z_CostContact', 'Z_Revenue', 'Dt_Customer'],
		axis=1,
		inplace=True)

In [None]:
#Data Visualization and Analysis
floats, objects = [], []
for col in df.columns:
	if df[col].dtype == object:
		objects.append(col)
	elif df[col].dtype == float:
		floats.append(col)

print(objects)
print(floats)

In [None]:
plt.subplots(figsize=(15, 10))
for i, col in enumerate(objects):
	plt.subplot(2, 2, i + 1)
	sb.countplot(df[col])
plt.show()

In [None]:
df['Marital_Status'].value_counts()

In [None]:
plt.subplots(figsize=(15, 10))
for i, col in enumerate(objects):
	plt.subplot(2, 2, i + 1)
	sb.countplot(df[col], hue=df['Response'])
plt.show()

In [None]:
#Label Encoding; convert the categorical values into the numerical values so that model can understand it
for col in df.columns:
	if df[col].dtype == object:
		le = LabelEncoder()
		df[col] = le.fit_transform(df[col])

In [None]:
#Heatmap; to visualize the correlation among the different features of dataset
plt.figure(figsize=(15, 15))
sb.heatmap(df.corr() > 0.8, annot=True, cbar=False)
plt.show()

In [None]:
#Standardization;It scales down the data
scaler = StandardScaler()
data = scaler.fit_transform(df)

In [None]:
#Segmentation
from sklearn.manifold import TSNE
model = TSNE(n_components=2, random_state=0)
tsne_data = model.fit_transform(df)
plt.figure(figsize=(7, 7))
plt.scatter(tsne_data[:, 0], tsne_data[:, 1])
plt.show()

In [None]:
#KMeans Clustering
error = []
for n_clusters in range(1, 21):
	model = KMeans(init='k-means++',
				n_clusters=n_clusters,
				max_iter=500,
				random_state=22)
	model.fit(df)
	error.append(model.inertia_)

In [None]:
plt.figure(figsize=(10, 5))
sb.lineplot(x=range(1, 21), y=error)
sb.scatterplot(x=range(1, 21), y=error)
plt.show()

In [None]:
# create clustering model with optimal k=5
model = KMeans(init='k-means++',
			n_clusters=5,
			max_iter=500,
			random_state=22)
segments = model.fit_predict(df)

In [None]:
#Scatterplot
plt.figure(figsize=(7, 7))
sb.scatterplot(tsne_data[:, 0], tsne_data[:, 1], hue=segments)
plt.show()