In [103]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

> **Customer Segmentation - ML Project
> (using KMeans Clustering)**

1. Importing dependencies

In [104]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import tree
from sklearn import metrics

2. Data Collection & Analysis

In [105]:
# Load the Mall Customers CSV from the Kaggle input directory into a DataFrame.
# The path is relative to the notebook's working directory.
customer_data=pd.read_csv("../input/mall-customers/Mall_Customers.csv")

In [106]:
# Preview the first five rows of the raw customer frame.
customer_data.head(n=5)

In [107]:
# Print a summary of the raw frame: columns, non-null counts and dtypes.
customer_data.info()

In [108]:
# Size of the DataFrame as a (rows, columns) tuple.
customer_data.shape

In [109]:
# Count missing values per column so gaps are caught before modelling.
customer_data.isna().sum()

In [110]:
# Bar chart of how many customers fall into each 'Genre' category.
# The bar labels are taken from the value_counts() index rather than a
# hard-coded ['Male', 'Female'] list: value_counts() orders categories by
# descending frequency, so a fixed label order can attach a label to the
# other category's count.
genre_counts = customer_data['Genre'].value_counts()
sns.barplot(x=genre_counts.index.tolist(), y=genre_counts)

In [111]:
# One-hot encode the 'Genre' column as int64 indicator columns named
# 'Gender_<level>'. drop_first=True omits the indicator for the first level,
# so a two-level column collapses into a single 0/1 column.
df_encoder = pd.get_dummies(
    customer_data,
    columns=['Genre'],
    prefix='Gender',
    prefix_sep='_',
    drop_first=True,
    dtype='int64',
)

In [112]:
# Display the encoded frame to check the indicator column produced by get_dummies.
df_encoder

In [113]:

# Remove the CustomerID column so it is not passed on to the scaler.
# Rebinding the name (instead of inplace=True) together with errors="ignore"
# keeps this cell re-runnable: executing it a second time no longer raises
# KeyError for the already-dropped column.
df_encoder = df_encoder.drop(columns=["CustomerID"], errors="ignore")


In [114]:
# List the column names of the raw frame.
# NOTE(review): df_encoder (the frame that is scaled next) has a different
# column set after encoding and dropping CustomerID — confirm which frame was
# meant to be inspected here.
customer_data.columns

In [115]:
# Standardise every column of df_encoder with StandardScaler.
# fit() learns the per-column statistics; transform() applies them to the
# same frame, yielding the array used for PCA below.
scalar = StandardScaler()
scale = scalar.fit(df_encoder).transform(df_encoder)

3. Dimensionality Reduction

In [116]:
# Project the standardised features onto their first two principal components.
pca = PCA(n_components=2)
# The scaled array is wrapped in a DataFrame before being fitted and transformed in one step.
principal_components = pca.fit_transform(pd.DataFrame(scale))
# Keep the two components in a labelled frame; this is the input to K-Means below.
pca_df = pd.DataFrame(principal_components ,columns=["PCA1","PCA2"])
pca_df

4. Choosing the Optimal Number of Clusters

In [117]:
# WCSS: Within-Cluster Sum of Squares, i.e. the summed squared distance between
# each data point and the centroid of its cluster (scikit-learn's `inertia_`).
# One model is fitted per candidate cluster count from 1 to 10 and its WCSS is
# collected for the elbow plot.
wcss = []

for i in range(1, 11):
    # n_init is pinned explicitly: its default changed between scikit-learn
    # releases, so leaving it implicit makes the curve version-dependent and
    # triggers a FutureWarning on some versions.
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(pca_df)  # only the fitted inertia is needed, not the labels
    wcss.append(kmeans.inertia_)  # WCSS for this value of i

In [118]:
# Elbow (cut-off) graph: WCSS plotted against the number of clusters tried.
sns.set()
fig, ax = plt.subplots()
ax.plot(range(1, 11), wcss)  # x = number of clusters, y = WCSS
ax.set_title('The Elbow Point Graph')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('WCSS')
plt.show()

Optimum number of clusters = 4

5. Training the K-Means Clustering Model

In [119]:
# Fit the final 4-cluster K-Means model on the two PCA components.
# n_init is pinned explicitly: its default changed between scikit-learn
# releases, so leaving it implicit makes the clustering version-dependent.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=0)
# fit_predict returns the cluster label assigned to every row; as the last
# expression of the cell it is also what gets displayed.
kmeans.fit_predict(pca_df)


In [120]:
# Attach each row's K-Means label to the PCA frame as a 'cluster' column.
pca_df_kmeans = pca_df.assign(cluster=kmeans.labels_)
pca_df_kmeans

6. Visualizing all the clusters and their centroids

In [121]:
# Scatter the customers in PCA space, one colour per cluster label.
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(
    data=pca_df_kmeans,
    x="PCA1",
    y="PCA2",
    hue="cluster",
    palette=['red', 'green', 'blue', 'magenta'],
    ax=ax,
)

# Overlay the fitted centroids: column 0 is the PCA1 coordinate, column 1 is PCA2.
centroids = kmeans.cluster_centers_
ax.scatter(centroids[:, 0], centroids[:, 1], s=100, c='black', label='Centroids')
ax.set_title('Customer Groups')
plt.show()

In [122]:
#cluster visualization
#import plotly.express as px
#df=customer_data.iloc[['Age','Annual Income (k$)','Spending Score (1-100)']]
#px.scatter_3d(df, x='Age',y='Annual Income (k$)', z='Spending Score (1-100)',
 #            color='Label', size='Label')

In [123]:
# find all cluster centers
#cluster_centers = pd.DataFrame(data=kmeans.cluster_centers_,columns=[df_encoder.columns])
#cluster_centers
# inverse transform the data
#cluster_centers = scalar.inverse_transform(cluster_centers)
#cluster_centers = pd.DataFrame(data=cluster_centers,columns=[df_encoder.columns])
#cluster_centers

In [124]:
# Add the K-Means label of every customer to the encoded feature frame as the
# target column "Cluster".
cluster_df = df_encoder.assign(Cluster=kmeans.labels_)
cluster_df

In [125]:
# Customers with K-Means label 0 (variable names are 1-based, labels are 0-based).
cluster_1_df = cluster_df.loc[cluster_df["Cluster"].eq(0)]
cluster_1_df

In [126]:
# Customers with K-Means label 1 (variable names are 1-based, labels are 0-based).
cluster_2_df = cluster_df.loc[cluster_df["Cluster"].eq(1)]
cluster_2_df

In [127]:
# Customers with K-Means label 2 (variable names are 1-based, labels are 0-based).
cluster_3_df = cluster_df.loc[cluster_df["Cluster"].eq(2)]
cluster_3_df

In [128]:
# Customers with K-Means label 3 (variable names are 1-based, labels are 0-based).
cluster_4_df = cluster_df.loc[cluster_df["Cluster"].eq(3)]
cluster_4_df


In [129]:
# Visualization: bar chart of how many customers were assigned to each cluster label.
sns.countplot(x='Cluster', data=cluster_df)

7. Training and Testing the Model Accuracy Using a Decision Tree
Split the dataset into training and test sets

In [130]:
# Features are every column except the cluster label; the label is the target.
X = cluster_df.drop(['Cluster'], axis=1)
y = cluster_df[['Cluster']]
# Hold out 30% of the rows for testing. random_state fixes the shuffle so the
# split — and every score computed from it — is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [131]:
# Decision tree that learns to predict the K-Means cluster label from the
# customer features, splitting on information gain (entropy).
# random_state fixes the estimator's internal randomness so that refitting on
# the same data yields the same tree.
model = DecisionTreeClassifier(criterion="entropy", random_state=42)
model.fit(X_train, y_train)
# Predicted cluster labels for the held-out rows, used by the metrics below.
y_pred = model.predict(X_test)



In [132]:
# Mean accuracy on the test set (first line) and on the training set (second line).
print(
    model.score(X_test, y_test),
    model.score(X_train, y_train),
    sep="\n",
)

In [133]:
# Confusion matrix followed by the per-class precision/recall/F1 report,
# both computed from the test-set predictions.
print(
    metrics.confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

In [134]:
import pickle

In [135]:
# Serialise the trained decision tree to disk with pickle.
filename = 'customer_segmentation.sav'
# The context manager closes the file handle (flushing the bytes to disk) even
# if pickling fails; the bare open() call previously left it open.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)