# Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans

In [None]:
# Load the customer records from 'Mall_Customers.csv' (path is relative to the
# working directory) into the DataFrame used by every later cell.
dataset=pd.read_csv('Mall_Customers.csv')

# Data Visualization

In [None]:
# Preview the first rows (head() defaults to five) to sanity-check the load.
dataset.head()

In [None]:
# (number of rows, number of columns) of the raw table.
dataset.shape

In [None]:
# Per-column summary: non-null counts and dtypes.
dataset.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the columns
# describe() selects by default.
dataset.describe()

In [None]:
# Data type of each column.
dataset.dtypes

In [None]:
# Column labels as loaded from the CSV.
dataset.columns

In [None]:
# Replace the two labels that contain spaces and units with short identifiers.
# inplace=True mutates `dataset` itself, so later cells must use the new names.
dataset.rename(columns={'Annual Income (k$)':'AnnualIncome','Spending Score (1-100)':'SpendingScore'},inplace=True)

In [None]:
# Preview again to confirm the new column names.
dataset.head()

In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

In [None]:
# Pairwise correlation of the numeric columns only. The frame still contains
# the string-valued 'Gender' column here, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so select the numeric columns first.
dataset.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the correlation matrix. Only numeric columns are correlated:
# 'Gender' holds strings and would make DataFrame.corr() raise in pandas >= 2.0.
sns.heatmap(dataset.select_dtypes(include='number').corr(),annot=True,fmt='.1f')
plt.show()

In [None]:
# Remove the 'CustomerID' column in place (axis=1 selects columns).
# NOTE(review): presumably a row identifier with no analytical value - confirm.
dataset.drop('CustomerID',axis=1,inplace=True)

In [None]:
# Preview to confirm 'CustomerID' is gone.
dataset.head()

In [None]:
# Shape after the drop; the column count should be one lower than before.
dataset.shape

In [None]:
# Number of rows per 'Gender' value, most frequent first.
dataset['Gender'].value_counts()

In [None]:
# Bar chart of the number of customers per gender.
# The column is passed by keyword: since seaborn 0.12 the first positional
# argument of countplot is `data`, so a positional Series is no longer
# interpreted as the x variable.
sns.countplot(x='Gender', data=dataset)
plt.title('Gender')
plt.show()

In [None]:
# Distinct values present in the 'Age' column.
dataset['Age'].unique()

In [None]:
# Pie chart of the gender split.
# Labels and wedge sizes are taken from the SAME value_counts() result so each
# wedge is paired with its own category. (unique() returns categories in order
# of first appearance while value_counts() sorts by frequency, so mixing the
# two can swap the labels.)
gender_counts=dataset['Gender'].value_counts()
labels=gender_counts.index
values=gender_counts.values
# Pull out the first (largest) wedge; sized to however many categories exist.
explode=[0.1 if i == 0 else 0 for i in range(len(values))]
plt.figure(figsize=(7,8))
plt.pie(values,explode=explode,labels=labels,autopct='%1.1f%%')
plt.title('Gender Count in Percentage')
plt.show()

In [None]:
# Bar chart of spending score against annual income, one bar colour per
# gender, drawn on a wide canvas with slanted tick labels.
fig, ax = plt.subplots(figsize=(25, 10))
sns.barplot(data=dataset, x="AnnualIncome", y="SpendingScore", hue="Gender", ax=ax)
plt.xticks(rotation=45)
plt.show()

In [None]:
# One histogram per numeric column, laid out as a grid of subplots.
dataset.hist(figsize=(18,12))
# suptitle() titles the whole figure; plt.title() would only label whichever
# subplot happens to be the current (last-created) axes.
plt.suptitle('All Data Show Histogram System')
plt.show()

In [None]:
# Bar chart of how many customers have each age.
age_counts = dataset['Age'].value_counts()
plt.figure(figsize=(15, 7))
sns.barplot(x=age_counts.index, y=age_counts.values)
plt.title('Age vs Rate State')
plt.xlabel('Age')
plt.ylabel('Rate')
plt.show()

In [None]:
#implementing the k-means clustering algorithm
class K_means:
    """Minimal k-means clusterer (Lloyd's algorithm) built on NumPy.

    Parameters:
        k: number of clusters.
        tol: convergence threshold, in percent. Iteration stops once the
            summed absolute relative movement of every centroid is at or
            below this value.
        max_iter: upper bound on the number of assign/update rounds.

    Attributes set by ``fit``:
        centroids: dict mapping cluster index (0..k-1) to its centroid array.
        classes: dict mapping cluster index to the list of samples assigned
            to it in the last round.
    """

    def __init__(self, k=3, tol=0.001, max_iter=300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter

    def _closest_centroid(self, features):
        """Return the index of the centroid nearest to ``features``.

        Ties resolve to the lowest cluster index.
        """
        distances = [
            np.linalg.norm(features - self.centroids[centroid])
            for centroid in self.centroids
        ]
        return distances.index(min(distances))

    def fit(self, dataset):
        """Cluster ``dataset`` and store the result on the instance.

        Parameters:
            dataset: 2-D array-like of numeric samples, one sample per row.
                The first ``k`` rows seed the centroids.

        Raises:
            IndexError: if ``dataset`` has fewer than ``k`` rows.
        """
        data = np.asarray(dataset, dtype=float)

        # Seed with copies so the centroids never alias the caller's array.
        self.centroids = {i: data[i].copy() for i in range(self.k)}

        for _ in range(self.max_iter):
            # Assignment step: every sample joins its nearest centroid.
            self.classes = {i: [] for i in range(self.k)}
            for features in data:
                self.classes[self._closest_centroid(features)].append(features)

            previous = dict(self.centroids)

            # Update step. A cluster that received no samples keeps its old
            # centroid; averaging an empty list would produce NaN.
            for classification, members in self.classes.items():
                if members:
                    self.centroids[classification] = np.average(members, axis=0)

            is_optimal = True
            for centroid in self.centroids:
                original_centroid = previous[centroid]
                curr = self.centroids[centroid]
                # Absolute movement, so shifts in opposite directions cannot
                # cancel out and a purely negative shift is not ignored.
                movement = np.abs(curr - original_centroid)
                # Relative to the old coordinate; where that is 0 fall back
                # to the absolute movement instead of dividing by zero.
                scale = np.abs(original_centroid)
                scale = np.where(scale == 0, 1.0, scale)
                if np.sum(movement / scale * 100.0) > self.tol:
                    is_optimal = False
                    break

            if is_optimal:
                break

    def predict(self, data):
        """Return the cluster index of the centroid closest to sample ``data``.

        ``data`` is a single sample (1-D array-like); ``fit`` must have been
        called first so that ``centroids`` exists.
        """
        return self._closest_centroid(np.asarray(data, dtype=float))

In [None]:
# Create the hand-written K_means model with k=5 clusters; tol and max_iter
# keep their constructor defaults.
km=K_means(5)

In [None]:
# Fit on every column except the first, passed as a NumPy array.
# NOTE(review): after the CustomerID drop the first column is presumably
# 'Gender', leaving Age / AnnualIncome / SpendingScore - confirm against the CSV.
km.fit(dataset.iloc[:,1:].values)

In [None]:
# Colour lookup for the cluster plot: the five-colour list repeated ten times
# (50 entries), indexed by cluster id.
colors = 10*["r", "g", "c", "b", "k"]

In [None]:
# Mark each fitted centroid with a large "x", using its first two coordinates.
for center in km.centroids.values():
    plt.scatter(center[0], center[1], s=130, marker="x")

In [None]:
# Draw every sample (first two coordinates) in the colour of its cluster.
for cluster_id, members in km.classes.items():
    member_color = colors[cluster_id]
    for point in members:
        plt.scatter(point[0], point[1], color=member_color, s=30)
plt.show()

In [None]:
# Elbow method: fit scikit-learn's KMeans for k = 1..10 and plot the
# within-cluster sum of squares (inertia_) against k.
# Features are selected by name so the cell gives the same result when re-run
# after a 'label' column has been added to `dataset`.
features=dataset[['Age','AnnualIncome','SpendingScore']].values
k_values=range(1,11)
wcss=[]
for k in k_values:
    # A separate name keeps the fitted custom model in `km` intact, and a
    # fixed random_state makes the curve reproducible between runs.
    model=KMeans(n_clusters=k,n_init=10,random_state=42)
    model.fit(features)
    wcss.append(model.inertia_)

plt.figure(figsize=(15,7))
plt.grid()
plt.plot(k_values,wcss)
plt.xlabel("K Value")
plt.xticks(np.arange(1,11,1))
plt.ylabel("WCSS")
plt.show()

In [None]:
# Cluster with scikit-learn's KMeans (k=5) and store each row's cluster id.
# fit_predict() returns the array of labels; fit() returns the estimator
# itself, which would fill the 'label' column with the model object.
# Features are selected by name so that re-running the cell does not feed the
# previously written 'label' column back into the clustering.
kmeans=KMeans(n_clusters=5,n_init=10,random_state=42)
clusters = kmeans.fit_predict(dataset[['Age','AnnualIncome','SpendingScore']].values)
dataset["label"] = clusters

In [None]:
# Show the fitted centroid coordinates (one row per cluster, one column per
# feature the model was fitted on).
kmeans.cluster_centers_

In [None]:
# Plot the centroids found by the scikit-learn model.
# Coordinates are read from kmeans.cluster_centers_ rather than copied in by
# hand, so the plot always matches the current fit. Only the first two
# features are used as x/y: the third positional argument of plt.scatter is
# the marker SIZE, not a z coordinate.
centroid_colors = ["r", "g", "c", "b", "k"]
for center, color in zip(kmeans.cluster_centers_, centroid_colors):
    plt.scatter(center[0], center[1], s=130, marker="x", c=color)
plt.show()