In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.cluster import KMeans, k_means
from sklearn.decomposition import PCA 

In [None]:
# Read the raw sales export (path is relative to the working directory).
# NOTE(review): no `encoding` is passed, so pandas uses its default (UTF-8) —
# confirm the CSV decodes cleanly with that.
df = pd.read_csv(filepath_or_buffer="sales_data_sample.csv")

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape 

In [None]:
# Summary statistics produced by DataFrame.describe() with default settings.
df.describe()

In [None]:
# Print the column listing with non-null counts and dtypes.
df.info()

In [None]:
# Missing values per column, before any columns are removed.
df.isna().sum()

In [None]:
# Column dtypes of the raw frame.
df.dtypes

In [None]:
# Columns removed up front; none of them are used by the clustering below.
df_drop = [
    'ADDRESSLINE1',
    'ADDRESSLINE2',
    'STATUS',
    'POSTALCODE',
    'CITY',
    'TERRITORY',
    'PHONE',
    'STATE',
    'CONTACTFIRSTNAME',
    'CONTACTLASTNAME',
    'CUSTOMERNAME',
    'ORDERNUMBER',
]
df = df.drop(columns=df_drop)

In [None]:
# Re-check missing values per column on the remaining columns.
df.isna().sum()

In [None]:
# Column dtypes after the first round of column removal.
df.dtypes

In [None]:
# Distinct COUNTRY values. Note: COUNTRY is only inspected here; a later cell
# drops it without encoding it as a feature.
df['COUNTRY'].unique()

In [None]:
# Distinct PRODUCTLINE values (this column is one-hot encoded in a later cell).
df['PRODUCTLINE'].unique()

In [None]:
# Distinct DEALSIZE values (this column is one-hot encoded in a later cell).
df['DEALSIZE'].unique()

In [None]:
# One-hot encode the two categorical columns that are kept as features.
# dtype=int forces 0/1 integer indicators: recent pandas versions default to
# bool dummies, and mixing bool with numeric columns makes `df.values` fall
# back to an object-dtype array when the feature matrix is built later.
productline = pd.get_dummies(df['PRODUCTLINE'], dtype=int)
Dealsize = pd.get_dummies(df['DEALSIZE'], dtype=int)

In [None]:
# Append the indicator columns to the right of the existing columns.
df = pd.concat(objs=[df, productline, Dealsize], axis='columns')

In [None]:
# Remove the original categorical columns: PRODUCTLINE and DEALSIZE now exist
# as indicator columns, and COUNTRY is discarded without being encoded.
df_drop = ['COUNTRY', 'PRODUCTLINE', 'DEALSIZE']
df = df.drop(columns=df_drop)

In [None]:
# Replace each product code with the integer code of its category.
# NOTE(review): integer codes impose an arbitrary order/distance on what looks
# like a nominal field, and KMeans treats them as numeric — confirm intended.
df['PRODUCTCODE'] = df['PRODUCTCODE'].astype('category').cat.codes

In [None]:
# ORDERDATE is removed rather than parsed; the frame is modified in place.
df.drop(columns=['ORDERDATE'], inplace=True)

In [None]:
# Final check of the column dtypes before the frame is passed to KMeans.
df.dtypes 

In [None]:
# Elbow method: fit KMeans for k = 1..9 and record each model's inertia
# (within-cluster sum of squared distances).
# random_state makes the curve reproducible between runs (the same seed as the
# final model is used); n_init is pinned so the number of restarts does not
# depend on the installed scikit-learn version's default.
distortions = []
K = range(1, 10)
for k in K:
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=2)
    kmeanModel.fit(df)
    distortions.append(kmeanModel.inertia_)

In [None]:
# Plot inertia against k; the bend in the curve suggests the cluster count.
plt.figure(figsize=(16, 8))
plt.plot(K, distortions, color='b', marker='x', linestyle='-')
plt.title('The Elbow Method showing the optimal k')
plt.ylabel('Distortion')
plt.xlabel('k')
plt.show()

In [None]:
# Feature matrix as a NumPy array, one row per order line.
# NOTE(review): no scaling is applied before KMeans/PCA, so large-magnitude
# columns will dominate the distances — confirm this is intended.
X_train = df.to_numpy()

In [None]:
# Dimensions of the feature matrix as (samples, features).
X_train.shape

In [None]:
# Final model: 3 clusters with a fixed seed.
# n_init is pinned so results do not change with the scikit-learn version's
# default, and fit_predict fits the model and returns the training labels in
# one pass instead of a separate fit followed by predict on the same data.
model = KMeans(n_clusters=3, n_init=10, random_state=2)
predictions = model.fit_predict(X_train)

In [None]:
# Distinct cluster labels that were assigned, and how many samples fell into each.
unique,counts = np.unique(predictions,return_counts=True)

In [None]:
# Turn the 1-D counts into a single row so it can become a one-row DataFrame.
# -1 lets NumPy infer the width, so this keeps working if the number of
# populated clusters is not exactly 3.
counts = counts.reshape(1, -1)

In [None]:
# One-row table of cluster sizes. Column names are derived from the labels
# returned by np.unique (label 0 -> 'Cluster1', ...), so each name is tied to
# the label it counts rather than to a hard-coded list of three.
counts_df = pd.DataFrame(
    counts,
    columns=[f'Cluster{label + 1}' for label in unique],
)

In [None]:
# Display the cluster-size table.
counts_df.head()

In [None]:
# PCA reducer keeping two components, used to project the data for 2-D plots.
pca = PCA(n_components=2) 

In [None]:
# Fit the PCA on the feature matrix and keep the 2-D projection as a frame.
reduced_X = pd.DataFrame(
    data=pca.fit_transform(X_train),
    columns=['PCA1', 'PCA2'],
)

In [None]:
# Preview the first rows of the 2-D projection.
reduced_X.head()

In [None]:

# Scatter of all samples in the 2-D PCA space.
# Axis labels, a title and plt.show() are added so the figure stands on its
# own and the cell does not echo the artist repr, matching the elbow plot cell.
plt.figure(figsize=(14, 10))
plt.scatter(reduced_X['PCA1'], reduced_X['PCA2'])
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('Samples projected onto the first two principal components')
plt.show()

In [None]:
# Cluster centres of the fitted KMeans model, in the original feature space.
model.cluster_centers_ 

In [None]:
# Project the cluster centres with the already-fitted PCA so they can be drawn
# on the same 2-D axes as the samples.
reduced_centers = pca.transform(model.cluster_centers_) 

In [None]:
# Display the projected centre coordinates.
reduced_centers

In [None]:
# Scatter of all samples in PCA space with the projected centroids overlaid.
# Labels, a title, a legend and plt.show() are added so the figure is
# self-explanatory and the cell does not echo the artist repr.
plt.figure(figsize=(14, 10))
plt.scatter(reduced_X['PCA1'], reduced_X['PCA2'], label='Samples')
# Plotting the centroids
plt.scatter(
    reduced_centers[:, 0],
    reduced_centers[:, 1],
    color='black',
    marker='x',
    s=300,
    label='Centroids',
)
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('PCA projection with KMeans centroids')
plt.legend()
plt.show()

In [None]:
# Attach each sample's KMeans label to its row in the 2-D projection.
reduced_X['Clusters'] = predictions 

In [None]:
# Preview the projection with the new Clusters column.
reduced_X.head()

In [None]:
 
# Scatter of the PCA projection coloured by assigned cluster, with the
# projected centroids overlaid as black crosses.
# A single loop replaces three copy-pasted scatter calls, and each cluster's
# rows are selected once instead of recomputing the boolean mask per axis.
cluster_colors = {0: 'slateblue', 1: 'springgreen', 2: 'indigo'}

plt.figure(figsize=(14, 10))
for cluster_id, cluster_color in cluster_colors.items():
    members = reduced_X[reduced_X['Clusters'] == cluster_id]
    plt.scatter(
        members['PCA1'],
        members['PCA2'],
        color=cluster_color,
        label=f'Cluster {cluster_id}',
    )

plt.scatter(
    reduced_centers[:, 0],
    reduced_centers[:, 1],
    color='black',
    marker='x',
    s=300,
    label='Centroids',
)
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('KMeans clusters in PCA space')
plt.legend()
plt.show()