In [126]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statistics as stats
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import warnings

warnings.filterwarnings('ignore')

In [127]:
db = pd.read_excel('data.xlsx')
db.to_csv('data.csv', index=False)

In [128]:
db.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [129]:
db.tail()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
525456,538171,22271,FELTCRAFT DOLL ROSIE,2,2010-12-09 20:01:00,2.95,17530.0,United Kingdom
525457,538171,22750,FELTCRAFT PRINCESS LOLA DOLL,1,2010-12-09 20:01:00,3.75,17530.0,United Kingdom
525458,538171,22751,FELTCRAFT PRINCESS OLIVIA DOLL,1,2010-12-09 20:01:00,3.75,17530.0,United Kingdom
525459,538171,20970,PINK FLORAL FELTCRAFT SHOULDER BAG,2,2010-12-09 20:01:00,3.75,17530.0,United Kingdom
525460,538171,21931,JUMBO STORAGE BAG SUKI,2,2010-12-09 20:01:00,1.95,17530.0,United Kingdom


In [130]:
db.shape

(525461, 8)

In [131]:
db.dtypes

Invoice                object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
Price                 float64
Customer ID           float64
Country                object
dtype: object

In [132]:
db.isnull().sum()

Invoice             0
StockCode           0
Description      2928
Quantity            0
InvoiceDate         0
Price               0
Customer ID    107927
Country             0
dtype: int64

In [133]:
db.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
525456    False
525457    False
525458    False
525459    False
525460    False
Length: 525461, dtype: bool

In [134]:
db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      525461 non-null  object        
 1   StockCode    525461 non-null  object        
 2   Description  522533 non-null  object        
 3   Quantity     525461 non-null  int64         
 4   InvoiceDate  525461 non-null  datetime64[ns]
 5   Price        525461 non-null  float64       
 6   Customer ID  417534 non-null  float64       
 7   Country      525461 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 32.1+ MB


In [135]:
db.dropna(inplace=True)

In [136]:
db.shape

(417534, 8)

In [137]:
db.nunique()

Invoice        23587
StockCode       4031
Description     4459
Quantity         500
InvoiceDate    21786
Price            664
Customer ID     4383
Country           37
dtype: int64

In [138]:
#There is no target varibale for clustering
#Dropped non values as there is a sufficient amount of data to work with

In [139]:
first = [489434,12,2.95, 17530.0]
second = [489434,12,6.75, 13085.0]
total = 0
for i in range(len(first)):
    print((first[i]-second[i])**2)
    total += (first[i]-second[i])**2
total ** 0.5

0
0
14.44
19758025.0


4445.001624296667

Extremely far away due to the last feature.

In [140]:
db.drop(['Customer ID'], axis=1)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,United Kingdom
...,...,...,...,...,...,...,...
525456,538171,22271,FELTCRAFT DOLL ROSIE,2,2010-12-09 20:01:00,2.95,United Kingdom
525457,538171,22750,FELTCRAFT PRINCESS LOLA DOLL,1,2010-12-09 20:01:00,3.75,United Kingdom
525458,538171,22751,FELTCRAFT PRINCESS OLIVIA DOLL,1,2010-12-09 20:01:00,3.75,United Kingdom
525459,538171,20970,PINK FLORAL FELTCRAFT SHOULDER BAG,2,2010-12-09 20:01:00,3.75,United Kingdom


In [141]:
price = db['Price']
countries = db['Country']
label_encoder = LabelEncoder()
countries_encoded = label_encoder.fit_transform(countries) + 1 

In [142]:
country_mapping = {country: i + 1 for i, country in enumerate(sorted(countries.unique()))}
countries_encoded = countries.map(country_mapping)

In [143]:
db['countries_encoded'] = countries_encoded

In [144]:
db.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,countries_encoded
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,35
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,35
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,35
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,35
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,35


In [145]:
first = [489434,12,2.95, 35]
second = [489434,12,6.75, 35]
total = 0
for i in range(len(first)):
    print((first[i]-second[i])**2)
    total += (first[i]-second[i])**2
total ** 0.5

0
0
14.44
0


3.8

Points are now far closer.

In [146]:
db_new = db.drop(['StockCode', 'Description', 'InvoiceDate'], axis=1)

In [147]:
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

In [148]:
reshape = np.array([489434, 12, 2.95, 35]).reshape(-1, 1)  # Reshape into a 2D array with a single column

# Perform KMeans clustering
kmeans = KMeans(n_clusters=3,)  # Example: using 3 clusters
cluster_labels = kmeans.fit_predict(reshape)

print(cluster_labels)

[1 0 0 2]


In [149]:
x = db_new

In [150]:
y = db['Country']

In [151]:
scaler = StandardScaler()

In [152]:
x = scaler.fit_transform(x)

ValueError: could not convert string to float: 'C489449'

In [None]:
x = pd.DataFrame(data=x, columns=df.columns[1:5])

In [None]:
data = pd.DataFrame({'Country': countries, 'Price': price})

In [None]:
scaler = StandardScaler()
data[['Price']] = scaler.fit_transform(data[['Price']])

column_transformer = ColumnTransformer(
    [('encoder', OneHotEncoder(), [0])],  # [0] indicates the column index to one-hot encode
    remainder='passthrough')

data_encoded = column_transformer.fit_transform(data)

print(data)

In [None]:
kmeans = KMeans(n_clusters=3)  # Example: using 3 clusters
cluster_labels = kmeans.fit_predict(data_encoded)

# Plot the clusters
plt.scatter(data_encoded[:, 0], data_encoded[:, 1], c=cluster_labels, cmap='viridis', marker='o')
plt.xlabel('Country')
plt.ylabel('Price')
plt.title('Cluster Graph')
plt.colorbar(label='Cluster')
plt.show()