In [1]:
import pandas as pd 
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift


In [2]:
laptop_data = pd.read_csv('laptops.csv')
laptop_data.columns = ["name", "price"]
laptop_data.head()

Unnamed: 0,name,price
0,"Apple MacBook Pro M1 2020 13"" 8 GB RAM & 256 G...",10622
1,Samsung Galaxy Book Pro 360 - Intel Core i7 & ...,11999
2,ASUS ROG Strix G17 G713QM-HX015T - RTX3060 & R...,11999
3,Lenovo Legion 5 Pro 16ACH6H - RTX3070 & Ryzen ...,12499
4,MSI Summit E13 Flip Evo - Intel Core i7 1185G7...,14546


In [3]:
label_enc =preprocessing.LabelEncoder()
laptop_data['name'] = label_enc.fit_transform(laptop_data['name'].astype(str))
laptop_data.head()

Unnamed: 0,name,price
0,315,10622
1,791,11999
2,42,11999
3,552,12499
4,693,14546


In [4]:
est_bandwidth = estimate_bandwidth(laptop_data)
print(est_bandwidth)

4015.32229866256


In [5]:
analyzer = MeanShift(bandwidth=3000) 
analyzer.fit(laptop_data)

MeanShift(bandwidth=3000, bin_seeding=False, cluster_all=True, max_iter=300,
          min_bin_freq=1, n_jobs=None, seeds=None)

In [7]:
labels = analyzer.labels_
print(labels)
print('\n\n',np.unique(labels))

[0 0 0 ... 0 0 0]


 [0 1 2 3 4 5]


In [11]:
laptop_data['name'] = label_enc.inverse_transform(laptop_data['name'])

Unnamed: 0,name,price
0,"Apple MacBook Pro M1 2020 13"" 8 GB RAM & 256 G...",10622
1,Samsung Galaxy Book Pro 360 - Intel Core i7 & ...,11999
2,ASUS ROG Strix G17 G713QM-HX015T - RTX3060 & R...,11999
3,Lenovo Legion 5 Pro 16ACH6H - RTX3070 & Ryzen ...,12499
4,MSI Summit E13 Flip Evo - Intel Core i7 1185G7...,14546
...,...,...
1232,HP 17-ca2006no,4379
1233,HP *DEMO* ProBook 455 G7 Ryzen 3 8GB 256GB,5454
1234,Lenovo IdeaPad 5 14ARE05,5781
1235,Lenovo *DEMO* IdeaPad Flex 5 14ARE05 - Ryzen 7...,6860


In [12]:
laptop_data['cluster_group'] = np.nan
for i in range(len(laptop_data)): # loop 714 rows
    laptop_data.iloc[i,laptop_data.columns.get_loc('cluster_group')] = labels[i] #set the cluster label on each row

laptop_data

Unnamed: 0,name,price,cluster_group
0,"Apple MacBook Pro M1 2020 13"" 8 GB RAM & 256 G...",10622,0.0
1,Samsung Galaxy Book Pro 360 - Intel Core i7 & ...,11999,0.0
2,ASUS ROG Strix G17 G713QM-HX015T - RTX3060 & R...,11999,0.0
3,Lenovo Legion 5 Pro 16ACH6H - RTX3070 & Ryzen ...,12499,0.0
4,MSI Summit E13 Flip Evo - Intel Core i7 1185G7...,14546,0.0
...,...,...,...
1232,HP 17-ca2006no,4379,0.0
1233,HP *DEMO* ProBook 455 G7 Ryzen 3 8GB 256GB,5454,0.0
1234,Lenovo IdeaPad 5 14ARE05,5781,0.0
1235,Lenovo *DEMO* IdeaPad Flex 5 14ARE05 - Ryzen 7...,6860,0.0


In [15]:
laptop_data.describe()

Unnamed: 0,price,cluster_group
count,1237.0,1237.0
mean,11736.92886,0.226354
std,6833.333824,0.50131
min,1533.0,0.0
25%,6768.0,0.0
50%,10504.0,0.0
75%,15155.0,0.0
max,64146.0,5.0


In [14]:
#Grouping laptops by Cluster
laptop_cluster_data = laptop_data.groupby(['cluster_group']).mean()
#Count of laptops in each cluster
laptop_cluster_data['Counts'] = pd.Series(laptop_data.groupby(['cluster_group']).size())
laptop_cluster_data

Unnamed: 0_level_0,price,Counts
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,9116.998991,991
1.0,20742.348416,221
2.0,32613.736842,19
3.0,42136.25,4
4.0,47224.0,1
5.0,64146.0,1


In [16]:
laptop_data

Unnamed: 0,name,price,cluster_group
0,"Apple MacBook Pro M1 2020 13"" 8 GB RAM & 256 G...",10622,0.0
1,Samsung Galaxy Book Pro 360 - Intel Core i7 & ...,11999,0.0
2,ASUS ROG Strix G17 G713QM-HX015T - RTX3060 & R...,11999,0.0
3,Lenovo Legion 5 Pro 16ACH6H - RTX3070 & Ryzen ...,12499,0.0
4,MSI Summit E13 Flip Evo - Intel Core i7 1185G7...,14546,0.0
...,...,...,...
1232,HP 17-ca2006no,4379,0.0
1233,HP *DEMO* ProBook 455 G7 Ryzen 3 8GB 256GB,5454,0.0
1234,Lenovo IdeaPad 5 14ARE05,5781,0.0
1235,Lenovo *DEMO* IdeaPad Flex 5 14ARE05 - Ryzen 7...,6860,0.0
