In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

In [2]:
data = pd.read_csv('../Data_bases/data_for_clustering.csv')

Scaling the data that will be used in the first cluster. 
First cluster goal:
    Discriminate the phone models by performance.

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1359 entries, 0 to 1358
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   brand                   1359 non-null   object 
 1   model                   1359 non-null   object 
 2   battery_capacity_(mah)  1359 non-null   int64  
 3   screen_size_(inches)    1359 non-null   float64
 4   touchscreen             1359 non-null   object 
 5   resolution_x            1359 non-null   int64  
 6   resolution_y            1359 non-null   int64  
 7   processor               1359 non-null   int64  
 8   ram_(mb)                1359 non-null   int64  
 9   internal_storage_(gb)   1359 non-null   float64
 10  rear_camera             1359 non-null   float64
 11  front_camera            1359 non-null   float64
 12  operating_system        1359 non-null   object 
 13  wi-fi                   1359 non-null   object 
 14  bluetooth               1359 non-null   

In [4]:
data_cluster_perf_df = data[['processor','ram_(mb)','internal_storage_(gb)']]

In [5]:
#Use standard scaler on columns
transformer = MinMaxScaler().fit(data_cluster_perf_df)
data_cluster_perf_scaled = transformer.transform(data_cluster_perf_df)

data_cluster_perf_scaled = pd.DataFrame(data_cluster_perf_scaled, columns=data_cluster_perf_df.columns)

Using the KMeans to cluster the chosen columns

In [6]:
model = KMeans(n_clusters=6)
yhat = model.fit_predict(data_cluster_perf_scaled)
model.cluster_centers_ # the coordinates of the centroid of each cluster in the scaled features. But you can undo the 
# scaling to see the coordinates of each centroid in the original features."scaler.inverse_transform(scaled_features)" 
yhat

array([2, 5, 5, ..., 4, 4, 1], dtype=int32)

Adding the clusters to the data base:

In [7]:
data['performance_c'] = yhat

In [8]:
data['performance_c'].unique()

array([2, 5, 0, 4, 3, 1], dtype=int32)

Checking the clusters:

In [9]:
data.loc[:,['processor','ram_(mb)','internal_storage_(gb)','performance_c']]

Unnamed: 0,processor,ram_(mb),internal_storage_(gb),performance_c
0,8,12000,256.000,2
1,8,6000,64.000,5
2,6,4000,64.000,5
3,6,4000,64.000,5
4,8,6000,128.000,2
...,...,...,...,...
1354,4,512,8.000,1
1355,4,1000,8.000,1
1356,2,512,4.000,4
1357,1,256,0.512,4


In [10]:
data.groupby('performance_c').agg({'processor':'mean'}).sort_values(by="processor")

Unnamed: 0_level_0,processor
performance_c,Unnamed: 1_level_1
4,1.517241
1,4.0
3,4.024316
2,7.873016
5,7.952607
0,7.955272


In [11]:
data['performance_c'] = data['performance_c'].replace([4,1,3,2,5,0],[6,7,8,9,10,11])

In [12]:
data.groupby('performance_c').agg({'processor':'mean'}).sort_values(by="processor")

Unnamed: 0_level_0,processor
performance_c,Unnamed: 1_level_1
6,1.517241
7,4.0
8,4.024316
9,7.873016
10,7.952607
11,7.955272


In [13]:
data['performance_c'] = data['performance_c'].replace([6,7,8,9,10,11],[1,2,3,4,5,6])

In [14]:
data.groupby('performance_c').agg({'processor':'mean'}).sort_values(by="processor")

Unnamed: 0_level_0,processor
performance_c,Unnamed: 1_level_1
1,1.517241
2,4.0
3,4.024316
4,7.873016
5,7.952607
6,7.955272


Scaling the data that will be used in the second cluster. 
Second cluster goal: 
    Discriminate the phone models by cam performance.

In [15]:
data_cluster_cam_df = data[['rear_camera','front_camera']]

In [16]:
transformer = MinMaxScaler().fit(data_cluster_cam_df)
data_cluster_cam_scaled = transformer.transform(data_cluster_cam_df)

data_cluster_cam_scaled = pd.DataFrame(data_cluster_cam_scaled, columns=data_cluster_cam_df.columns)

Using the KMeans to cluster the chosen columns

In [17]:
model = KMeans(n_clusters= 7)
yhat = model.fit_predict(data_cluster_perf_scaled)
yhat

array([6, 2, 5, ..., 3, 3, 4], dtype=int32)

Adding the clusters to the data base:

In [18]:
data['cam_c'] = yhat

In [19]:
data['cam_c'].unique()

array([6, 2, 5, 1, 3, 0, 4], dtype=int32)

Checking the clusters:

In [20]:
data.loc[:,['rear_camera','front_camera', 'cam_c']]

Unnamed: 0,rear_camera,front_camera,cam_c
0,48.0,16.0,6
1,64.0,16.0,2
2,12.0,12.0,5
3,12.0,12.0,5
4,12.0,32.0,2
...,...,...,...
1354,5.0,0.3,4
1355,8.0,5.0,4
1356,5.0,2.0,3
1357,2.0,0.3,3


In [21]:
data.groupby('cam_c').agg({'rear_camera':'mean'}).sort_values(by="rear_camera")

Unnamed: 0_level_0,rear_camera
cam_c,Unnamed: 1_level_1
4,6.087921
3,6.737931
0,11.222796
1,13.471711
5,17.443158
2,23.566667
6,34.363636


In [22]:
data['cam_c'] = data['cam_c'].replace([4,3,0,1,5,2,6],[7,8,9,10,11,12,13])

In [23]:
data.groupby('cam_c').agg({'rear_camera':'mean'}).sort_values(by="rear_camera")

Unnamed: 0_level_0,rear_camera
cam_c,Unnamed: 1_level_1
7,6.087921
8,6.737931
9,11.222796
10,13.471711
11,17.443158
12,23.566667
13,34.363636


In [24]:
data['cam_c'] = data['cam_c'].replace([7,8,9,10,11,12,13],[1,2,3,4,5,6,7])

In [25]:
data.groupby('cam_c').agg({'rear_camera':'mean'}).sort_values(by="rear_camera")

Unnamed: 0_level_0,rear_camera
cam_c,Unnamed: 1_level_1
1,6.087921
2,6.737931
3,11.222796
4,13.471711
5,17.443158
6,23.566667
7,34.363636


Scaling the data that will be used in the third cluster. Third cluster goal: Discriminate the phone models by size.

In [26]:
data_cluster_size_df = data['screen_size_(inches)']

In [27]:
#Use standard scaler on columns
transformer = MinMaxScaler().fit(data_cluster_size_df.values.reshape(-1, 1))
data_cluster_cam_scaled = transformer.transform(data_cluster_size_df.values.reshape(-1, 1))

data_cluster_cam_scaled_df = pd.DataFrame(data_cluster_cam_scaled, columns=["scaled_values"])

Using the KMeans to cluster the chosen columns

In [28]:
model = KMeans(n_clusters= 8)
yhat = model.fit_predict(data_cluster_cam_scaled_df)
yhat

array([4, 4, 4, ..., 5, 2, 3], dtype=int32)

In [29]:
data['size_c'] = yhat

In [30]:
data['size_c'].unique()

array([4, 1, 7, 0, 5, 3, 6, 2], dtype=int32)

In [31]:
data.loc[:,['screen_size_(inches)','size_c']]

Unnamed: 0,screen_size_(inches),size_c
0,6.67,4
1,6.50,4
2,6.50,4
3,6.10,1
4,6.40,7
...,...,...
1354,4.00,2
1355,5.00,3
1356,4.50,5
1357,4.00,2


In [32]:
data.groupby('size_c').agg({'screen_size_(inches)':'mean'}).sort_values(by="screen_size_(inches)")

Unnamed: 0_level_0,screen_size_(inches)
size_c,Unnamed: 1_level_1
6,2.808182
2,3.968354
5,4.548359
3,5.032691
0,5.525199
1,5.976476
7,6.290219
4,6.609592


In [33]:
data['size_c'] = data['size_c'].replace([6,2,5,3,0,1,7,4],[8,9,10,11,12,13,14,15])

In [34]:
data.groupby('size_c').agg({'screen_size_(inches)':'mean'}).sort_values(by="screen_size_(inches)")

Unnamed: 0_level_0,screen_size_(inches)
size_c,Unnamed: 1_level_1
8,2.808182
9,3.968354
10,4.548359
11,5.032691
12,5.525199
13,5.976476
14,6.290219
15,6.609592


In [35]:
data['size_c'] = data['size_c'].replace([8,9,10,11,12,13,14,15],[1,2,3,4,5,6,7,8])

In [36]:
data.groupby('size_c').agg({'screen_size_(inches)':'mean'}).sort_values(by="screen_size_(inches)")

Unnamed: 0_level_0,screen_size_(inches)
size_c,Unnamed: 1_level_1
1,2.808182
2,3.968354
3,4.548359
4,5.032691
5,5.525199
6,5.976476
7,6.290219
8,6.609592


Scaling the data that will be used in the forth cluster. Forth cluster goal: Discriminate the phone models by batery length.

In [37]:
data_cluster_bat_df = data['battery_capacity_(mah)']

In [38]:
#Use standard scaler on columns
transformer = MinMaxScaler().fit(data_cluster_bat_df.values.reshape(-1, 1))
data_cluster_bat_scaled = transformer.transform(data_cluster_bat_df.values.reshape(-1, 1))

data_cluster_bat_scaled_df = pd.DataFrame(data_cluster_bat_scaled, columns=["scaled_values"])

Using the KMeans to cluster the chosen columns:

In [39]:
model = KMeans(n_clusters= 8)
yhat = model.fit_predict(data_cluster_bat_scaled_df)
yhat

array([1, 1, 1, ..., 3, 3, 7], dtype=int32)

In [40]:
data['battery_c'] = yhat

In [41]:
data['battery_c'].unique()

array([1, 2, 4, 5, 0, 6, 3, 7], dtype=int32)

In [42]:
data.loc[:,['battery_capacity_(mah)','battery_c']]

Unnamed: 0,battery_capacity_(mah),battery_c
0,4085,1
1,4000,1
2,3969,1
3,3110,2
4,4000,1
...,...,...
1354,1500,3
1355,2000,6
1356,1700,3
1357,1250,3


In [43]:
data.groupby('battery_c').agg({'battery_capacity_(mah)':'mean'}).sort_values(by="battery_capacity_(mah)")

Unnamed: 0_level_0,battery_capacity_(mah)
battery_c,Unnamed: 1_level_1
3,1473.113924
6,1965.291667
7,2289.343066
0,2563.733333
2,2992.441088
5,3420.076336
1,4009.904545
4,4938.108108


In [44]:
data['battery_c'] = data['battery_c'].replace([3,6,7,0,2,5,1,4],[8,9,10,11,12,13,14,15])

In [45]:
data.groupby('battery_c').agg({'battery_capacity_(mah)':'mean'}).sort_values(by="battery_capacity_(mah)")

Unnamed: 0_level_0,battery_capacity_(mah)
battery_c,Unnamed: 1_level_1
8,1473.113924
9,1965.291667
10,2289.343066
11,2563.733333
12,2992.441088
13,3420.076336
14,4009.904545
15,4938.108108


In [46]:
data['battery_c'] = data['battery_c'].replace([8,9,10,11,12,13,14,15],[1,2,3,4,5,6,7,8])

In [47]:
data.groupby('battery_c').agg({'battery_capacity_(mah)':'mean'}).sort_values(by="battery_capacity_(mah)")

Unnamed: 0_level_0,battery_capacity_(mah)
battery_c,Unnamed: 1_level_1
1,1473.113924
2,1965.291667
3,2289.343066
4,2563.733333
5,2992.441088
6,3420.076336
7,4009.904545
8,4938.108108


In [48]:
data.to_csv('../Data_bases/data_clstered.csv', index=False)