# Import essential modules


In [1]:
import pandas as pd #import pandas library for data analysis
import numpy as np #import numpy library for numerical operations
import matplotlib.pyplot as plt #import matplotlib library for data visualizations

# Import dataset file

In [2]:
# Load the exercise table from disk into a DataFrame.
# NOTE(review): the final cell of this notebook writes back to this same path, so on a
# re-run the file read here already contains the previously saved 'Cluster' column.
data_frame = pd.read_csv(filepath_or_buffer="exercise_csvfile.csv")
data_frame.head(n=5)  # preview the first five rows

Unnamed: 0,Activity (1H),Calories per kg,Cluster
0,"Cycling, mountain bike, bmx",1.75073,3
1,"Cycling, <10 mph, leisure bicycling",0.823236,6
2,"Cycling, >20 mph, racing",3.294974,2
3,"Cycling, 10-11.9 mph, light",1.234853,9
4,"Cycling, 12-13.9 mph, moderate",1.647825,3


In [3]:
# Show the column labels of the loaded DataFrame.
data_frame.columns

Index(['Activity (1H)', 'Calories per kg', 'Cluster'], dtype='object')

# Data preprocessing

In [4]:
# Count missing values per column (isna is the pandas alias of isnull).
data_frame.isna().sum()

Activity (1H)      0
Calories per kg    0
Cluster            0
dtype: int64

In [5]:
# Show a basic statistical summary (count, mean, std, min, quartiles, max)
# of each numeric column.
data_frame.describe()

Unnamed: 0,Calories per kg,Cluster
count,248.0,248.0
mean,1.35993,4.229839
std,0.678851,2.760258
min,0.310067,0.0
25%,0.823236,2.0
50%,1.234853,4.0
75%,1.647825,6.0
max,3.706591,9.0


In [6]:
# Make sure 'Calories per kg' is stored as a numeric dtype.
# NOTE(review): describe() above already reports statistics for this column, so it
# appears to be numeric on load; the conversion is kept as a safeguard.
data_frame['Calories per kg'] = data_frame['Calories per kg'].pipe(pd.to_numeric)
data_frame.dtypes  # confirm the dtype of every column

Activity (1H)       object
Calories per kg    float64
Cluster              int64
dtype: object

# Normalize the data for better performance

In [7]:
from sklearn.preprocessing import MinMaxScaler #Scale the 'Calories per kg' column using MinMaxScaler from sklearn.preprocessing

In [8]:
# Rescale the selected feature column(s) with min-max scaling.
features_to_scale = ['Calories per kg']
scaler = MinMaxScaler()
scaler.fit(data_frame[features_to_scale])
# Despite its name, the result is a 2-D NumPy array (one column per scaled feature),
# not a DataFrame.
normalized_data_frame = scaler.transform(data_frame[features_to_scale])

In [9]:
# Display the scaled values (a NumPy array with one row per activity).
normalized_data_frame

array([[0.42415786],
       [0.15108631],
       [0.87881204],
       [0.27227427],
       [0.39386087],
       [0.51504883],
       [0.6362368 ],
       [0.21187961],
       [0.09049233],
       [0.2421766 ],
       [0.33306757],
       [0.54534582],
       [0.66673311],
       [0.39386087],
       [0.12098864],
       [0.39386087],
       [0.27227427],
       [0.09049233],
       [0.2421766 ],
       [0.45425553],
       [0.12098864],
       [0.33306757],
       [0.42415786],
       [0.6362368 ],
       [0.33306757],
       [0.21187961],
       [0.33306757],
       [0.42415786],
       [0.30277058],
       [0.27227427],
       [0.15108631],
       [0.06039466],
       [0.27227427],
       [0.15108631],
       [0.18178194],
       [0.09049233],
       [0.2421766 ],
       [0.39386087],
       [0.45425553],
       [0.51504883],
       [0.57564281],
       [0.6059398 ],
       [0.66673311],
       [0.72712777],
       [0.75742476],
       [0.81821806],
       [0.87881204],
       [1.   

# Apply data clustering

In [10]:
from sklearn.cluster import KMeans #apply KMeans clustering algorithm on the normalized DataFrame with 10 clusters

In [11]:
# Fit k-means with 10 clusters on the scaled 'Calories per kg' values.
# random_state fixes the centroid initialisation. n_init is passed explicitly because
# scikit-learn changed its default from 10 to 'auto' in version 1.4; pinning it keeps
# the clustering independent of the installed version.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0).fit(normalized_data_frame)
labels = kmeans.labels_  # raw cluster id assigned to each row, in row order
# One row per cluster centre, expressed in the scaled feature space; the row index
# is the raw cluster id.
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=features_to_scale)

In [12]:
# Store the k-means labels in the 'Cluster' column of the original DataFrame.
# The column already exists in the loaded file (see the column listing above),
# so this overwrites it.
data_frame['Cluster'] = labels

In [13]:
# Display the updated DataFrame (pandas truncates the middle rows in the rendering).
data_frame

Unnamed: 0,Activity (1H),Calories per kg,Cluster
0,"Cycling, mountain bike, bmx",1.750730,7
1,"Cycling, <10 mph, leisure bicycling",0.823236,0
2,"Cycling, >20 mph, racing",3.294974,3
3,"Cycling, 10-11.9 mph, light",1.234853,5
4,"Cycling, 12-13.9 mph, moderate",1.647825,7
...,...,...,...
243,General cleaning,0.721008,0
244,"Cleaning, dusting",0.515199,6
245,Taking out trash,0.617427,6
246,"Walking, pushing a wheelchair",0.823236,0


# Display the number of instances in each cluster

In [14]:
# Count the rows in each cluster and order the result by cluster label.
# At this point the labels are the raw k-means ids, so this ordering does not
# reflect 'Calories per kg'.
cluster_counts = (
    data_frame['Cluster']
    .value_counts()
    .sort_index()
)
cluster_counts

0    42
1    21
2    27
3     8
4    14
5    26
6    40
7    44
8     3
9    23
Name: Cluster, dtype: int64

In [15]:
# Exercises are taken from each cluster when delivering output based on sugar levels.

# Sort the cluster data based on the centroids

In [16]:
# Raw cluster ids listed from the lowest to the highest 'Calories per kg' centroid;
# used below to reorder the cluster labels and the centroids.
cluster_order = (
    centroids
    .sort_values('Calories per kg')
    .index
    .values
)

In [17]:
# Replace each raw k-means label with its position in cluster_order, so that cluster 0
# has the lowest 'Calories per kg' centroid and the largest label has the highest.
# cluster_order is a permutation of the raw labels, so argsort gives its inverse
# (raw label -> position) and a single indexing step relabels every row.
# Reading from kmeans.labels_ instead of the reassigned `labels` keeps this cell
# idempotent: re-running it does not remap labels that were already remapped.
labels = np.argsort(cluster_order)[kmeans.labels_]

In [18]:
# Put the centroid rows in ascending 'Calories per kg' order.
# .loc selects by label, so the index still shows the raw k-means ids,
# not the reordered cluster numbers.
centroids = centroids.loc[cluster_order]

In [19]:
# Overwrite the 'Cluster' column with the reordered cluster labels.
data_frame['Cluster'] = labels

In [20]:
# Recount the rows per cluster, ordered by the reordered cluster label.
cluster_counts = (
    data_frame['Cluster']
    .value_counts()
    .sort_index()
)

In [21]:
# Print the number of rows in each reordered cluster.
print(cluster_counts)

0    40
1    42
2    23
3    26
4    27
5    44
6    21
7    14
8     3
9     8
Name: Cluster, dtype: int64


# Save the data frame into a CSV file

In [22]:
# Save the updated DataFrame to the CSV file without the index column.
# NOTE(review): this is the same path the notebook reads at the top, so the input
# file is overwritten and later runs start from the already-clustered data.
data_frame.to_csv(path_or_buf='exercise_csvfile.csv', index=False)