Import the libraries

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt   # import required libraries
import plotly.express as px
import  plotly.graph_objects as go

Load the data

In [None]:
df = pd.read_csv("dailyActivity_merged.csv")

In [None]:
df.head()

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960366,4/12/2016,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,25,13,328,728,1985
1,1503960366,4/13/2016,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,21,19,217,776,1797
2,1503960366,4/14/2016,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,30,11,181,1218,1776
3,1503960366,4/15/2016,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,29,34,209,726,1745
4,1503960366,4/16/2016,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,36,10,221,773,1863


**Perform Analysis**

Check for the Null values

In [None]:
Columns_with_null = df.isnull().sum()
print(Columns_with_null)

Id                          0
ActivityDate                0
TotalSteps                  0
TotalDistance               0
TrackerDistance             0
LoggedActivitiesDistance    0
VeryActiveDistance          0
ModeratelyActiveDistance    0
LightActiveDistance         0
SedentaryActiveDistance     0
VeryActiveMinutes           0
FairlyActiveMinutes         0
LightlyActiveMinutes        0
SedentaryMinutes            0
Calories                    0
dtype: int64


See the info about the dataset, data types, Number of entries, No. of columns etc.

We see above that we have no null values , so we won't have to drop them in our case.

We can check the no of records, Datatypes, Number of columns using `df.info()`.

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Id                        940 non-null    int64  
 1   ActivityDate              940 non-null    object 
 2   TotalSteps                940 non-null    int64  
 3   TotalDistance             940 non-null    float64
 4   TrackerDistance           940 non-null    float64
 5   LoggedActivitiesDistance  940 non-null    float64
 6   VeryActiveDistance        940 non-null    float64
 7   ModeratelyActiveDistance  940 non-null    float64
 8   LightActiveDistance       940 non-null    float64
 9   SedentaryActiveDistance   940 non-null    float64
 10  VeryActiveMinutes         940 non-null    int64  
 11  FairlyActiveMinutes       940 non-null    int64  
 12  LightlyActiveMinutes      940 non-null    int64  
 13  SedentaryMinutes          940 non-null    int64  
 14  Calories  

# **Data Pre-processing**

Change the datatype of a column

In [None]:
df["TotalDistance"] = df["TotalDistance"].astype('int64')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Id                        940 non-null    int64  
 1   ActivityDate              940 non-null    object 
 2   TotalSteps                940 non-null    int64  
 3   TotalDistance             940 non-null    int64  
 4   TrackerDistance           940 non-null    float64
 5   LoggedActivitiesDistance  940 non-null    float64
 6   VeryActiveDistance        940 non-null    float64
 7   ModeratelyActiveDistance  940 non-null    float64
 8   LightActiveDistance       940 non-null    float64
 9   SedentaryActiveDistance   940 non-null    float64
 10  VeryActiveMinutes         940 non-null    int64  
 11  FairlyActiveMinutes       940 non-null    int64  
 12  LightlyActiveMinutes      940 non-null    int64  
 13  SedentaryMinutes          940 non-null    int64  
 14  Calories  

Sum all activity minutes in a single column then print the info to see the changes.

In [None]:
df["Total_Minutes"] = df["VeryActiveMinutes"] + df['FairlyActiveMinutes'] + df["LightlyActiveMinutes"] + df["SedentaryMinutes"]
print(df)
print(df.info())

Statistics of the dataset

In [None]:
df.describe()

Unnamed: 0,Id,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories,Total_Minutes
count,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0
mean,4855407000.0,7637.910638,5.058511,5.475351,0.108171,1.502681,0.567543,3.340819,0.001606,21.164894,13.564894,192.812766,991.210638,2303.609574,1218.753191
std,2424805000.0,5087.150742,3.88188,3.907276,0.619897,2.658941,0.88358,2.040655,0.007346,32.844803,19.987404,109.1747,301.267437,718.166862,265.931767
min,1503960000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
25%,2320127000.0,3789.75,2.0,2.62,0.0,0.0,0.0,1.945,0.0,0.0,0.0,127.0,729.75,1828.5,989.75
50%,4445115000.0,7405.5,5.0,5.245,0.0,0.21,0.24,3.365,0.0,4.0,6.0,199.0,1057.5,2134.0,1440.0
75%,6962181000.0,10727.0,7.0,7.71,0.0,2.0525,0.8,4.7825,0.0,32.0,19.0,264.0,1229.5,2793.25,1440.0
max,8877689000.0,36019.0,28.0,28.030001,4.942142,21.92,6.48,10.71,0.11,210.0,143.0,518.0,1440.0,4900.0,1440.0


Total Mintutes to Total hours

In [None]:
df["Total_Hours"] = df["Total_Minutes"]/60
print(df.head())

           Id ActivityDate  TotalSteps  TotalDistance  TrackerDistance  \
0  1503960366    4/12/2016       13162              8             8.50   
1  1503960366    4/13/2016       10735              6             6.97   
2  1503960366    4/14/2016       10460              6             6.74   
3  1503960366    4/15/2016        9762              6             6.28   
4  1503960366    4/16/2016       12669              8             8.16   

   LoggedActivitiesDistance  VeryActiveDistance  ModeratelyActiveDistance  \
0                       0.0                1.88                      0.55   
1                       0.0                1.57                      0.69   
2                       0.0                2.44                      0.40   
3                       0.0                2.14                      1.26   
4                       0.0                2.71                      0.41   

   LightActiveDistance  SedentaryActiveDistance  VeryActiveMinutes  \
0                 6.06

## Analysis and visualization







Create a pie chart to check distribution of active minutes.

In [None]:
labels = ['Very Active Minutes', 'Fairly Active Minutes', 'Lightly Active Minutes', 'Inactive Minutes']
counts = df[['VeryActiveMinutes', 'FairlyActiveMinutes', 'LightlyActiveMinutes', 'SedentaryMinutes']].max()
colors = ['red','green', "pink", "blue"]

fig = go.Figure(data=[go.Pie(labels=labels, values=counts)])
fig.update_layout(width = 500, height = 400,
    paper_bgcolor="white", autosize=False, showlegend=True)
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=15,
                  marker=dict(colors=colors, line=dict(color='black', width=1))
                 )
fig.show()

We can observe from the above chart that:
User spends 62.3% inactive minutes.
6.19% fairly active minutes.
9.09% of highly active minutes.
22.4% of lightly active minutes.


Add the day of the week in the data set so that we can analyze the activity each day.

In [None]:
df["Day"] = df["ActivityDate"].dt.day_name()
print(df.head)

             Id ActivityDate  TotalSteps  TotalDistance  TrackerDistance  \
0    1503960366   2016-04-12       13162              8         8.500000   
1    1503960366   2016-04-13       10735              6         6.970000   
2    1503960366   2016-04-14       10460              6         6.740000   
3    1503960366   2016-04-15        9762              6         6.280000   
4    1503960366   2016-04-16       12669              8         8.160000   
..          ...          ...         ...            ...              ...   
935  8877689391   2016-05-08       10686              8         8.110000   
936  8877689391   2016-05-09       20226             18        18.250000   
937  8877689391   2016-05-10       10733              8         8.150000   
938  8877689391   2016-05-11       21420             19        19.559999   
939  8877689391   2016-05-12        8064              6         6.120000   

     LoggedActivitiesDistance  VeryActiveDistance  ModeratelyActiveDistance  \
0       

See the days of the week with highly active minutes and fairly active minutes.

In [None]:
fig = go.Figure()

fig.add_trace(go.Bar(x= df['Day'],
                     y= df['VeryActiveMinutes'],
                     name= 'Very Active',
                     marker_color = 'red'
                    ))

fig.add_trace(go.Bar(x= df['Day'],
                     y= df['FairlyActiveMinutes'],
                     name= 'Fairly Active',
                     marker_color = 'blue'
                    ))


fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.show()


We can observe from the plot above that the user is spends his highly active minutes on Tuesdays.

Check for the number of steps covered in each day.

In [None]:
day = df["Day"].value_counts()
label = day.index
counts = df["TotalSteps"]
colors = ['gold','lightgreen', "pink", "blue", "skyblue", "cyan", "orange"]

fig = go.Figure(data=[go.Pie(labels=label, values=counts)])
fig.update_layout(width = 500, height = 400, paper_bgcolor="white", autosize=False, showlegend=True)
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=15,
                  marker=dict(colors=colors, line=dict(color='black', width=1)))
fig.show()

We can observe that the user covers maximum number of steps on Tuesday.

Calories burnt in a day of a week

In [None]:
calorie_count = df["Day"].value_counts()
label = calorie_count.index
counts = calorie_count.values
colors = ['blue', 'green', 'pink', 'purple', 'skyblue', 'orange', 'brown']
fig = go.Figure(data=[go.Bar(x=label, y=counts, marker_color=colors)])
fig.update_layout(width = 500, height = 400, paper_bgcolor="white", autosize=False, showlegend=True, title = "Calorie count per day", xaxis_title='Day', yaxis_title='Calories')
fig.show()


Distance covered each day

In [None]:

distance_covered = df["Day"].value_counts()


labels = distance_covered.index
counts = df["TotalDistance"]
color = ['blue', 'green', 'pink', 'purple', 'skyblue', 'orange', 'brown']
fig = go.Figure(data=[go.Pie(labels=labels, values=counts, marker_colors= color)])
fig.update_layout(width = 500, height = 400, paper_bgcolor="white", autosize=False, showlegend=True, title ='Distance covered each day')
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=15,
                  marker=dict(line=dict(color='black', width=1)))


fig.show()

We can see from the above plot that the user covers maximum distance on Tuesday, Monday and Saturday.