In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the daily-activity CSV into a DataFrame.
# NOTE(review): the path is relative, so it resolves against the kernel's
# working directory -- confirm the notebook is started from the right folder.
activity_csv_path = "OneDrive/Desktop/Capstone_week9/dailyActivity_merged.csv"
daily_df = pd.read_csv(activity_csv_path)

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
daily_df.info()

In [None]:
# Count the missing values in each column.
daily_df.isnull().sum()

In [None]:
# Boolean mask marking every row that exactly repeats an earlier row.
is_duplicate = daily_df.duplicated()

# Keep only the flagged rows and print them under a heading.
duplicate_rows = daily_df[is_duplicate]
print("Duplicate Rows:", duplicate_rows, sep="\n")

In [None]:
# Preview the first five rows (head() defaults to n=5).
daily_df.head()

In [None]:
# Preview the last five rows (tail() defaults to n=5).
daily_df.tail()

In [None]:
# Parse the ActivityDate column into datetime values and store it back.
# NOTE(review): no explicit format is passed, so pandas infers it -- confirm
# the inferred month/day order matches the CSV.
parsed_activity_dates = pd.to_datetime(daily_df['ActivityDate'])
daily_df['ActivityDate'] = parsed_activity_dates
daily_df['ActivityDate']

In [None]:
# Number of rows recorded for each user Id, largest count first.
rows_per_user = daily_df.groupby('Id').size()
Users_activity_count = rows_per_user.sort_values(ascending=False)
Users_activity_count

In [None]:
# Bar chart of the per-user record counts, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
Users_activity_count.plot(kind='bar', ax=ax)
ax.set(
    title='Total Number of Activity by User',
    xlabel='User ID',
    ylabel='Frequency',
)
# Slant the tick labels on the x axis.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(True)
plt.show()

In [None]:
# Histogram of the TotalSteps column using 20 bins.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(daily_df['TotalSteps'], bins=20, color='skyblue', edgecolor='black')
ax.set(
    title='Distribution of Total Steps Taken by Users',
    xlabel='Total Steps',
    ylabel='Frequency',
)
ax.grid(True)
plt.show()

In [None]:
# Histogram of the TotalDistance column using 20 bins.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(daily_df['TotalDistance'], bins=20, color='skyblue', edgecolor='black')
ax.set(
    title='Distribution of Total Distance Cover by Users',
    xlabel='Total Distance',
    ylabel='Frequency',
)
ax.grid(True)
plt.show()

In [None]:
# Pairwise correlation matrix of the activity columns; Id and ActivityDate
# are excluded before correlating.
# Only numeric columns are handed to corr(): a later cell adds the string
# column 'DayType' to daily_df in place, and re-running this cell after that
# would otherwise make corr() fail on pandas versions that no longer skip
# non-numeric columns silently.
correlation_df = (
    daily_df
    .drop(columns=['Id', 'ActivityDate'])
    .select_dtypes(include='number')
    .corr()
)

# Plot heatmap: annotated cells, colour scale pinned to the full [-1, 1] range
# so colours are comparable between runs.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_df, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Heatmap of Daily Activity Variables')
plt.show()

In [None]:
# Per-user mean of the three active-minute columns.
active_minute_columns = ['VeryActiveMinutes', 'FairlyActiveMinutes', 'LightlyActiveMinutes']
average_active_minutes = daily_df.groupby('Id')[active_minute_columns].mean()

# Show the resulting table (one row per user Id).
average_active_minutes

In [None]:
# One violin per user Id showing that user's SedentaryMinutes values.
fig, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(x='Id', y='SedentaryMinutes', data=daily_df, ax=ax)
ax.set(
    title='Comparison of Sedentary Minutes Across Users',
    xlabel='User ID',
    ylabel='Sedentary Minutes',
)
# Slant the tick labels on the x axis.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(True)
plt.show()

In [None]:
# NOTE(review): these repeat the imports in the notebook's first cell; they
# are kept so this cell still runs on its own.
import matplotlib.pyplot as plt
import seaborn as sns

# Sum of VeryActiveMinutes for each user Id.
very_active_by_user = daily_df.groupby('Id')['VeryActiveMinutes']
total_very_active_minutes = very_active_by_user.sum()

# Bar chart of the per-user totals.
fig, ax = plt.subplots(figsize=(10, 6))
total_very_active_minutes.plot(kind='bar', ax=ax)
ax.set(
    title='Total Very Active Minutes by User',
    xlabel='User ID',
    ylabel='Total Very Active Minutes',
)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(True)
plt.show()


In [None]:
# Sum of Calories over all users for each ActivityDate.
calories_by_date = daily_df.groupby('ActivityDate')['Calories']
CaloriesBurned_perday = calories_by_date.sum()
CaloriesBurned_perday

In [None]:
# Line chart of the per-date calorie totals, one marker per date.
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(CaloriesBurned_perday.index, CaloriesBurned_perday.values, marker='o', linestyle='-')
ax.set(
    title='Total Calories Burned Per Day',
    xlabel='Date',
    ylabel='Total Calories Burned',
)
# Put a tick on every date in the index, with slanted, right-aligned labels.
ax.set_xticks(CaloriesBurned_perday.index)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True)
plt.show()

In [None]:
# Day-of-week number for each record, then a Weekday/Weekend label:
# values of 5 and above are labelled 'Weekend', everything else 'Weekday'.
daily_df['DayOfWeek'] = daily_df['ActivityDate'].dt.dayofweek
is_weekend = daily_df['DayOfWeek'].ge(5)
daily_df['DayType'] = is_weekend.map({True: 'Weekend', False: 'Weekday'})

# Box plot comparing TotalSteps between the two day types.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='DayType', y='TotalSteps', data=daily_df, ax=ax)
ax.set(
    title='Distribution of Total Steps between Weekdays and Weekends',
    xlabel='Day Type',
    ylabel='Total Steps',
)
ax.grid(True)
plt.show()

In [None]:
# NOTE(review): repeats the pyplot import from the notebook's first cell;
# kept so this cell still runs on its own.
import matplotlib.pyplot as plt

# Sum each distance column over all rows.
total_very_active_distance = daily_df['VeryActiveDistance'].sum()
total_moderately_active_distance = daily_df['ModeratelyActiveDistance'].sum()
total_light_active_distance = daily_df['LightActiveDistance'].sum()
total_sedentary_distance = daily_df['SedentaryActiveDistance'].sum()

# Pie-chart inputs; the four sequences are aligned by position.
labels = ['Very Active', 'Moderately Active', 'Lightly Active', 'Sedentary']
sizes = [
    total_very_active_distance,
    total_moderately_active_distance,
    total_light_active_distance,
    total_sedentary_distance,
]
colors = ['gold', 'lightcoral', 'lightskyblue', 'turquoise']
explode = (0.1, 0, 0, 0)  # pull only the first slice (Very Active) outwards

fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
ax.set_title('Distribution of Activity Distance by Type')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()


In [None]:
# Five user Ids with the largest summed Calories.
calories_per_user = daily_df.groupby('Id')['Calories'].sum()
top_users_calories = calories_per_user.nlargest(5)

# Bar chart; pandas returns the Axes it drew on, which is then decorated.
ax = top_users_calories.plot(kind='bar', figsize=(10, 6), color='steelblue', edgecolor='black')
ax.set(
    title='Top 5 Users by Calorie Burned',
    xlabel='User ID',
    ylabel='Total Calories Burned',
)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(True)
plt.show()


In [None]:
# Five user Ids with the largest summed TotalSteps.
steps_per_user = daily_df.groupby('Id')['TotalSteps'].sum()
top_users_stepstaken = steps_per_user.nlargest(5)

# Bar chart; pandas returns the Axes it drew on, which is then decorated.
ax = top_users_stepstaken.plot(kind='bar', figsize=(10, 6), color='darkturquoise', edgecolor='black')
ax.set(
    title='Top 5 Users by Steps Taken',
    xlabel='User ID',
    ylabel='Total Steps',
)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(True)
plt.show()

In [None]:
# Correlation between the VeryActiveMinutes and Calories columns
# (Series.corr with its default method).
correlation = daily_df['VeryActiveMinutes'].corr(daily_df['Calories'])

# Scatter plot of the same two columns. The title and x-axis label name the
# column actually plotted (VeryActiveMinutes) so the chart is not read as
# covering all active minutes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='VeryActiveMinutes', y='Calories', data=daily_df, ax=ax)
ax.set_title('Relationship between Very Active Minutes and Calories Burned')
ax.set_xlabel('Very Active Minutes')
ax.set_ylabel('Calories Burned')
ax.grid(True)
plt.show()

print('The Correlation coefficient is:', correlation)