In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Directory (relative to the kernel's working directory) that holds the three
# hourly CSV exports. Kept in one place so relocating the data is a one-line change.
DATA_DIR = "OneDrive/Desktop/Capstone_week9"

# Read each hourly table into its own raw frame; no parsing or cleaning happens here.
calories_df = pd.read_csv(f"{DATA_DIR}/hourlyCalories_merged.csv")
steps_df = pd.read_csv(f"{DATA_DIR}/hourlySteps_merged.csv")
intensities_df = pd.read_csv(f"{DATA_DIR}/hourlyIntensities_merged.csv")

In [None]:
# Join the three hourly tables on explicit keys rather than on pd.merge's
# default of "every column name the two frames happen to share", which would
# silently change the join if any file gained another overlapping column.
# NOTE(review): assumes every CSV carries both 'Id' and 'ActivityHour' -- confirm against the files.
MERGE_KEYS = ['Id', 'ActivityHour']

# Inner joins (the pd.merge default) keep only the Id/ActivityHour pairs present in all three tables.
hourly_df = (
    calories_df
    .merge(steps_df, on=MERGE_KEYS, how='inner')
    .merge(intensities_df, on=MERGE_KEYS, how='inner')
)

In [None]:
# Print a structural summary of the merged frame (columns, dtypes, non-null counts)
hourly_df.info()

In [None]:
# Keep only the rows that exactly repeat an earlier row of the merged frame
duplicate_rows = hourly_df[hourly_df.duplicated()]

# Heading first, then the repeated rows (an empty frame means none were found)
print("Duplicate Rows:", duplicate_rows, sep="\n")

In [None]:
# Show the first five rows of the merged frame (head() defaults to n=5)
hourly_df.head()

In [None]:
# Convert ActivityHour to pandas datetimes so the .dt accessor can be used on it
hourly_df['ActivityHour'] = pd.to_datetime(hourly_df.ActivityHour)

In [None]:
# Weekday name of each timestamp.
# Mind the lowercase "of": a later cell adds a separate, numeric 'DayOfWeek' column.
hourly_df['DayofWeek'] = hourly_df['ActivityHour'].dt.day_name()

In [None]:
# Show the first five rows again to check the parsed ActivityHour and the new DayofWeek column
hourly_df.head()

In [None]:
# Hour-of-day component of every timestamp
hourly_df['Hour'] = hourly_df.ActivityHour.dt.hour

# Mean step total and mean calories for each hour of the day, indexed by Hour
hourly_summary = hourly_df.groupby('Hour')[['StepTotal', 'Calories']].mean()
hourly_summary

In [None]:
# Hourly averages of steps and calories drawn as two lines on one shared axis
fig, ax = plt.subplots(figsize=(10, 6))
for column, label, color in (
    ('StepTotal', 'Step Count', 'blue'),
    ('Calories', 'Calories', 'orange'),
):
    ax.plot(hourly_summary.index, hourly_summary[column], label=label, color=color, marker='o')

ax.set_title('Hourly Step Counts and Calorie Expenditure')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Average Value')
ax.set_xticks(hourly_summary.index)  # one tick for every hour present in the summary
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Mean TotalIntensity for each hour of the day
hourly_intensity_summary = hourly_df.groupby('Hour')['TotalIntensity'].agg('mean')

# Line chart of the hourly averages
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(hourly_intensity_summary.index, hourly_intensity_summary.values, color='blue', marker='o')
ax.set_title('Hourly Activity Intensity')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Average Intensity')
ax.set_xticks(hourly_intensity_summary.index)
ax.grid(True)
fig.tight_layout()
plt.show()

# Report the hour whose average intensity is highest, together with that average
peak_intensity = hourly_intensity_summary.max()
peak_hours = hourly_intensity_summary.idxmax()
print(f"The peak hour for activity intensity is {peak_hours} with an average intensity of {peak_intensity:.2f}.")

In [None]:
# Numeric weekday of each timestamp, counted from Monday = 0 through Sunday = 6
hourly_df['DayOfWeek'] = hourly_df.ActivityHour.dt.dayofweek

# Map a numeric weekday onto a 'Weekday' / 'Weekend' label
def classify_day(day):
    """Return 'Weekend' when ``day`` is 5 or greater, otherwise 'Weekday'.

    ``day`` is a weekday number where Monday is 0 and Sunday is 6, so 5 and 6
    (Saturday and Sunday) are the weekend.
    """
    if day >= 5:
        return 'Weekend'
    return 'Weekday'

# Tag every row as 'Weekday' or 'Weekend' based on its numeric DayOfWeek
hourly_df['DayType'] = hourly_df['DayOfWeek'].apply(classify_day)

# Mean step total and calories for every (day type, hour) pair, as flat columns.
# NOTE: this rebinds `hourly_summary`, which an earlier cell defined as a frame
# indexed by Hour; re-running that earlier plotting cell after this one would
# pick up this differently shaped frame.
hourly_summary = (
    hourly_df
    .groupby(['DayType', 'Hour'])
    .agg({'StepTotal': 'mean', 'Calories': 'mean'})
    .reset_index()
)

# Average hourly step counts, one line per day type
fig, ax = plt.subplots(figsize=(10, 6))
for day_type, color in (('Weekday', 'blue'), ('Weekend', 'orange')):
    # Rows of the summary belonging to this day type only
    day_rows = hourly_summary[hourly_summary['DayType'] == day_type]
    ax.plot(day_rows['Hour'], day_rows['StepTotal'], label=day_type, color=color, marker='o')

ax.set_title('Hourly Step Counts on Weekdays vs Weekends')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Average Step Count')
# Each hour can appear once per day type in the summary, so de-duplicate the
# hours before using them as tick positions.
ax.set_xticks(sorted(hourly_summary['Hour'].unique()))
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Numeric measures whose pairwise correlations are compared
selected_columns = ['Calories', 'StepTotal', 'TotalIntensity', 'AverageIntensity']
subset_df = hourly_df.loc[:, selected_columns]
correlation_matrix = subset_df.corr()

# Annotated heatmap of the correlation matrix, values shown to two decimals
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', linewidths=.5, ax=ax)
ax.set_title('Correlation Heatmap of Numeric Variables')
plt.show()

In [None]:
# One row per Id: summed steps and calories plus the mean of both intensity
# columns, with the output column names given directly by named aggregation
user_summary = hourly_df.groupby('Id').agg(
    TotalSteps=('StepTotal', 'sum'),
    TotalCalories=('Calories', 'sum'),
    AverageTotalIntensity=('TotalIntensity', 'mean'),
    AverageActivityIntensity=('AverageIntensity', 'mean'),
)

# Ten Ids with the largest step totals, highest first
user_summary.sort_values(by='TotalSteps', ascending=False).head(10)

In [None]:
# Move ActivityHour into the index so the frame can be resampled by calendar day.
# Guarded so the cell can be re-run: after the first run the column is gone and
# an unconditional set_index would raise KeyError.
if 'ActivityHour' in hourly_df.columns:
    hourly_df.set_index('ActivityHour', inplace=True)

# Daily totals: every row falling on the same calendar day is summed together
daily_step_counts = hourly_df['StepTotal'].resample('D').sum()
daily_calorie_expenditure = hourly_df['Calories'].resample('D').sum()

# Both daily series drawn against the date on one shared axis
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(daily_step_counts.index, daily_step_counts, label='Daily Step Counts', color='blue')
ax.plot(daily_calorie_expenditure.index, daily_calorie_expenditure, label='Daily Calorie Expenditure', color='orange')
ax.set_title('Time Series of Daily Step Counts and Calorie Expenditure')
ax.set_xlabel('Date')
ax.set_ylabel('StepCount / Calories')
ax.legend()
ax.grid(True)
plt.show()