# Objective: Data Merging (Hourly Data)

### Import Required Libraries ->

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

### Load the Dataset ->

In [3]:

# Read the three hourly CSV exports (steps, calories, intensities) in one pass;
# the generator yields one DataFrame per file, unpacked in the same order.
hourly_steps, hourly_calories, hourly_intensities = (
    pd.read_csv(csv_name)
    for csv_name in (
        'hourlySteps_merged.csv',
        'hourlyCalories_merged.csv',
        'hourlyIntensities_merged.csv',
    )
)

### Preview The Dataset ->

In [4]:
# Show the leading rows and the column names of every raw table
dataset_names = ['hourly_steps', 'hourly_calories', 'hourly_intensities']
datasets = [hourly_steps, hourly_calories, hourly_intensities]

for name, dataset in zip(dataset_names, datasets):
    # A single print emits, one per line: the header, head(), the column list,
    # and finally the "\n" spacer that separates consecutive tables.
    print(f"{name.upper()}:", dataset.head(), dataset.columns.tolist(), "\n", sep="\n")

HOURLY_STEPS:
           Id           ActivityHour  StepTotal
0  1503960366  4/12/2016 12:00:00 AM        373
1  1503960366   4/12/2016 1:00:00 AM        160
2  1503960366   4/12/2016 2:00:00 AM        151
3  1503960366   4/12/2016 3:00:00 AM          0
4  1503960366   4/12/2016 4:00:00 AM          0
['Id', 'ActivityHour', 'StepTotal']


HOURLY_CALORIES:
           Id           ActivityHour  Calories
0  1503960366  4/12/2016 12:00:00 AM        81
1  1503960366   4/12/2016 1:00:00 AM        61
2  1503960366   4/12/2016 2:00:00 AM        59
3  1503960366   4/12/2016 3:00:00 AM        47
4  1503960366   4/12/2016 4:00:00 AM        48
['Id', 'ActivityHour', 'Calories']


HOURLY_INTENSITIES:
           Id           ActivityHour  TotalIntensity  AverageIntensity
0  1503960366  4/12/2016 12:00:00 AM              20          0.333333
1  1503960366   4/12/2016 1:00:00 AM               8          0.133333
2  1503960366   4/12/2016 2:00:00 AM               7          0.116667
3  1503960366   4/12

In [5]:
# Confirm the (rows, columns) of each table before merging, one shape per line
print(hourly_steps.shape, hourly_calories.shape, hourly_intensities.shape, sep="\n")

(22099, 3)
(22099, 3)
(22099, 4)


### Merging of Dataset ->

In [6]:
# Combine the three hourly tables on their shared (Id, ActivityHour) key,
# starting from hourly_steps. Outer joins keep any row that appears in only
# some of the tables.
hourly_merged_data = (
    hourly_steps
    .merge(hourly_calories, how='outer', on=['Id', 'ActivityHour'], suffixes=('', '_calories'))
    .merge(hourly_intensities, how='outer', on=['Id', 'ActivityHour'], suffixes=('', '_intensities'))
)

# Count rows that repeat an (Id, ActivityHour) pair already seen earlier in the frame
duplicates = hourly_merged_data.duplicated(subset=['Id', 'ActivityHour'], keep='first').sum()

# Summarise the merged frame: its dimensions plus the duplicate count
merged_overview = dict(zip(
    ['Number of rows', 'Number of columns', 'Number of duplicates'],
    [*hourly_merged_data.shape, duplicates],
))

print(merged_overview)
hourly_merged_data.head()

{'Number of rows': 22099, 'Number of columns': 6, 'Number of duplicates': 0}


Unnamed: 0,Id,ActivityHour,StepTotal,Calories,TotalIntensity,AverageIntensity
0,1503960366,4/12/2016 12:00:00 AM,373,81,20,0.333333
1,1503960366,4/12/2016 1:00:00 AM,160,61,8,0.133333
2,1503960366,4/12/2016 2:00:00 AM,151,59,7,0.116667
3,1503960366,4/12/2016 3:00:00 AM,0,47,0,0.0
4,1503960366,4/12/2016 4:00:00 AM,0,48,0,0.0


In [6]:
# Helper that splits the combined ActivityHour text into ActivityDay and ActivityTime
def split_activity_hour(df, column_name='ActivityHour'):
    """Split a combined date/time text column into ``ActivityDay`` and ``ActivityTime``.

    Each value is split at its first space only. The text before that space is
    parsed with ``pd.to_datetime`` and stored as ``ActivityDay``; everything after
    it (including any AM/PM marker) is stored unchanged as ``ActivityTime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name`` as strings. It is modified in place:
        the two new columns are added and ``column_name`` is dropped.
    column_name : str, default 'ActivityHour'
        Name of the column to split.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # n=1 limits the split to the first space. Splitting on every space puts the
    # AM/PM marker into a third piece, leaving only the clock digits in
    # ActivityTime so that e.g. 1:00:00 AM and 1:00:00 PM look identical.
    parts = df[column_name].str.split(' ', n=1, expand=True)
    df['ActivityDay'] = pd.to_datetime(parts[0])  # date portion as datetime64
    df['ActivityTime'] = parts[1]  # remaining text, e.g. "1:00:00 PM"
    # The original combined column is no longer needed once it has been split
    df.drop(column_name, axis=1, inplace=True)
    return df

In [7]:
# Split ActivityHour on the merged frame. The helper drops that column in place,
# so this cell cannot be re-run without first re-running the merge cell.
hourly_merged_data = split_activity_hour(hourly_merged_data, column_name='ActivityHour')

### Exporting the Dataset ->

In [7]:
# index=False keeps the auto-generated row index out of the file; otherwise it is
# written as an extra unnamed first column alongside the real data columns.
hourly_merged_data.to_csv("hourly_merged_data.csv", index=False)