# Objective: Data Merging (daily)

### Importing Required Libraries ->

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Loading and Previewing the Datasets ->

In [3]:
# Read the four daily-level CSV files, one DataFrame per file (read in the listed order)
daily_steps, daily_activity, daily_calories, daily_intensities = (
    pd.read_csv(csv_path)
    for csv_path in (
        'dailySteps_merged.csv',
        'dailyActivity_merged.csv',
        'dailyCalories_merged.csv',
        'dailyIntensities_merged.csv',
    )
)

# Parallel lists (same order): the frames themselves and the labels used when printing.
# `datasets` is reused by the date-conversion loop in the merge cell.
dataset_names = ['daily_steps', 'daily_activity', 'daily_calories', 'daily_intensities']
datasets = [daily_steps, daily_activity, daily_calories, daily_intensities]

# Preview each frame: upper-cased label, first five rows, column names, then a blank gap
for label, frame in zip(dataset_names, datasets):
    print(f"{label.upper()}:", frame.head(), frame.columns.tolist(), "\n", sep="\n")

DAILY_STEPS:
           Id ActivityDay  StepTotal
0  1503960366   4/12/2016      13162
1  1503960366   4/13/2016      10735
2  1503960366   4/14/2016      10460
3  1503960366   4/15/2016       9762
4  1503960366   4/16/2016      12669
['Id', 'ActivityDay', 'StepTotal']


DAILY_ACTIVITY:
           Id ActivityDate  TotalSteps  TotalDistance  TrackerDistance  \
0  1503960366    4/12/2016       13162           8.50             8.50   
1  1503960366    4/13/2016       10735           6.97             6.97   
2  1503960366    4/14/2016       10460           6.74             6.74   
3  1503960366    4/15/2016        9762           6.28             6.28   
4  1503960366    4/16/2016       12669           8.16             8.16   

   LoggedActivitiesDistance  VeryActiveDistance  ModeratelyActiveDistance  \
0                       0.0                1.88                      0.55   
1                       0.0                1.57                      0.69   
2                       0.0         

In [4]:
# Check the (rows, columns) shape of every frame before merging
for frame in (daily_steps, daily_activity, daily_calories, daily_intensities):
    print(frame.shape)

(940, 3)
(940, 15)
(940, 3)
(940, 10)


### Merging the Datasets ->

In [5]:
# Preprocessing: give every frame the same date-key column name and a datetime dtype.
# The rename is done in place on purpose: the `datasets` list holds this same object,
# so the conversion loop below finds the renamed 'ActivityDay' column on it.
daily_activity.rename(columns={'ActivityDate': 'ActivityDay'}, inplace=True)

# Convert ActivityDay to datetime in all datasets so the join keys share one dtype
for dataset in datasets:
    dataset['ActivityDay'] = pd.to_datetime(dataset['ActivityDay'])

# Columns that identify one row: one user on one day
merge_keys = ['Id', 'ActivityDay']

# Merge datasets: start with daily_activity as the base, then fold in the others one by one.
# An outer join keeps keys that appear in only one frame. When an incoming column name
# already exists in the accumulated frame, the incoming column gets the per-source suffix
# and the existing column keeps its name.
daily_merged_data = daily_activity
for other_frame, source_suffix in (
    (daily_steps, '_steps'),
    (daily_calories, '_calories'),
    (daily_intensities, '_intensities'),
):
    daily_merged_data = daily_merged_data.merge(
        other_frame, on=merge_keys, how='outer', suffixes=('', source_suffix)
    )

# NOTE(review): in the previewed rows StepTotal, Calories_calories and the *_intensities
# columns carry the same values as the matching daily_activity columns; they look
# redundant -- verify on the full data before dropping them.

# Check for duplicates based on Id and ActivityDay.
# int() converts the NumPy scalar returned by .sum() into a plain Python int, so the
# printed summary shows `0` instead of `np.int64(0)` on NumPy 2.x.
duplicates = int(daily_merged_data.duplicated(subset=merge_keys, keep='first').sum())

# Overview of the merged dataset
n_rows, n_columns = daily_merged_data.shape
merged_overview = {
    'Number of rows': n_rows,
    'Number of columns': n_columns,
    'Number of duplicates': duplicates
}

print(merged_overview)
daily_merged_data.head()


{'Number of rows': 940, 'Number of columns': 25, 'Number of duplicates': 0}


Unnamed: 0,Id,ActivityDay,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,...,StepTotal,Calories_calories,SedentaryMinutes_intensities,LightlyActiveMinutes_intensities,FairlyActiveMinutes_intensities,VeryActiveMinutes_intensities,SedentaryActiveDistance_intensities,LightActiveDistance_intensities,ModeratelyActiveDistance_intensities,VeryActiveDistance_intensities
0,1503960366,2016-04-12,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,...,13162,1985,728,328,13,25,0.0,6.06,0.55,1.88
1,1503960366,2016-04-13,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,...,10735,1797,776,217,19,21,0.0,4.71,0.69,1.57
2,1503960366,2016-04-14,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,...,10460,1776,1218,181,11,30,0.0,3.91,0.4,2.44
3,1503960366,2016-04-15,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,...,9762,1745,726,209,34,29,0.0,2.83,1.26,2.14
4,1503960366,2016-04-16,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,...,12669,1863,773,221,10,36,0.0,5.04,0.41,2.71


### Exporting the Dataset ->

In [7]:
# Write the merged table to CSV.
# index=False leaves out the row index: with the default (index=True) it is written as an
# unnamed first column and comes back as a stray 'Unnamed: 0' column on pd.read_csv.
daily_merged_data.to_csv('daily_merged_data.csv', index=False)