# Objective : Data Merging and Melting (minute - wide)

### Importing Required Libraries ->

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

### Load the Datasets ->

In [2]:
# Read the three wide-format minute-level CSV files into DataFrames,
# in the order calories, intensities, steps.
minute_calories, minute_intensities, minute_steps = map(
    pd.read_csv,
    (
        'minuteCaloriesWide_merged.csv',
        'minuteIntensitiesWide_merged.csv',
        'minuteStepsWide_merged.csv',
    ),
)

### Preview the Datasets ->

In [3]:
# Preview each dataset: its name, its first five rows and its full column list
dataset_names = ['minute_calories', 'minute_intensities','minute_steps']
datasets = [minute_calories, minute_intensities, minute_steps]

for label, frame in zip(dataset_names, datasets):
    # One print call per dataset; sep="\n" puts each item on its own line and
    # the trailing "\n" item leaves blank lines between datasets.
    print(
        f"{label.upper()}:",
        frame.head(),
        frame.columns.tolist(),
        "\n",
        sep="\n",
    )

MINUTE_CALORIES:
           Id           ActivityHour  Calories00  Calories01  Calories02  \
0  1503960366  4/13/2016 12:00:00 AM      1.8876      2.2022      0.9438   
1  1503960366   4/13/2016 1:00:00 AM      0.7865      0.7865      0.7865   
2  1503960366   4/13/2016 2:00:00 AM      0.7865      0.7865      0.7865   
3  1503960366   4/13/2016 3:00:00 AM      0.7865      0.7865      0.7865   
4  1503960366   4/13/2016 4:00:00 AM      0.7865      0.7865      0.7865   

   Calories03  Calories04  Calories05  Calories06  Calories07  ...  \
0      0.9438      0.9438      2.0449      0.9438      2.2022  ...   
1      0.7865      0.9438      0.9438      0.9438      0.7865  ...   
2      0.7865      0.7865      0.7865      0.7865      0.7865  ...   
3      0.7865      0.7865      0.7865      0.7865      0.7865  ...   
4      0.7865      0.7865      0.7865      0.7865      0.7865  ...   

   Calories50  Calories51  Calories52  Calories53  Calories54  Calories55  \
0      0.9438      2.0449   

In [4]:
# Ensure the three datasets have the same shape before merging
# (printed one per line: calories, intensities, steps)
print(
    minute_calories.shape,
    minute_intensities.shape,
    minute_steps.shape,
    sep="\n",
)

(21645, 62)
(21645, 62)
(21645, 62)


### Dataset Melting ->

In [5]:
# Perform the melting operation on the 'minuteIntensitiesWide_merged' dataset:
# every column other than the Id/ActivityHour keys becomes its own row, with the
# source column name stored in 'Minute' and its value in 'Intensities'.
melted_data1 = pd.melt(minute_intensities, id_vars=['Id', 'ActivityHour'], var_name='Minute', value_name='Intensities')

# Keep only the digits of the original column name as an integer minute.
# The pattern is a raw string so that '\d' is not parsed as an (invalid) string
# escape, and expand=False makes extract() return a Series rather than a
# one-column DataFrame.
melted_data1['Minute'] = melted_data1['Minute'].str.extract(r'(\d+)', expand=False).astype(int)

# Display the first few rows of the melted dataset
melted_data1.head()


Unnamed: 0,Id,ActivityHour,Minute,Intensities
0,1503960366,4/13/2016 12:00:00 AM,0,1
1,1503960366,4/13/2016 1:00:00 AM,0,0
2,1503960366,4/13/2016 2:00:00 AM,0,0
3,1503960366,4/13/2016 3:00:00 AM,0,0
4,1503960366,4/13/2016 4:00:00 AM,0,0


In [6]:
# Sanity check: the wide frame printed (21645, 62), i.e. 60 value columns besides
# the two keys, so the long frame should have 21645 * 60 = 1298700 rows and 4 columns.
melted_data1.shape

(1298700, 4)

In [7]:
# Perform the melting operation on the 'minuteCaloriesWide_merged' dataset:
# every column other than the Id/ActivityHour keys (Calories00, Calories01, ...)
# becomes its own row, with the source column name stored in 'Minute' and its
# value in 'Calories'.
melted_data2 = pd.melt(minute_calories, id_vars=['Id', 'ActivityHour'], var_name='Minute', value_name='Calories')

# Keep only the digits of the original column name as an integer minute.
# The pattern is a raw string so that '\d' is not parsed as an (invalid) string
# escape, and expand=False makes extract() return a Series rather than a
# one-column DataFrame.
melted_data2['Minute'] = melted_data2['Minute'].str.extract(r'(\d+)', expand=False).astype(int)

# Display the first few rows of the melted dataset
melted_data2.head()

Unnamed: 0,Id,ActivityHour,Minute,Calories
0,1503960366,4/13/2016 12:00:00 AM,0,1.8876
1,1503960366,4/13/2016 1:00:00 AM,0,0.7865
2,1503960366,4/13/2016 2:00:00 AM,0,0.7865
3,1503960366,4/13/2016 3:00:00 AM,0,0.7865
4,1503960366,4/13/2016 4:00:00 AM,0,0.7865


In [8]:
# Sanity check: the wide frame printed (21645, 62), i.e. 60 value columns besides
# the two keys, so the long frame should have 21645 * 60 = 1298700 rows and 4 columns.
melted_data2.shape

(1298700, 4)

In [9]:
# Perform the melting operation on the 'minuteStepsWide_merged' dataset:
# every column other than the Id/ActivityHour keys becomes its own row, with the
# source column name stored in 'Minute' and its value in 'Steps'.
melted_data3 = pd.melt(minute_steps, id_vars=['Id', 'ActivityHour'], var_name='Minute', value_name='Steps')

# Keep only the digits of the original column name as an integer minute.
# The pattern is a raw string so that '\d' is not parsed as an (invalid) string
# escape, and expand=False makes extract() return a Series rather than a
# one-column DataFrame.
melted_data3['Minute'] = melted_data3['Minute'].str.extract(r'(\d+)', expand=False).astype(int)

# Display the first few rows of the melted dataset
melted_data3.head()

Unnamed: 0,Id,ActivityHour,Minute,Steps
0,1503960366,4/13/2016 12:00:00 AM,0,4
1,1503960366,4/13/2016 1:00:00 AM,0,0
2,1503960366,4/13/2016 2:00:00 AM,0,0
3,1503960366,4/13/2016 3:00:00 AM,0,0
4,1503960366,4/13/2016 4:00:00 AM,0,0


In [10]:
# Sanity check: the wide frame printed (21645, 62), i.e. 60 value columns besides
# the two keys, so the long frame should have 21645 * 60 = 1298700 rows and 4 columns.
melted_data3.shape

(1298700, 4)

In [11]:
# Merge the three melted (long-format) frames on their shared keys.
# melted_data1 (Intensities) is the base; Calories and then Steps are joined onto it.
# An outer join keeps key combinations that appear in only one frame.
# validate='one_to_one' makes pandas raise a MergeError if a key combination
# repeats on either side, because repeated keys would silently multiply rows.
merge_keys = ['Id', 'ActivityHour', 'Minute']

minute_merged_data_wide = melted_data1.merge(
    melted_data2,
    on=merge_keys,
    how='outer',
    suffixes=('', '_calories'),
    validate='one_to_one',
)

# Add the Steps values using the same keys and the same uniqueness check
minute_merged_data_wide = minute_merged_data_wide.merge(
    melted_data3,
    on=merge_keys,
    how='outer',
    suffixes=('', '_steps'),
    validate='one_to_one',
)

# Count fully duplicated rows; cast to a plain int so the summary prints as a
# bare number regardless of the NumPy scalar repr.
duplicates = int(minute_merged_data_wide.duplicated().sum())

# Overview of the merged dataset
merged_overview = {
    'Number of rows': minute_merged_data_wide.shape[0],
    'Number of columns': minute_merged_data_wide.shape[1],
    'Number of duplicates': duplicates
}

print(merged_overview)
minute_merged_data_wide.head()

{'Number of rows': 1298700, 'Number of columns': 6, 'Number of duplicates': 0}


Unnamed: 0,Id,ActivityHour,Minute,Intensities,Calories,Steps
0,1503960366,4/13/2016 12:00:00 AM,0,1,1.8876,4
1,1503960366,4/13/2016 1:00:00 AM,0,0,0.7865,0
2,1503960366,4/13/2016 2:00:00 AM,0,0,0.7865,0
3,1503960366,4/13/2016 3:00:00 AM,0,0,0.7865,0
4,1503960366,4/13/2016 4:00:00 AM,0,0,0.7865,0


### Exporting the Dataset ->

In [12]:
# Write the merged long-format table to CSV. index=False omits the DataFrame's
# auto-generated row index, which would otherwise be written as an extra unnamed
# first column and come back as 'Unnamed: 0' when the file is read again.
minute_merged_data_wide.to_csv("minute_merged_data_wide.csv", index=False)