# Objective : Data Merging (minute - narrow)

### Importing Required Libraries ->

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

### Load the Datasets ->

In [2]:
# Load datasets
minute_calories = pd.read_csv('minuteCaloriesNarrow_merged.csv')
minute_intensities = pd.read_csv('minuteIntensitiesNarrow_merged.csv')
minute_mets = pd.read_csv('minuteMETsNarrow_merged.csv')
minute_steps = pd.read_csv('minuteStepsNarrow_merged.csv')

### Preview the Datasets ->

In [3]:
# Inspect the first few rows and the column names of each dataset
datasets = [minute_calories, minute_intensities, minute_mets, minute_steps]
dataset_names = ['minute_calories', 'minute_intensities', 'minute_mets', 'minute_steps']

for name, dataset in zip(dataset_names, datasets):
    print(f"{name.upper()}:")
    print(dataset.head())
    print(dataset.columns.tolist())
    print("\n")

MINUTE_CALORIES:
           Id         ActivityMinute  Calories
0  1503960366  4/12/2016 12:00:00 AM    0.7865
1  1503960366  4/12/2016 12:01:00 AM    0.7865
2  1503960366  4/12/2016 12:02:00 AM    0.7865
3  1503960366  4/12/2016 12:03:00 AM    0.7865
4  1503960366  4/12/2016 12:04:00 AM    0.7865
['Id', 'ActivityMinute', 'Calories']


MINUTE_INTENSITIES:
           Id         ActivityMinute  Intensity
0  1503960366  4/12/2016 12:00:00 AM          0
1  1503960366  4/12/2016 12:01:00 AM          0
2  1503960366  4/12/2016 12:02:00 AM          0
3  1503960366  4/12/2016 12:03:00 AM          0
4  1503960366  4/12/2016 12:04:00 AM          0
['Id', 'ActivityMinute', 'Intensity']


MINUTE_METS:
           Id         ActivityMinute  METs
0  1503960366  4/12/2016 12:00:00 AM    10
1  1503960366  4/12/2016 12:01:00 AM    10
2  1503960366  4/12/2016 12:02:00 AM    10
3  1503960366  4/12/2016 12:03:00 AM    10
4  1503960366  4/12/2016 12:04:00 AM    10
['Id', 'ActivityMinute', 'METs']


MINUTE_S

In [4]:
# Enusure the shape of dataset before merging
print(minute_calories.shape)
print(minute_intensities.shape)
print(minute_mets.shape)
print(minute_steps.shape)

(1325580, 3)
(1325580, 3)
(1325580, 3)
(1325580, 3)


### Merging the Datasets ->

In [5]:
# Merge datasets: Start with minutely_calories as the base, then merge others one by one
# Using an outer join to ensure no data is missed
minute_merged_data_narrow = minute_calories.merge(minute_intensities, on=['Id', 'ActivityMinute'], how='outer', suffixes=('', '_intensities'))

# Merging minutely_mets
minute_merged_data_narrow = minute_merged_data_narrow.merge(minute_mets, on=['Id', 'ActivityMinute'], how='outer', suffixes=('', '_mets'))

# Merging minutely_steps
minute_merged_data_narrow = minute_merged_data_narrow.merge(minute_steps, on=['Id', 'ActivityMinute'], how='outer', suffixes=('', '_steps'))

# Check for duplicates based on Id and ActivityDay
duplicates = minute_merged_data_narrow.duplicated(subset=['Id', 'ActivityMinute'], keep='first').sum()

# Overview of the merged dataset
merged_overview = {
    'Number of rows': minute_merged_data_narrow.shape[0],
    'Number of columns': minute_merged_data_narrow.shape[1],
    'Number of duplicates': duplicates
}

print(merged_overview)
minute_merged_data_narrow.head()

{'Number of rows': 1325580, 'Number of columns': 6, 'Number of duplicates': 0}


Unnamed: 0,Id,ActivityMinute,Calories,Intensity,METs,Steps
0,1503960366,4/12/2016 12:00:00 AM,0.7865,0,10,0
1,1503960366,4/12/2016 12:01:00 AM,0.7865,0,10,0
2,1503960366,4/12/2016 12:02:00 AM,0.7865,0,10,0
3,1503960366,4/12/2016 12:03:00 AM,0.7865,0,10,0
4,1503960366,4/12/2016 12:04:00 AM,0.7865,0,10,0


In [6]:
# Define a function to split the ActivityHour column into ActivityDay and ActivityTime
def split_activity_minute(df, column_name='ActivityMinute'):
    # Split the column into two parts: Date and Time
    split_columns = df[column_name].str.split(' ', expand=True)
    # Assign the split columns to the dataframe
    df['ActivityDay'] = pd.to_datetime(split_columns[0])  # Convert to datetime format
    df['ActivityTime'] = split_columns[1]
    # Drop the original ActivityHour column
    df.drop(column_name, axis=1, inplace=True)
    return df

In [7]:
# Apply the function to each dataset
minute_merged_data_narrow = split_activity_minute(minute_merged_data_narrow)

In [8]:
minute_merged_data_narrow.head()

Unnamed: 0,Id,Calories,Intensity,METs,Steps,ActivityDay,ActivityTime
0,1503960366,0.7865,0,10,0,2016-04-12,12:00:00
1,1503960366,0.7865,0,10,0,2016-04-12,12:01:00
2,1503960366,0.7865,0,10,0,2016-04-12,12:02:00
3,1503960366,0.7865,0,10,0,2016-04-12,12:03:00
4,1503960366,0.7865,0,10,0,2016-04-12,12:04:00


### Exporting the Dataset ->

In [9]:
minute_merged_data_narrow.to_csv("minute_merged_data_narrow.csv")