# Objective : Data Cleaning (hour)

### Import Required Libraries ->

In [16]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

### Load the Dataset ->

In [17]:
df = pd.read_csv("G:\My Drive\Prepinsta - Data Analytics\Week 8 - Task 8\Merging\Hour\hourly_merged_data.csv",index_col=0)

### Preview the DataSet ->

In [18]:
df.head()

Unnamed: 0,Id,ActivityHour,StepTotal,Calories,TotalIntensity,AverageIntensity
0,1503960366,4/12/2016 12:00:00 AM,373,81,20,0.333333
1,1503960366,4/12/2016 1:00:00 AM,160,61,8,0.133333
2,1503960366,4/12/2016 2:00:00 AM,151,59,7,0.116667
3,1503960366,4/12/2016 3:00:00 AM,0,47,0,0.0
4,1503960366,4/12/2016 4:00:00 AM,0,48,0,0.0


In [19]:
# Check the shape of dataset
df.shape

(22099, 6)

In [20]:
# check if is there any null values in the dataset
df.isnull().sum()

Id                  0
ActivityHour        0
StepTotal           0
Calories            0
TotalIntensity      0
AverageIntensity    0
dtype: int64

In [21]:
# check the datatype of all the colimns in dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22099 entries, 0 to 22098
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Id                22099 non-null  int64  
 1   ActivityHour      22099 non-null  object 
 2   StepTotal         22099 non-null  int64  
 3   Calories          22099 non-null  int64  
 4   TotalIntensity    22099 non-null  int64  
 5   AverageIntensity  22099 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 1.2+ MB


### Splitting of ActivityHour column in Date and Time Respectively ->

In [22]:
# Define a function to split the ActivityHour column into ActivityDay and ActivityTime
def split_activity_hour(df, column_name='ActivityHour'):
    # Split the column into two parts: Date and Time
    split_columns = df[column_name].str.split(' ', expand=True)
    # Assign the split columns to the dataframe
    df['ActivityDay'] = pd.to_datetime(split_columns[0])  # Convert to datetime format
    df['ActivityTime'] = split_columns[1]
    # Drop the original ActivityHour column
    df.drop(column_name, axis=1, inplace=True)
    return df

In [23]:
# Apply the function to each dataset
df = split_activity_hour(df)
df.head()

Unnamed: 0,Id,StepTotal,Calories,TotalIntensity,AverageIntensity,ActivityDay,ActivityTime
0,1503960366,373,81,20,0.333333,2016-04-12,12:00:00
1,1503960366,160,61,8,0.133333,2016-04-12,1:00:00
2,1503960366,151,59,7,0.116667,2016-04-12,2:00:00
3,1503960366,0,47,0,0.0,2016-04-12,3:00:00
4,1503960366,0,48,0,0.0,2016-04-12,4:00:00


### Standerdize the column names ->

In [24]:
# Standardize column names
df.columns = df.columns.str.lower().str.replace(r'([a-z])([A-Z])', r'\1_\2').str.replace(r'[^a-z0-9_]', '', regex=True)

# Display the first few rows of the dataframe to verify the new column names
df.head()

Unnamed: 0,id,steptotal,calories,totalintensity,averageintensity,activityday,activitytime
0,1503960366,373,81,20,0.333333,2016-04-12,12:00:00
1,1503960366,160,61,8,0.133333,2016-04-12,1:00:00
2,1503960366,151,59,7,0.116667,2016-04-12,2:00:00
3,1503960366,0,47,0,0.0,2016-04-12,3:00:00
4,1503960366,0,48,0,0.0,2016-04-12,4:00:00


### Rearrange the position of 'activityday' and 'activitytime' column ->

In [25]:
# Rearrange the position of 'activityday' column to be adjacent to 'id' column
cols = list(df.columns)
cols.insert(1, cols.pop(cols.index('activityday')))  # Move 'activityday' and 'acivitytime' next to 'id'
cols.insert(2, cols.pop(cols.index('activitytime'))) 
df = df[cols]

# Display the first few rows of the dataframe to verify the rearrangement
df.head()

Unnamed: 0,id,activityday,activitytime,steptotal,calories,totalintensity,averageintensity
0,1503960366,2016-04-12,12:00:00,373,81,20,0.333333
1,1503960366,2016-04-12,1:00:00,160,61,8,0.133333
2,1503960366,2016-04-12,2:00:00,151,59,7,0.116667
3,1503960366,2016-04-12,3:00:00,0,47,0,0.0
4,1503960366,2016-04-12,4:00:00,0,48,0,0.0


### Formattin the activityday column as datetime datatype ->

In [26]:
# Convert 'activityday' and to datetime format
df['activityday'] = pd.to_datetime(df['activityday'], format='%d-%m-%Y')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22099 entries, 0 to 22098
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                22099 non-null  int64         
 1   activityday       22099 non-null  datetime64[ns]
 2   activitytime      22099 non-null  object        
 3   steptotal         22099 non-null  int64         
 4   calories          22099 non-null  int64         
 5   totalintensity    22099 non-null  int64         
 6   averageintensity  22099 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(4), object(1)
memory usage: 1.3+ MB


### Remove Duplicate values if any ->

In [27]:
# chek if is there any duplicate values in dataset
duplicates = duplicates = df.duplicated().sum()
duplicates

1606

In [28]:
# Remove duplicate rows, keeping the first occurrence of each duplicate
df = df.drop_duplicates()

### Export the Dataset ->

In [29]:
df.to_csv("G:\My Drive\Prepinsta - Data Analytics\Week 8 - Task 8\Master Datasets\hourly_merged_data.csv")