# Objective: Data Cleaning (daily)

### Importing the Required Libraries ->

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Loading and Previewing the dataset ->

In [3]:
df = pd.read_csv("G:\My Drive\Prepinsta - Data Analytics\Week 8 - Task 8\Merging\Daily\daily_merged_data.csv", index_col=0)
df.head()

Unnamed: 0,Id,ActivityDay,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,...,StepTotal,Calories_calories,SedentaryMinutes_intensities,LightlyActiveMinutes_intensities,FairlyActiveMinutes_intensities,VeryActiveMinutes_intensities,SedentaryActiveDistance_intensities,LightActiveDistance_intensities,ModeratelyActiveDistance_intensities,VeryActiveDistance_intensities
0,1503960366,2016-04-12,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,...,13162,1985,728,328,13,25,0.0,6.06,0.55,1.88
1,1503960366,2016-04-13,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,...,10735,1797,776,217,19,21,0.0,4.71,0.69,1.57
2,1503960366,2016-04-14,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,...,10460,1776,1218,181,11,30,0.0,3.91,0.4,2.44
3,1503960366,2016-04-15,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,...,9762,1745,726,209,34,29,0.0,2.83,1.26,2.14
4,1503960366,2016-04-16,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,...,12669,1863,773,221,10,36,0.0,5.04,0.41,2.71


In [3]:
# Check the shape of dataset
df.shape

(940, 25)

In [4]:
# check if is there any null values in the dataset
df.isnull().sum()

Id                                      0
ActivityDay                             0
TotalSteps                              0
TotalDistance                           0
TrackerDistance                         0
LoggedActivitiesDistance                0
VeryActiveDistance                      0
ModeratelyActiveDistance                0
LightActiveDistance                     0
SedentaryActiveDistance                 0
VeryActiveMinutes                       0
FairlyActiveMinutes                     0
LightlyActiveMinutes                    0
SedentaryMinutes                        0
Calories                                0
StepTotal                               0
Calories_calories                       0
SedentaryMinutes_intensities            0
LightlyActiveMinutes_intensities        0
FairlyActiveMinutes_intensities         0
VeryActiveMinutes_intensities           0
SedentaryActiveDistance_intensities     0
LightActiveDistance_intensities         0
ModeratelyActiveDistance_intensiti

### Remove Duplicate values if any ->

In [5]:
# chek if is there any duplicate values in dataset
duplicates = duplicates = df.duplicated(subset=['Id', 'ActivityDay'], keep='first').sum()
duplicates

0

### Remove Duplicate Column if any ->

In [6]:
# Convert 'ActivityDay' to datetime
df['ActivityDay'] = pd.to_datetime(df['ActivityDay'])

# Check for duplicate columns by comparing values in columns that seem to represent the same data
duplicate_columns = ['TotalSteps', 'StepTotal', 
                     'Calories_calories', 'Calories',
                     'SedentaryMinutes_intensities', 'LightlyActiveMinutes_intensities', 
                     'FairlyActiveMinutes_intensities', 'VeryActiveMinutes_intensities', 
                     'SedentaryActiveDistance_intensities', 'LightActiveDistance_intensities', 
                     'ModeratelyActiveDistance_intensities', 'VeryActiveDistance_intensities']

# Check if the data in the original and the corresponding "_intensities" columns are the same
for column in duplicate_columns:
    if column in df.columns:
        original_column = column.replace('_intensities', '').replace('calories', 'Calories')
        if original_column in df.columns:
            # Check if all values are equal
            all_equal = (df[original_column] == df[column]).all()
            print(f"Columns '{original_column}' and '{column}' are identical: {all_equal}")

# Based on the output, we will decide which duplicate columns to drop. Let's first check the equality.



Columns 'TotalSteps' and 'TotalSteps' are identical: True
Columns 'StepTotal' and 'StepTotal' are identical: True
Columns 'Calories' and 'Calories' are identical: True
Columns 'SedentaryMinutes' and 'SedentaryMinutes_intensities' are identical: True
Columns 'LightlyActiveMinutes' and 'LightlyActiveMinutes_intensities' are identical: True
Columns 'FairlyActiveMinutes' and 'FairlyActiveMinutes_intensities' are identical: True
Columns 'VeryActiveMinutes' and 'VeryActiveMinutes_intensities' are identical: True
Columns 'SedentaryActiveDistance' and 'SedentaryActiveDistance_intensities' are identical: True
Columns 'LightActiveDistance' and 'LightActiveDistance_intensities' are identical: True
Columns 'ModeratelyActiveDistance' and 'ModeratelyActiveDistance_intensities' are identical: True
Columns 'VeryActiveDistance' and 'VeryActiveDistance_intensities' are identical: True


- All identified pairs of duplicate columns contain identical data, so we can safely remove one from each pair to simplify our dataset. We will keep the original columns and remove those with the "_intensities" suffix, as well as the "StepTotal" column which is a duplicate of "TotalSteps".

In [7]:
# Remove duplicate columns
columns_to_remove = [col for col in df.columns if '_intensities' in col or col == 'StepTotal' or col == 'Calories_calories']
data_cleaned = df.drop(columns=columns_to_remove)

# Check for missing values
missing_values = data_cleaned.isnull().sum()
missing_values

Id                          0
ActivityDay                 0
TotalSteps                  0
TotalDistance               0
TrackerDistance             0
LoggedActivitiesDistance    0
VeryActiveDistance          0
ModeratelyActiveDistance    0
LightActiveDistance         0
SedentaryActiveDistance     0
VeryActiveMinutes           0
FairlyActiveMinutes         0
LightlyActiveMinutes        0
SedentaryMinutes            0
Calories                    0
dtype: int64

In [8]:
# check the datatype of all the colimns in dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 940 entries, 0 to 939
Data columns (total 25 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   Id                                    940 non-null    int64         
 1   ActivityDay                           940 non-null    datetime64[ns]
 2   TotalSteps                            940 non-null    int64         
 3   TotalDistance                         940 non-null    float64       
 4   TrackerDistance                       940 non-null    float64       
 5   LoggedActivitiesDistance              940 non-null    float64       
 6   VeryActiveDistance                    940 non-null    float64       
 7   ModeratelyActiveDistance              940 non-null    float64       
 8   LightActiveDistance                   940 non-null    float64       
 9   SedentaryActiveDistance               940 non-null    float64       
 10  VeryAct

### Standerdize the column names ->

In [9]:
# Standardize column names
df.columns = df.columns.str.lower().str.replace(r'([a-z])([A-Z])', r'\1_\2').str.replace(r'[^a-z0-9_]', '', regex=True)

# Display the first few rows of the dataframe to verify the new column names
df.head()

Unnamed: 0,id,activityday,totalsteps,totaldistance,trackerdistance,loggedactivitiesdistance,veryactivedistance,moderatelyactivedistance,lightactivedistance,sedentaryactivedistance,...,steptotal,calories_calories,sedentaryminutes_intensities,lightlyactiveminutes_intensities,fairlyactiveminutes_intensities,veryactiveminutes_intensities,sedentaryactivedistance_intensities,lightactivedistance_intensities,moderatelyactivedistance_intensities,veryactivedistance_intensities
0,1503960366,2016-04-12,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,...,13162,1985,728,328,13,25,0.0,6.06,0.55,1.88
1,1503960366,2016-04-13,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,...,10735,1797,776,217,19,21,0.0,4.71,0.69,1.57
2,1503960366,2016-04-14,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,...,10460,1776,1218,181,11,30,0.0,3.91,0.4,2.44
3,1503960366,2016-04-15,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,...,9762,1745,726,209,34,29,0.0,2.83,1.26,2.14
4,1503960366,2016-04-16,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,...,12669,1863,773,221,10,36,0.0,5.04,0.41,2.71


### Formattin the activityday column as datetime datatype ->

In [10]:
# Convert 'activityday' to datetime format
df['activityday'] = pd.to_datetime(df['activityday'], format='%d/%m/%Y')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 940 entries, 0 to 939
Data columns (total 25 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   id                                    940 non-null    int64         
 1   activityday                           940 non-null    datetime64[ns]
 2   totalsteps                            940 non-null    int64         
 3   totaldistance                         940 non-null    float64       
 4   trackerdistance                       940 non-null    float64       
 5   loggedactivitiesdistance              940 non-null    float64       
 6   veryactivedistance                    940 non-null    float64       
 7   moderatelyactivedistance              940 non-null    float64       
 8   lightactivedistance                   940 non-null    float64       
 9   sedentaryactivedistance               940 non-null    float64       
 10  veryact

### Export the Dataset ->

In [11]:
df.to_csv("G:\My Drive\Prepinsta - Data Analytics\Week 8 - Task 8\Master Datasets\daily_merged_data.csv")