In [1]:
import numpy as np
import pandas as pd


# Load the raw workout log; every later cell cleans this frame under the name `df`.
df = pd.read_csv("Mine.csv")
# --- Initial Inspection of the DataFrame ---
print("Original DataFrame head:")
print(df.head())  # First 5 rows of the untouched data
print("\nOriginal DataFrame info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nOriginal DataFrame missing values before cleaning:")
print(df.isnull().sum())  # Number of missing values (NaN) in each column
print("\nOriginal DataFrame duplicates before cleaning:")
print(df.duplicated().sum())  # Number of rows that exactly repeat an earlier row

Original DataFrame head:
   Duration         Date  Pulse  Maxpulse  Calories
0        60  2023/10/01'  110.0     130.0     409.1
1        60  2023/10/02'  117.0     145.0     479.0
2        60  2023/10/03'  103.0     135.0     340.3
3        45  2023/10/04'  109.0     175.0     282.4
4        45  2023/10/05'  117.0     150.0     405.1

Original DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  31 non-null     int64  
 1   Date      29 non-null     object 
 2   Pulse     30 non-null     float64
 3   Maxpulse  29 non-null     float64
 4   Calories  27 non-null     float64
dtypes: float64(3), int64(1), object(1)
memory usage: 1.3+ KB
None

Original DataFrame missing values before cleaning:
Duration    0
Date        2
Pulse       1
Maxpulse    2
Calories    4
dtype: int64

Original DataFrame duplicates before cleaning:
0


In [None]:
# --- 1. Handle Missing Values / Empty Cells ---
# Numeric measurements are imputed with their own column median, which is less
# sensitive to outliers than the mean.
numeric_columns = ['Calories', 'Pulse', 'Maxpulse']
for column_name in numeric_columns:
    column_median = df[column_name].median()
    df[column_name] = df[column_name].fillna(column_median)

# Rows without a 'Date' are removed rather than imputed: inventing a date for a
# record could introduce inaccuracies.
df.dropna(subset=['Date'], inplace=True)

In [None]:
# --- 2. Handle Inconsistent Date Formats ---
# The 'Date' column might contain inconsistencies (e.g., apostrophes, different formats).

# Keep the cleaned *text* form of the dates in its own variable. Both parsing
# attempts below must read these original strings: once 'Date' has been converted
# to datetime, the original text of any unparseable entry is gone (it becomes NaT),
# so a fallback format could never match it.
raw_dates = df['Date'].astype(str).str.replace("'", "", regex=False)

# Primary format, e.g. '2023/10/01'. errors='coerce' turns non-matching entries into NaT.
dates_primary = pd.to_datetime(raw_dates, errors='coerce', format='%Y/%m/%d')

# Secondary format, e.g. '20231026', parsed from the same raw strings.
dates_secondary = pd.to_datetime(raw_dates, errors='coerce', format='%Y%m%d')

# Prefer the primary parse and fall back to the secondary one where it produced NaT.
df['Date'] = dates_primary.fillna(dates_secondary)

# Drop any rows whose date matched neither format (still NaT).
df.dropna(subset=['Date'], inplace=True)

In [None]:
# --- 3. Handle Duplicate Rows ---
# Flag every row that exactly repeats an earlier row across all columns, then keep
# only the unflagged rows (the first occurrence of each distinct row survives).
is_repeat = df.duplicated(keep='first')
df = df[~is_repeat]

In [None]:
# --- 4. Handle Wrong Data ---
# Remove rows whose values fall outside the ranges assumed to be plausible.
# Each rule is expressed as a boolean mask; a row is kept only if it passes all of them.

# Assumption: a workout does not last longer than 120 minutes.
duration_ok = df['Duration'].le(120)

# Assumption: 'Pulse' lies within 40-220 bpm (bounds inclusive).
pulse_ok = df['Pulse'].between(40, 220)

# Assumption: 'Maxpulse' lies within 80-220 bpm (bounds inclusive).
maxpulse_ok = df['Maxpulse'].between(80, 220)

df = df[duration_ok & pulse_ok & maxpulse_ok]

In [None]:
# --- 5. Handle Unnecessary Columns ---
# Based on the dataset and common fitness analysis, all columns ('Duration', 'Date', 'Pulse', 'Maxpulse', 'Calories')
# are considered relevant for analysis. Therefore, no columns are dropped in this step.
# If there were irrelevant columns, you would use: df.drop(columns=['column_name_to_drop'], inplace=True)

# --- Final Inspection of the Cleaned DataFrame ---
print("\nCleaned DataFrame head:")
print(df.head())  # First 5 rows of the cleaned DataFrame
print("\nCleaned DataFrame info:")
# DataFrame.info() prints its own summary and returns None; calling it directly
# avoids the extra "None" line that print(df.info()) would emit.
df.info()
print("\nCleaned DataFrame missing values (should be 0):")
print(df.isnull().sum())  # Verify no missing values remain
print("\nCleaned DataFrame duplicates (should be 0):")
print(df.duplicated().sum())  # Verify no duplicate rows remain

print("\n--- Cleaned Dataset ---")
# Display the entire cleaned DataFrame. '.to_string()' prevents truncation for large DataFrames.
print(df.to_string())