In [None]:
import pandas as pd
import numpy as np

# Read the raw CSV from the working directory into the frame that every
# later cell works on.
df_mine = pd.read_csv(filepath_or_buffer='Mine.csv')



In [2]:
# Preview only the first rows; rendering the whole frame floods the notebook
# output and adds nothing once the column layout is visible.
df_mine.head(10)

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2023/10/01',110.0,130.0,409.1
1,60,2023/10/02',117.0,145.0,479.0
2,60,2023/10/03',103.0,135.0,340.3
3,45,2023/10/04',109.0,175.0,282.4
4,45,2023/10/05',117.0,150.0,405.1
5,60,2023/10/06',103.0,125.0,300.0
6,60,2023/10/07',110.0,135.0,374.0
7,400,2023/10/08',114.0,133.0,
8,60,2023/10/09',112.0,126.0,193.8
9,30,2023/10/10',102.0,147.0,234.8


In [3]:
# 1. Handle missing values
# Each listed column is filled with its own summary statistic:
# median for 'Pulse' and 'Maxpulse', mean for 'Calories'.
# Columns that are absent from the frame are skipped.
fill_statistic = {
    'Pulse': 'median',
    'Maxpulse': 'median',
    'Calories': 'mean',
}

for column_name, statistic in fill_statistic.items():
    if column_name not in df_mine.columns:
        continue
    # The statistic is computed on the column as it currently stands
    # (missing entries are ignored by pandas when aggregating).
    replacement = df_mine[column_name].agg(statistic)
    df_mine[column_name] = df_mine[column_name].fillna(replacement)

In [5]:
# 2. Fix inconsistent date formats
if 'Date' in df_mine.columns:
    # Strip stray single quotes. The cleaned text is kept in its own variable
    # because the fallback below has to re-read the original strings: once the
    # column holds datetimes, a value that failed to parse is just NaT and the
    # digits it came from can no longer be recovered.
    date_text = df_mine['Date'].astype(str).str.replace("'", "", regex=False)

    def fix_strict_format(d):
        """Rewrite an all-digit, 8-character string (YYYYMMDD) as YYYY/MM/DD.

        Any other string is returned unchanged.
        """
        if d.isdigit() and len(d) == 8:
            return f"{d[:4]}/{d[4:6]}/{d[6:]}"
        return d

    # First pass: parse the text as-is; anything unparseable becomes NaT.
    parsed_dates = pd.to_datetime(date_text, errors='coerce')

    # Second pass: parse the same text again with compact YYYYMMDD values
    # expanded to YYYY/MM/DD.
    fallback_dates = pd.to_datetime(
        date_text.apply(fix_strict_format), errors='coerce'
    )

    # Rows the first pass parsed keep that value; only the NaT rows take
    # whatever the second pass managed to parse.
    df_mine['Date'] = parsed_dates.fillna(fallback_dates)



In [6]:
# 3. Remove duplicate rows
# 'Unnamed: 0' looks like a row counter carried over from the CSV (0, 1, 2, ...
# in the preview); presumably it is unique per row -- TODO confirm. If it takes
# part in the comparison no two rows can ever be equal and nothing is dropped,
# so rows are compared on every other column instead.
dedupe_columns = [col for col in df_mine.columns if col != 'Unnamed: 0']

# `or None` falls back to comparing all columns if no other column exists.
df_mine = df_mine.drop_duplicates(subset=dedupe_columns or None)


In [9]:
# Display duplicate rows in df_mine (excluding the first occurrence)
# 'Unnamed: 0' looks like a per-row counter from the CSV (presumably unique --
# TODO confirm); it is excluded from the comparison because including it would
# make every row distinct and this check would always come back empty.
compare_columns = [col for col in df_mine.columns if col != 'Unnamed: 0']

# `or None` falls back to comparing all columns if no other column exists.
duplicates = df_mine[df_mine.duplicated(subset=compare_columns or None)]
duplicates

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories


In [10]:
# Display rows with potentially wrong data in df_mine.
# A row is flagged when at least one of these holds:
#   - Duration above 120
#   - Pulse below 60 or above 200
#   - Maxpulse below 100
#   - Calories below 100 or above 1000
#   - Date is missing
duration_too_long = df_mine['Duration'].gt(120)
pulse_out_of_range = df_mine['Pulse'].lt(60) | df_mine['Pulse'].gt(200)
maxpulse_too_low = df_mine['Maxpulse'].lt(100)
calories_out_of_range = df_mine['Calories'].lt(100) | df_mine['Calories'].gt(1000)
date_missing = df_mine['Date'].isna()

suspect_rows = (
    duration_too_long
    | pulse_out_of_range
    | maxpulse_too_low
    | calories_out_of_range
    | date_missing
)

wrong_data = df_mine[suspect_rows]
wrong_data

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
17,45,NaT,142.0,103.0,241.4
22,60,NaT,130.0,108.0,230.8
29,60,NaT,92.0,125.0,380.5


In [7]:
# 4. Fix wrong data
if 'Duration' in df_mine.columns:
    # A Duration of exactly 400 is treated as unrealistic and rewritten to 40.
    duration_is_400 = df_mine['Duration'] == 400
    df_mine.loc[duration_is_400, 'Duration'] = 40

if 'Maxpulse' in df_mine.columns:
    # Maxpulse readings below 100 are replaced by the column median. The median
    # is taken over the column as it stands, low readings included.
    maxpulse_median = df_mine['Maxpulse'].median()
    maxpulse_below_100 = df_mine['Maxpulse'] < 100
    df_mine.loc[maxpulse_below_100, 'Maxpulse'] = maxpulse_median

In [11]:
# 5. Drop unnecessary columns: nothing is dropped here. If that changes,
#    df.drop(['col1', 'col2'], axis=1) is the call to use.

# Write the cleaned frame to disk; index=False keeps the DataFrame's own index
# out of the output file.
df_mine.to_csv(path_or_buf='Mine_cleaned.csv', index=False)

# Confirm that the save step ran.
print(" Dataset cleaned and saved as 'Mine_cleaned.csv'")


 Dataset cleaned and saved as 'Mine_cleaned.csv'
