In [3]:
import pandas as pd
import numpy as np

# Read the raw data into a DataFrame.
df_mine = pd.read_csv('Mine.csv')

# 1. Impute missing values.
# Each listed column, when present in the file, has its NaNs replaced by a
# statistic computed from that same column: the median for the two pulse
# columns and the mean for 'Calories'. Columns absent from the CSV are skipped.
fill_statistic = {
    'Pulse': pd.Series.median,
    'Maxpulse': pd.Series.median,
    'Calories': pd.Series.mean,
}
for column, statistic in fill_statistic.items():
    if column not in df_mine.columns:
        continue
    df_mine[column] = df_mine[column].fillna(statistic(df_mine[column]))

# 2. Fix inconsistent date formats
# The 'Date' column is treated as text that may contain single quotes and
# compact YYYYMMDD digit runs (e.g. 20231018) alongside ordinary dates.
if 'Date' in df_mine.columns:

    def fix_strict_format(d):
        """Return an 8-digit YYYYMMDD string as YYYY/MM/DD; pass anything else through."""
        if d.isdigit() and len(d) == 8:
            return f"{d[:4]}/{d[4:6]}/{d[6:]}"
        return d

    # Keep the cleaned text in its own variable. The fallback below must see
    # the original strings: once the column has been overwritten with parsed
    # values, every row that failed is just NaT and its digits are gone.
    raw_dates = (
        df_mine['Date']
        .astype(str)
        .str.replace("'", "", regex=False)
        .str.strip()  # padding would otherwise defeat the isdigit() check
    )

    # First pass: let pandas parse whatever it can; failures become NaT.
    parsed_dates = pd.to_datetime(raw_dates, errors='coerce')

    # Second pass: rewrite compact YYYYMMDD values as YYYY/MM/DD and parse
    # with that exact format, so the result does not depend on whatever
    # format pandas inferred for the rest of the column.
    repaired_dates = pd.to_datetime(
        raw_dates.apply(fix_strict_format),
        format='%Y/%m/%d',
        errors='coerce',
    )

    # fillna only touches rows the first pass could not parse.
    df_mine['Date'] = parsed_dates.fillna(repaired_dates)

# 3. Drop rows that are exact duplicates of an earlier row.
df_mine = df_mine.drop_duplicates()

# 4. Repair implausible values.
# A Duration of exactly 400 is rewritten to 40.
if 'Duration' in df_mine.columns:
    duration_is_400 = df_mine['Duration'].eq(400)
    df_mine.loc[duration_is_400, 'Duration'] = 40

# Maxpulse readings under 100 are replaced with the column median.
# NOTE(review): the median is taken over the whole column, so it includes the
# sub-100 readings that are about to be replaced — confirm this is intended.
if 'Maxpulse' in df_mine.columns:
    maxpulse_median = df_mine['Maxpulse'].median()
    maxpulse_too_low = df_mine['Maxpulse'].lt(100)
    df_mine.loc[maxpulse_too_low, 'Maxpulse'] = maxpulse_median

# 5. No columns are dropped here; df_mine.drop([...], axis=1) would be the
# place to do it if any turn out to be unnecessary.

# Write the cleaned frame to disk without the index column.
df_mine.to_csv('Mine_cleaned.csv', index=False)

# Confirm completion to the user.
print("✅ Dataset cleaned and saved as 'Mine_cleaned.csv'")


✅ Dataset cleaned and saved as 'Mine_cleaned.csv'
