In [None]:
import pandas as pd

# Load the dataset from the CSV file that sits next to the notebook.
df = pd.read_csv('data.csv')

# Preview the first five rows; as the last expression of the cell the
# result is rendered as the cell output.
df.head(n=5)

In [None]:
#Data Processing or Data Cleaning
"""
BAD DATA:
1. Empty values
2. Wrong formats (e.g. the date 30/05/2025 stored as 30052025)
3. Wrong data (e.g. Age: 23, -23, 123, 20 -- here -23 and 123 are not plausible ages)
4. Duplicate data
5. Outliers
"""

In [None]:
# Handling empty values, step 1: detect them.
# isna() flags every missing cell; summing the flags column by column gives
# the number of empty cells per column.
missing_per_column = df.isna().sum()
print(missing_per_column)

Duration    0
Pulse       0
Maxpulse    0
Calories    5
dtype: int64


In [None]:
# Wrong-format check: list the dtype pandas assigned to each column.
column_types = df.dtypes
print(column_types)

In [3]:
# Check for values that cannot be read as numbers in the columns that are
# expected to be numeric.
#
# A column only reaches the inner branch when pandas did not give it a numeric
# dtype, i.e. it is being held as text. In that situation every non-missing
# cell is a Python str, so testing isinstance(x, (int, float)) would report the
# entire column, including perfectly valid entries such as "60". Coercing with
# pd.to_numeric isolates just the cells that fail to parse.
for col in ['Duration','Pulse','Maxpulse','Calories']:
    if not pd.api.types.is_numeric_dtype(df[col]):
        as_number = pd.to_numeric(df[col], errors='coerce')
        # errors='coerce' turns unparseable cells into NaN. Cells that were
        # already empty are excluded here; they belong to the missing-value
        # check, not to this one.
        not_parseable = as_number.isna() & df[col].notna()
        if not_parseable.any():
            print(f"Non-numeric values found in {col}:")
            print(df.loc[not_parseable, col])

In [4]:
# Wrong-data check: report any negative entries in the numeric columns.
for col in ['Duration', 'Pulse', 'Maxpulse', 'Calories']:
    below_zero = df[col].lt(0)
    if not below_zero.any():
        continue
    print(f"Negative values found in {col}:")
    # Show only the offending cells of this column, keeping their row labels.
    print(df.loc[below_zero, col])

In [7]:
# Duplicate check: duplicated() returns a boolean Series that is True for
# every row repeating an earlier row (the first occurrence stays False).
duplicates = df.duplicated()
if duplicates.sum() > 0:
    print("Duplicate rows found:")
    print(df.loc[duplicates])

Duplicate rows found:
     Duration  Pulse  Maxpulse  Calories
36         60    102       127     300.0
37         60    100       120     300.0
38         60    100       120     300.0
40         45     90       112     180.1
71         60    109       153     387.6
113        45    100       120     225.3
155        60    111       151     368.5


In [None]:
"""
CLEANING DATA:
1. Fill empty values with mean or median
2. Convert wrong formats to correct formats
3. Remove or correct wrong data
4. Remove duplicate rows
5. Handle outliers
"""

In [None]:
# Fill empty values with the column mean.
#
# The filled column is assigned back to df instead of calling
# df[col].fillna(..., inplace=True): the in-place call operates on the object
# returned by df[col] (chained assignment), which raises a FutureWarning on
# pandas 2.x and does not update df at all once Copy-on-Write is active.
for col in ['Duration', 'Pulse', 'Maxpulse', 'Calories']:
    if df[col].isnull().any():
        # mean() skips missing cells, so it is computed from the known values only.
        mean_value = df[col].mean()
        df[col] = df[col].fillna(mean_value)


# Verify the result: every column should now report 0 empty cells.
print(df.isnull().sum())

Duration    0
Pulse       0
Maxpulse    0
Calories    0
dtype: int64


In [10]:
# EDA (Exploratory Data Analysis) starts here.
# df.shape is a (number of rows, number of columns) tuple.
rows_and_columns = df.shape
print("Data shape after cleaning:", rows_and_columns)

Data shape after cleaning: (169, 4)


In [None]:
# Get descriptive statistics.
# describe() gives us count, mean, std, min, 25%, 50%, 75% and max per column.
#
# The label is printed on its own line: passing the label and the table to a
# single print() call puts the label in front of the table's header row and
# pushes the column names out of alignment with their values.
print("Descriptive statistics:")
print(df.describe())