# Data Cleaning
## Objectives
* Clean the dataset by handling missing values, transforming 'Year' to 'Age', and renaming columns for consistency.
* Ensure the dataset is ready for feature engineering.

## Outputs
* Cleaned dataset saved as 'cleaned_car_data.csv' in the datasets/ folder.

## Additional Comments
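* Check every column for missing values (numerical columns are filled with the median, categorical columns with the mode) and rename columns to consistent, descriptive names.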
* Transform 'Year' to 'Age', computed as `Age = 2025 - Year` (the reference year is hard-coded to 2025); the 'Year' column is dropped afterwards.

In [None]:
import pandas as pd
import numpy as np

# Load the raw dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../datasets/car_data.csv')

# Display basic info
print('Dataset Info:')
df.info()

# Check for missing values
print('\nMissing Values:')
print(df.isna().sum())

# Handle missing values (if any).
# Numerical columns are filled with their median, categorical columns with
# their mode. The filled Series is assigned back to the column rather than
# calling `df[col].fillna(..., inplace=True)`: that form operates on a
# temporary Series, emits a FutureWarning on pandas 2.x, and leaves `df`
# unchanged when Copy-on-Write is enabled.
numeric_cols = ['Selling_Price', 'Present_Price', 'Kms_Driven', 'Owner']
categorical_cols = ['Fuel_Type', 'Seller_Type', 'Transmission']

for col in numeric_cols:
    if df[col].isna().any():
        df[col] = df[col].fillna(df[col].median())

for col in categorical_cols:
    if df[col].isna().any():
        # mode() returns a Series of the most frequent value(s); take the first
        df[col] = df[col].fillna(df[col].mode()[0])

# Transform 'Year' to 'Age' relative to a fixed reference year, then drop
# the original column since 'Age' replaces it.
current_year = 2025
df['Age'] = current_year - df['Year']
df = df.drop(columns='Year')

# Rename columns for consistency ('Kms_Driven' already has its final name,
# so it needs no entry here).
df = df.rename(columns={
    'Selling_Price': 'Selling_Price(lacs)',
    'Present_Price': 'Present_Price(lacs)',
    'Owner': 'Past_Owners',
})

# Save cleaned dataset (without the index column)
df.to_csv('../datasets/cleaned_car_data.csv', index=False)

# Display first few rows
df.head()