# In [None]:
import pandas as pd
import numpy as np

def identify_outlier(df, thresholds=None):
    """Flag rows whose values exceed per-column upper limits.

    A row is marked as an outlier when at least one of the checked columns
    holds a value strictly greater than its limit. Missing values (NaN)
    never exceed a limit and therefore never flag a row.

    Args:
        df: DataFrame containing every column named in ``thresholds``.
            It is modified in place: an ``'outlier'`` column is added (or
            overwritten).
        thresholds: Optional mapping of column name to upper limit. When
            omitted, the limits are ``engine-size`` > 190,
            ``curb-weight`` > 3500 and ``city-mpg`` > 40.

    Returns:
        The same ``df`` object, with the float column ``'outlier'`` set to
        1.0 for flagged rows and 0.0 otherwise.

    Raises:
        KeyError: If a column named in ``thresholds`` is absent from ``df``.
    """
    if thresholds is None:
        thresholds = {'engine-size': 190, 'curb-weight': 3500, 'city-mpg': 40}

    # One vectorized comparison per column instead of a Python-level loop
    # over every row.
    flagged = np.zeros(df.shape[0], dtype=bool)
    for column, limit in thresholds.items():
        flagged |= (df[column] > limit).to_numpy(dtype=bool)

    # Assign a plain array (not a Series) so the flags are placed by row
    # position regardless of the frame's index labels; the float dtype keeps
    # the 0.0 / 1.0 encoding that callers compare against.
    df['outlier'] = flagged.astype(float)
    return df

def clean_auto(fileName="Automobile_price_data_Raw.csv", outliers=True,
               duplicate_subset=None):
    """Load the automobile price data, clean it and save it as a CSV file.

    Cleaning steps, in order:

    1. Coerce ``price``, ``bore``, ``stroke``, ``horsepower`` and
       ``peak-rpm`` to numeric; unparseable entries become NaN.
    2. Treat any remaining ``'?'`` cell as missing.
    3. Drop rows in which every column is missing.
    4. Fill missing values in numeric columns with the column mean.
    5. Forward-fill missing values in non-numeric columns.
    6. Optionally drop rows duplicated on ``duplicate_subset``.
    7. Add ``lnprice`` (natural log of ``price``) and ``num-cylinders``
       (cylinder count grouped into three levels).
    8. Optionally remove rows flagged by ``identify_outlier``.

    Args:
        fileName: Path (or anything ``pandas.read_csv`` accepts) of the raw
            data.
        outliers: When true, rows flagged by ``identify_outlier`` are
            removed from the result.
        duplicate_subset: Optional column label or list of labels. When
            given, rows that repeat an earlier row on these columns are
            dropped. The default ``None`` skips de-duplication.

    Returns:
        The cleaned DataFrame. As a side effect the same data is written to
        ``cleaned_autoprice_data.csv`` in the current working directory.
    """
    auto_price = pd.read_csv(fileName)

    # Convert selected columns to numeric values; non-numeric placeholders
    # such as '?' are coerced to NaN.
    cols = ['price', 'bore', 'stroke', 'horsepower', 'peak-rpm']
    auto_price[cols] = auto_price[cols].apply(pd.to_numeric, errors='coerce')

    # Replace '?' with NaN in every other column.
    auto_price.replace('?', np.nan, inplace=True)

    # Step 1: remove a row only if ALL of its values are missing. Rows that
    # are merely incomplete are kept so they can be imputed below.
    auto_price.dropna(how='all', inplace=True)

    # Step 2: replace missing numeric values with their column average.
    # fillna with a Series fills each column from the entry of the same
    # name and leaves columns without an entry untouched.
    numeric_cols = auto_price.select_dtypes(include='number').columns
    auto_price = auto_price.fillna(auto_price[numeric_cols].mean())

    # Step 3: replace missing categorical values using forward fill. A
    # missing value in the very first row has nothing before it and stays
    # NaN.
    categorical_cols = auto_price.select_dtypes(exclude='number').columns
    if len(categorical_cols) > 0:
        auto_price[categorical_cols] = auto_price[categorical_cols].ffill()

    # Step 4: remove duplicates on the columns chosen by the caller.
    # NOTE(review): the columns to de-duplicate on are not specified
    # anywhere in this file, so the step is opt-in.
    if duplicate_subset is not None:
        # .copy() detaches the result so later column assignments do not
        # raise SettingWithCopyWarning.
        auto_price = auto_price.drop_duplicates(subset=duplicate_subset).copy()

    # Compute log of the auto price.
    auto_price['lnprice'] = np.log(auto_price['price'])

    def _cylinder_level(value):
        # Group the cylinder count into three levels; missing stays missing.
        if value in ['two', 'three', 'four']:
            return 'four-or-less'
        if value in ['five', 'six']:
            return 'five-six'
        return 'eight-twelve' if pd.notna(value) else np.nan

    auto_price['num-cylinders'] = auto_price['num-of-cylinders'].map(_cylinder_level)

    # Identify and remove outliers if requested.
    if outliers:
        auto_price = identify_outlier(auto_price)  # Mark outliers
        if 'outlier' in auto_price.columns:
            # Keep the non-outlier rows and discard the helper column. A
            # non-inplace drop avoids mutating a filtered slice.
            auto_price = auto_price[auto_price['outlier'] == 0].drop('outlier', axis=1)

    # Save cleaned data.
    auto_price.to_csv('cleaned_autoprice_data.csv', index=False)
    return auto_price
