In [None]:
# function for data cleaning from exercise 3
import pandas as pd

def clean_dataset(df):
    """Clean and standardize a customer DataFrame in place and return it.

    Steps, in order:
      1. Lowercase the column names and replace spaces with underscores.
      2. Rename the second column to ``state`` (whatever its current name).
      3. Normalize ``gender`` to ``'F'`` / ``'M'`` and map the education
         value ``'Bachelors'`` to ``'Bachelor'``.
      4. Strip ``%`` from ``customer_lifetime_value`` and convert it to
         numbers (only when the column exists and is not numeric yet).
      5. Collapse the sports/luxury vehicle classes into ``'Luxury'``.
      6. Convert ``number_of_open_complaints`` to nullable ``Int64``;
         values that cannot be parsed become missing.
      7. Fill missing numeric values with the rounded column mean and
         missing categorical values with the column mode.
      8. Print the per-row duplicate flags.

    Columns listed for step 7 that are absent are skipped, and a column with
    no observed values at all is left unfilled, because there is no mean or
    mode to fill it with.

    Args:
        df: DataFrame to clean. It is modified in place, so pass a copy to
            keep the original. It needs at least two columns and, after the
            header normalization, the columns ``gender``, ``education``,
            ``vehicle_class`` and ``number_of_open_complaints``.

    Returns:
        The same DataFrame object, cleaned.

    Raises:
        IndexError: If ``df`` has fewer than two columns.
        KeyError: If one of the required columns is missing.
    """
    # Normalize headers, e.g. 'Vehicle Class' -> 'vehicle_class'.
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    # Rename the second column to 'state'.
    df.rename(columns={df.columns[1]: 'state'}, inplace=True)

    # Lowercase first so every spelling variant collapses onto one key.
    gender = df['gender'].str.lower()
    gender = gender.replace(['f', 'femal', 'female'], 'F')
    df['gender'] = gender.replace(['m', 'male'], 'M')
    df['education'] = df['education'].replace('Bachelors', 'Bachelor')

    # Text values carry a trailing '%'; strip it before converting. Testing
    # for "not numeric" covers both object and dedicated string dtypes.
    clv = 'customer_lifetime_value'
    if clv in df.columns and not pd.api.types.is_numeric_dtype(df[clv]):
        stripped = df[clv].str.replace('%', '', regex=False)
        df[clv] = pd.to_numeric(stripped, errors='coerce')

    vehicle_mapping = {'Sports Car': 'Luxury', 'Luxury SUV': 'Luxury', 'Luxury Car': 'Luxury'}
    df['vehicle_class'] = df['vehicle_class'].replace(vehicle_mapping)
    df['number_of_open_complaints'] = pd.to_numeric(
        df['number_of_open_complaints'], errors='coerce'
    ).astype('Int64')

    # Fill missing values in numeric columns with the rounded mean.
    numeric_columns = ['customer_lifetime_value', 'income', 'monthly_premium_auto',
                       'number_of_open_complaints', 'total_claim_amount']
    for col in numeric_columns:
        if col not in df.columns:
            continue
        mean_value = df[col].mean()
        if pd.isna(mean_value):
            # No observed values: round() would raise on NaN/NA.
            continue
        df[col] = df[col].fillna(round(mean_value))

    # Fill missing values in categorical columns with the mode.
    categorical_columns = ['customer', 'state', 'gender', 'education', 'policy_type', 'vehicle_class']
    for col in categorical_columns:
        if col not in df.columns:
            continue
        modes = df[col].mode()
        if modes.empty:
            # All values are missing, so there is nothing to fill with.
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    # Report which rows are exact duplicates of an earlier row.
    # NOTE: nothing in this function removes duplicate rows; this only
    # prints the boolean flags.
    duplicates_after_handling = df.duplicated()
    print("\nDuplicates after handling:")
    print(duplicates_after_handling)

    return df

# Expects an existing DataFrame named 'df'; work on a deep copy so that
# the original DataFrame stays unchanged
your_data = df.copy(deep=True)

# Run the cleaning function on the copy
cleaned_data = clean_dataset(df=your_data)

# Preview the first five rows of the cleaned data
cleaned_data.head(n=5)