In [1]:
# Write a fixed greeting to standard output.
print("Hello World")

Hello World


In [16]:
import pandas as pd
import numpy as np
import re
import os
from collections import Counter

# Matches text that is a plain non-negative number once non-digit noise is stripped.
_PLAIN_NUMBER = re.compile(r'^[0-9]+(\.[0-9]+)?$')


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is zero."""
    return part / whole * 100 if whole else 0.0


def _report_text_column(non_missing_values):
    """Print formatting inconsistencies found in a non-numeric column.

    Values are cast to ``str`` first so that columns holding a mix of strings
    and other Python objects never produce NaN entries in the boolean masks
    used for indexing below.
    """
    if non_missing_values.empty:
        return

    text = non_missing_values.astype(str)

    # Fewer distinct values after lower-casing/stripping means some entries
    # differ only by case or surrounding whitespace.
    if text.str.lower().str.strip().nunique() < text.nunique():
        print("Inconsistency Detected: Variations in casing or spacing found.")

    has_comma = text.str.contains(',', regex=False)
    if has_comma.any():
        print("Inconsistency Detected: Special characters (e.g., commas) present.")
        examples = text[has_comma].sample(min(5, int(has_comma.sum()))).tolist()
        print("Examples with special characters:")
        print(examples)

    # A value "looks numeric" when stripping everything except digits and dots
    # leaves a well-formed number.
    looks_numeric = text.str.replace(r'[^0-9.]', '', regex=True).str.match(_PLAIN_NUMBER.pattern).any()
    if looks_numeric:
        # Only report when at least one single value mixes digits and letters;
        # otherwise the finding would be printed with an empty example list.
        digits_and_letters = text.str.contains(r'[0-9]') & text.str.contains(r'[a-zA-Z]')
        if digits_and_letters.any():
            print("Inconsistency Detected: Numeric values with text suffixes/prefixes.")
            examples = text[digits_and_letters].sample(min(5, int(digits_and_letters.sum()))).tolist()
            print("Examples:")
            print(examples)


def _report_numeric_column(non_missing_values):
    """Print basic statistics and a 1.5*IQR outlier count for a numeric column."""
    if non_missing_values.empty:
        return

    print(f"Min: {non_missing_values.min()}")
    print(f"Max: {non_missing_values.max()}")
    print(f"Mean: {non_missing_values.mean()}")
    print(f"Median: {non_missing_values.median()}")

    q1 = non_missing_values.quantile(0.25)
    q3 = non_missing_values.quantile(0.75)
    iqr = q3 - q1
    outliers = ((non_missing_values < (q1 - 1.5 * iqr)) | (non_missing_values > (q3 + 1.5 * iqr))).sum()
    if outliers > 0:
        print(f"Potential Outliers: {outliers} ({outliers/len(non_missing_values)*100:.2f}%)")


def _is_numeric_like(value):
    """Return True for real numbers and for strings that reduce to a plain number."""
    if isinstance(value, (int, float, np.integer, np.floating)):
        return True
    return isinstance(value, str) and bool(_PLAIN_NUMBER.match(re.sub(r'[^0-9.]', '', value)))


def analyze_dataset(file_path):
    """
    Load a CSV file and print a data-quality report for it.

    The report covers shape and memory usage, missing values, column dtypes,
    a per-column summary (formatting inconsistencies for text columns,
    statistics and IQR outliers for numeric columns), and a set of targeted
    checks that only run when the relevant columns exist: mixed value kinds in
    'Condition', the 'Make'/'Model' relationship, rows 728-730, and literal
    'Nan' strings in text columns.

    Parameters:
        file_path: Path of the CSV file, passed to ``pd.read_csv``.

    Returns:
        The DataFrame exactly as loaded; the analysis does not modify it.

    Raises:
        Whatever ``pd.read_csv`` raises for an unreadable or malformed file.
    """
    print(f"Analyzing dataset: {file_path}")

    # Load the dataset
    df = pd.read_csv(file_path)
    total_rows = len(df)

    # Basic dataset information
    print("\n=== Basic Dataset Information ===")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Total Rows: {total_rows}")
    print(f"Memory Usage: {df.memory_usage().sum() / 1024**2:.2f} MB")

    # Missing values summary
    missing_values = df.isna().sum()
    missing_percent = (missing_values / total_rows) * 100
    missing_summary = pd.DataFrame({
        'Missing Values': missing_values,
        'Missing (%)': missing_percent.round(2)
    }).sort_values('Missing Values', ascending=False)

    print("\n=== Missing Values Summary ===")
    print(missing_summary)

    # Rows with missing values
    rows_with_missing = df.isna().any(axis=1).sum()
    print(f"\nRows with at least one missing value: {rows_with_missing} ({_percent(rows_with_missing, total_rows):.2f}%)")

    # Column type analysis
    print("\n=== Column Data Types ===")
    print(df.dtypes)

    # Detailed column analysis
    print("\n=== Detailed Column Analysis ===")

    for col in df.columns:
        column = df[col]
        missing_count = column.isna().sum()

        print(f"\n--- Analyzing Column: '{col}' ---")
        print(f"Total Rows: {total_rows}")
        print(f"Missing Values: {missing_count} ({_percent(missing_count, total_rows):.2f}%)")

        non_missing_values = column.dropna()
        unique_values = non_missing_values.nunique()
        print(f"Unique Values: {unique_values}")
        print(f"Data Type: {column.dtype}")

        if unique_values > 0:
            sample_size = min(10, unique_values)
            print(f"Sample of Unique Values ({sample_size} shown):")
            print(non_missing_values.unique()[:sample_size])

        # Dispatch on "is this really numeric" rather than "is this object":
        # boolean and dedicated string dtypes must not reach the statistics
        # branch, where mean/quantile would fail on them.
        is_plain_numeric = (pd.api.types.is_numeric_dtype(column)
                            and not pd.api.types.is_bool_dtype(column))
        if is_plain_numeric:
            _report_numeric_column(non_missing_values)
        else:
            _report_text_column(non_missing_values)

    # Check for specific problem areas based on errors
    print("\n=== Specific Data Quality Issues ===")

    # Check condition column for mixed types
    if 'Condition' in df.columns:
        condition_values = df['Condition'].dropna().unique()
        print("\nCondition column values:")
        print(condition_values)

        numeric_values = [v for v in condition_values if _is_numeric_like(v)]
        non_numeric_values = [v for v in condition_values if not _is_numeric_like(v)]

        if numeric_values and non_numeric_values:
            print("Issue Detected: Condition column has mixed numeric and non-numeric values")
            print(f"Numeric values: {numeric_values[:5]}")
            print(f"Non-numeric values: {non_numeric_values[:5]}")

    # Check the Make-Model relationship in the dataset
    if 'Make' in df.columns and 'Model' in df.columns:
        make_model_counts = df.groupby(['Make', 'Model']).size().reset_index(name='count')
        make_model_counts = make_model_counts.sort_values('count', ascending=False)

        print("\nMake-Model frequency distribution:")
        print(f"Total unique Make-Model combinations: {len(make_model_counts)}")
        print("Top 10 Make-Model combinations:")
        print(make_model_counts.head(10))

        print(f"\nRare Make-Model combinations (fewer than 5 entries): {len(make_model_counts[make_model_counts['count'] < 5])}")

        # Look for models appearing under multiple makes. One grouped pass
        # replaces a full-frame filter per model; sort=False keeps models in
        # order of first appearance.
        if len(make_model_counts) > 0:
            makes_per_model = df.groupby('Model', sort=False)['Make'].nunique()
            shared_model_names = makes_per_model[makes_per_model > 1].index

            if len(shared_model_names) > 0:
                print("\nPotential Data Quality Issue: Same model name appearing under multiple makes")
                for model in shared_model_names[:5]:
                    makes = list(df.loc[df['Model'] == model, 'Make'].dropna().unique())
                    print(f"Model '{model}' appears under makes: {makes}")

    # Check rows 728-730 specifically (as mentioned in error); only the
    # columns that actually exist are shown so other datasets do not crash.
    if total_rows > 730:
        available = [name for name in ('Make', 'Model') if name in df.columns]
        if available:
            print("\n--- Specific Check: Rows 728–730 for 'Make' and 'Model' ---")
            print(df.iloc[728:731][available])

    # Check for 'Nan' strings vs actual NaN values
    nan_strings = []
    for col in df.columns:
        column = df[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            nan_string_count = (column == 'Nan').sum()
            if nan_string_count > 0:
                nan_strings.append((col, nan_string_count))

    if nan_strings:
        print("\nColumns with 'Nan' string values (not actual NaN):")
        for col, count in nan_strings:
            print(f"- {col}: {count} rows")

    return df

# Usage
if __name__ == "__main__":
    # Hard-coded location of the CSV to analyze; edit this for your machine.
    file_path = r"C:\Users\Ricky\Desktop\For Fun Projects\SBT-Japan\datasets\merged_used_cars_cleaned.csv"
    
    # Run the full report; the loaded DataFrame is returned for the summary below.
    df = analyze_dataset(file_path)
    
    # Closing summary: overall size and how many rows have any missing value.
    print("\n=== Overall Dataset Diversity ===")
    print(f"Total Rows: {len(df)}")
    print(f"Total Columns: {len(df.columns)}")
    rows_with_missing = df.isna().any(axis=1).sum()
    print(f"Rows with at least one missing value: {rows_with_missing}")
    print(f"Percentage of rows with missing values: {rows_with_missing/len(df)*100:.2f}%")

Analyzing dataset: C:\Users\Ricky\Desktop\For Fun Projects\SBT-Japan\datasets\merged_used_cars_cleaned.csv

=== Basic Dataset Information ===
Shape: (13467, 15)
Columns: ['Make', 'Model', 'Year', 'Mileage', 'Engine size', 'Fuel type', 'Transmission', 'Condition', 'Drive type', 'Horsepower', 'Torque', 'Acceleration', 'Body type', 'Seats', 'Price']
Total Rows: 13467
Memory Usage: 1.54 MB

=== Missing Values Summary ===
              Missing Values  Missing (%)
Acceleration           10687        79.36
Horsepower             10680        79.30
Torque                 10678        79.29
Condition               9653        71.68
Seats                   4872        36.18
Year                    2414        17.93
Drive type               946         7.02
Body type                728         5.41
Price                    622         4.62
Mileage                  551         4.09
Engine size              123         0.91
Transmission             121         0.90
Fuel type                 15     

In [18]:
import pandas as pd
import numpy as np
import re
import os
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

def clean_used_cars_dataset(file_path, output_suffix="_fully_cleaned"):
    """
    Comprehensive cleaning script for used cars dataset with improved handling for:
    - Mixed data types in the Condition column
    - Make extraction from Model when Make is 'Nan'
    - Standardization of categorical columns
    - Numeric cleaning with better error handling
    - Careful imputation to avoid warnings
    """
    print(f"Loading data from: {file_path}")
    df = pd.read_csv(file_path)
    original_shape = df.shape
    print(f"Original dataset shape: {original_shape}")
    
    # Generate output path in the same directory
    file_dir = os.path.dirname(file_path)
    file_name = os.path.basename(file_path)
    base_name, ext = os.path.splitext(file_name)
    output_path = os.path.join(file_dir, f"{base_name}{output_suffix}{ext}")
    
    # =================== STEP 1: CLEAN MAKE AND MODEL ===================
    print("\n=== Cleaning Make and Model ===")
    
    # Replace string 'Nan' with np.nan in 'Make'
    nan_make_count = (df['Make'] == 'Nan').sum()
    print(f"Found {nan_make_count} rows with 'Nan' string in Make column")
    df['Make'] = df['Make'].replace('Nan', np.nan)
    
    # Create canonical list of known makes from non-missing rows
    known_makes = [
        "Toyota", "Mercedes", "BMW", "Land Rover", "Range Rover", "Alfa Romeo", "Nissan", "Honda",
        "Porsche", "Audi", "Lexus", "Maserati", "Jaguar", "Bentley", "Ferrari", "AMG", "Volvo",
        "Isuzu", "Mitsubishi", "Suzuki", "Subaru", "Volkswagen", "Hino", "Ford", "Lotus", "Aston Martin",
        "Hyundai", "Lamborghini", "Mazda", "Peugeot", "Volks", "Cadillac", "Chevrolet", "Daihatsu",
        "Kia", "Mini", "Renault", "Rover", "Jeep", "Citroen", "Datsun", "Fiat", "Leyland", "Tesla"
    ]
    
    # Standardize known makes for matching (proper casing)
    known_makes = [make.title() for make in known_makes]
    
    # Function to extract Make from Model with improved handling
    def extract_make_from_model(model):
        """Split a leading manufacturer name off a model string.

        Returns a ``(make, model)`` tuple. When the input is missing, not a
        string, or blank, the make is NaN and the input is returned untouched.
        Otherwise the stripped model is checked, in order, for a 'toyota '
        prefix, for any entry of the enclosing ``known_makes`` list (as a
        prefix followed by a space, or as the whole string), and finally for
        the phrase 'land cruiser', which is attributed to Toyota without
        altering the model text. If nothing matches, the make is NaN and the
        stripped model is returned.
        """
        unusable = pd.isna(model) or not isinstance(model, str) or model.strip() == ''
        if unusable:
            return np.nan, model

        trimmed = str(model).strip()
        lowered = trimmed.lower()

        # Dedicated shortcut for the 'Toyota <model>' pattern seen in rows 728-730.
        toyota_prefix = 'toyota '
        if lowered.startswith(toyota_prefix):
            return 'Toyota', trimmed[len(toyota_prefix):].strip()

        # General case: the first known make that matches wins.
        for candidate in known_makes:
            candidate_lower = candidate.lower()
            if lowered == candidate_lower:
                return candidate, ""
            if lowered.startswith(candidate_lower + ' '):
                remainder = trimmed[len(candidate) + 1:]
                return candidate, remainder.strip()

        # 'Land Cruiser' is a model name, so the text is kept as-is.
        mentions_land_cruiser = 'land cruiser' in lowered
        if mentions_land_cruiser and not lowered.startswith('land rover'):
            return 'Toyota', trimmed

        # No make could be determined.
        return np.nan, trimmed
    
    # Apply extraction where 'Make' is NaN
    make_extraction_count = df['Make'].isna().sum()
    print(f"Extracting Make from Model for {make_extraction_count} rows")
    
    # Create a mask for rows that need Make extraction
    make_extraction_mask = df['Make'].isna()
    
    if make_extraction_mask.sum() > 0:
        # Apply the extraction function
        extracted_data = df.loc[make_extraction_mask, 'Model'].apply(extract_make_from_model)
        
        # Split the result into Make and updated Model
        extracted_makes, updated_models = zip(*extracted_data)
        
        # Update the DataFrame
        df.loc[make_extraction_mask, 'Make'] = extracted_makes
        df.loc[make_extraction_mask, 'Model'] = updated_models
    
    # =================== STEP 2: CLEAN AND STANDARDIZE COLUMNS ===================
    print("\n=== Cleaning Numerical and Categorical Columns ===")
    
    # Clean Year column to ensure it's numeric
    if 'Year' in df.columns:
        df['Year'] = pd.to_numeric(df['Year'].str.extract(r'(\d{4})')[0], errors='coerce')
    
    # Clean Engine size column - handle both cc and L formats
    def clean_engine_size(value):
        """Parse an engine-size entry into a float.

        Missing or blank input yields NaN. Otherwise every character except
        digits and '.' is removed and the remainder is parsed as a float. If
        the lower-cased text contains 'l' but not 'cc' and the number is below
        10, it is treated as liters and multiplied by 1000. Text that cannot
        be parsed prints an error message and yields NaN.
        """
        blank_text = isinstance(value, str) and value.strip() == ''
        if pd.isna(value) or blank_text:
            return np.nan

        try:
            text = str(value).lower().strip()
            # Decide on the unit before the letters are stripped away.
            in_liters = ('l' in text) and ('cc' not in text)

            digits_only = re.sub(r'[^\d.]', '', text)
            if digits_only == '':
                return np.nan

            amount = float(digits_only)
            # Small values flagged as liters are scaled by 1000.
            if in_liters and amount < 10:
                amount = amount * 1000
            return amount
        except (ValueError, TypeError) as e:
            print(f"Error cleaning engine size '{value}': {e}")
            return np.nan
    
    df['Engine size'] = df['Engine size'].apply(clean_engine_size)
    
    # Clean Mileage column - standardize to kilometers without text
    def clean_mileage(value):
        """Parse a mileage entry into a float.

        Missing or blank input yields NaN. Otherwise every character except
        digits and '.' is removed and the remainder is parsed as a float; no
        unit conversion is performed. Text that cannot be parsed prints an
        error message and yields NaN.
        """
        blank_text = isinstance(value, str) and value.strip() == ''
        if pd.isna(value) or blank_text:
            return np.nan

        try:
            text = str(value).lower().strip()
            digits_only = re.sub(r'[^\d.]', '', text)
            return float(digits_only) if digits_only else np.nan
        except (ValueError, TypeError) as e:
            print(f"Error cleaning mileage '{value}': {e}")
            return np.nan
    
    df['Mileage'] = df['Mileage'].apply(clean_mileage)
    
    # Clean and standardize Price column
    def clean_price(value):
        """Parse a price entry into a float.

        Missing input, blank strings and the literal 'ASK' yield NaN.
        Otherwise every character except digits and '.' is removed and the
        remainder is parsed as a float. Text that cannot be parsed prints an
        error message and yields NaN.
        """
        if pd.isna(value):
            return np.nan
        if isinstance(value, str) and value.strip() == '':
            return np.nan
        if value == 'ASK':
            return np.nan

        try:
            # Currency symbols, separators and letters are all discarded.
            digits_only = re.sub(r'[^\d.]', '', str(value))
            return float(digits_only) if digits_only else np.nan
        except (ValueError, TypeError) as e:
            print(f"Error cleaning price '{value}': {e}")
            return np.nan
    
    df['Price'] = df['Price'].apply(clean_price)
    
    # Clean and standardize fuel type with proper categorization
    def standardize_fuel_type(value):
        """Map a fuel-type entry onto a small set of lower-case labels.

        Missing or blank input yields NaN. Text mentioning 'petrol' or
        'diesel' becomes that fuel, with ' hybrid' appended when 'hybrid' also
        appears; petrol is checked before diesel. Remaining text mentioning
        'hybrid' or 'electric' becomes that label. Anything else is returned
        lower-cased and stripped.
        """
        blank_text = isinstance(value, str) and value.strip() == ''
        if pd.isna(value) or blank_text:
            return np.nan

        label = str(value).lower().strip()
        is_hybrid = 'hybrid' in label

        # 'petroleum' contains 'petrol', so one substring test covers both.
        for base_fuel in ('petrol', 'diesel'):
            if base_fuel in label:
                return f"{base_fuel} hybrid" if is_hybrid else base_fuel

        if is_hybrid:
            return 'hybrid'
        if 'electric' in label:
            return 'electric'
        return label
    
    df['Fuel type'] = df['Fuel type'].apply(standardize_fuel_type)
    
    # Clean and standardize transmission
    def standardize_transmission(value):
        """Map a transmission entry onto 'automatic' or 'manual' where possible.

        Missing or blank input yields NaN. Exact 'automatic'/'at' becomes
        'automatic'; exact 'manual', or any text containing 'mt' (such as
        '5MT'), becomes 'manual'; other text containing 'automatic' becomes
        'automatic'. Anything else is returned lower-cased and stripped.
        """
        blank_text = isinstance(value, str) and value.strip() == ''
        if pd.isna(value) or blank_text:
            return np.nan

        code = str(value).lower().strip()

        if code in ('automatic', 'at'):
            return 'automatic'
        # The exact 'mt' case is subsumed by the substring test.
        if code == 'manual' or 'mt' in code:
            return 'manual'
        if 'automatic' in code:
            return 'automatic'
        return code
    
    df['Transmission'] = df['Transmission'].apply(standardize_transmission)
    
    # Clean and standardize Drive type
    def standardize_drive_type(value):
        """Map a drive-type entry onto 'awd', 'fwd' or 'rwd' where possible.

        Missing or blank input yields NaN. Recognised abbreviations are
        translated through a fixed alias table; any other text is returned
        lower-cased and stripped.
        """
        blank_text = isinstance(value, str) and value.strip() == ''
        if pd.isna(value) or blank_text:
            return np.nan

        aliases = {
            'awd': 'awd', '4wd': 'awd', '4x4': 'awd',
            '2wd': 'fwd', 'fwd': 'fwd', 'ff': 'fwd',
            'rwd': 'rwd', 'fr': 'rwd', 'rr': 'rwd',
        }
        code = str(value).lower().strip()
        return aliases.get(code, code)
    
    df['Drive type'] = df['Drive type'].apply(standardize_drive_type)
    
    # Clean and standardize Body type
    def standardize_body_type(value):
        """Map a body-type entry onto a small set of lower-case categories.

        Missing or blank input yields NaN. Exact synonyms (saloon/sedan,
        wagon/estate, minivan/van) are translated directly; otherwise the
        first of 'suv', 'hatchback', 'pickup', 'coupe' found inside the text
        is returned. Anything else is returned lower-cased and stripped.
        """
        blank_text = isinstance(value, str) and value.strip() == ''
        if pd.isna(value) or blank_text:
            return np.nan

        label = str(value).lower().strip()

        # None of these exact synonyms contains a keyword from the scan below,
        # so checking them first cannot change which rule applies.
        exact_synonyms = {
            'saloon': 'sedan', 'sedan': 'sedan',
            'wagon': 'wagon', 'estate': 'wagon',
            'minivan': 'van', 'van': 'van',
        }
        if label in exact_synonyms:
            return exact_synonyms[label]

        for keyword in ('suv', 'hatchback', 'pickup', 'coupe'):
            if keyword in label:
                return keyword
        return label
    
    df['Body type'] = df['Body type'].apply(standardize_body_type)
    
    # Clean Condition column - handle mixed data types
    def clean_condition(value):
        """Convert a raw condition entry into a numeric rating.

        Missing or blank input yields NaN. If the text contains a number, the
        first one found is returned as a float. Otherwise the lower-cased text
        is scanned for known condition phrases and the rating of the first
        phrase found is returned. Status markers (sold, unsold, foreign used,
        locally used, ready for import) carry no rating and yield NaN, as does
        any text that matches nothing.
        """
        if pd.isna(value) or (isinstance(value, str) and value.strip() == ''):
            return np.nan

        value_str = str(value).lower().strip()

        # An explicit numeric grade takes precedence over any wording.
        numeric_match = re.search(r'(\d+(?:\.\d+)?)', value_str)
        if numeric_match:
            return float(numeric_match.group(1))

        # Matching is by substring and the first hit wins, so a phrase must be
        # listed before any shorter phrase it contains: 'very good' before
        # 'good', and 'below average' before 'average'. With the opposite
        # order, 'below average' would be scored as 'average'.
        phrase_ratings = (
            ('excellent', 5.0),
            ('very good', 4.5),
            ('good', 4.0),
            ('below average', 2.0),
            ('average', 3.0),
            ('poor', 1.0),
            ('new', 5.0),
            ('unsold', np.nan),            # Not condition-related
            ('sold', np.nan),              # Not condition-related
            ('foreign used', np.nan),      # Not condition-related
            ('locally used', np.nan),      # Not condition-related
            ('ready for import', np.nan),  # Not condition-related
        )

        for phrase, rating in phrase_ratings:
            if phrase in value_str:
                return rating

        # Failed to parse condition
        return np.nan
    
    # Create a new column for numeric condition and keep original
    df['Condition_Original'] = df['Condition']
    df['Condition'] = df['Condition'].apply(clean_condition)
    
    # Clean Seats column to extract just the number
    def clean_seats(value):
        """Return the first run of digits in a seats entry as an int.

        Missing or blank input, and text containing no digits, yield NaN.
        """
        blank_text = isinstance(value, str) and value.strip() == ''
        if pd.isna(value) or blank_text:
            return np.nan

        first_number = re.search(r'(\d+)', str(value))
        if first_number is None:
            return np.nan
        return int(first_number.group(1))
    
    df['Seats'] = df['Seats'].apply(clean_seats)
    
    # Clean other numerical columns with better error handling
    def safe_numeric_clean(value):
        """Coerce a value to float, tolerating text around the number.

        Missing input yields NaN. ``int``/``float`` values are converted
        directly. Anything else is stringified, stripped of every character
        except digits and '.', and parsed; empty or unparseable remainders
        yield NaN without printing anything.
        """
        if pd.isna(value):
            return np.nan
        if isinstance(value, (int, float)):
            return float(value)

        try:
            digits_only = re.sub(r'[^\d.]', '', str(value).strip())
            return float(digits_only) if digits_only else np.nan
        except (ValueError, TypeError):
            return np.nan
    
    # Apply to remaining numeric columns
    for col in ['Horsepower', 'Torque', 'Acceleration']:
        if col in df.columns:
            df[col] = df[col].apply(safe_numeric_clean)
    
    # =================== STEP 3: IDENTIFY AND HANDLE OUTLIERS ===================
    print("\n=== Checking for Outliers ===")
    
    # Check for outliers in numerical columns
    numeric_cols = ['Engine size', 'Mileage', 'Price', 'Horsepower', 'Torque', 'Acceleration']
    numeric_cols = [col for col in numeric_cols if col in df.columns]
    
    for col in numeric_cols:
        if df[col].count() > 0:  # Skip empty columns
            q1 = df[col].quantile(0.25)
            q3 = df[col].quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 3 * iqr  # More permissive lower bound (3*IQR)
            upper_bound = q3 + 3 * iqr  # More permissive upper bound (3*IQR)
            
            # Flag outliers
            outliers_mask = (df[col] < lower_bound) | (df[col] > upper_bound)
            outlier_count = outliers_mask.sum()
            
            if outlier_count > 0:
                print(f"Found {outlier_count} outliers in {col} ({outlier_count/df[col].count()*100:.2f}%)")
                print(f"  Range: [{df[col].min()}, {df[col].max()}], IQR bounds: [{lower_bound}, {upper_bound}]")
                
                # Create reasonable caps based on column
                if col == 'Engine size':
                    # Cap engine size between 500cc and 10000cc
                    df.loc[df[col] < 500, col] = np.nan
                    df.loc[df[col] > 10000, col] = np.nan
                elif col == 'Horsepower':
                    # Cap horsepower at reasonable values (5-2000hp)
                    df.loc[df[col] < 5, col] = np.nan
                    df.loc[df[col] > 2000, col] = np.nan
                elif col == 'Torque':
                    # Cap torque at reasonable values (30-1500Nm)
                    df.loc[df[col] < 30, col] = np.nan
                    df.loc[df[col] > 1500, col] = np.nan
                elif col == 'Acceleration':
                    # Cap acceleration at reasonable values (1-30s for 0-100km/h)
                    df.loc[df[col] < 1, col] = np.nan
                    df.loc[df[col] > 30, col] = np.nan
                elif col == 'Price':
                    # Cap price at reasonable lower/upper bounds for car market
                    df.loc[df[col] < 10000, col] = np.nan  # Likely error if below 10,000 (currency)
                    
                print(f"  After capping: Range: [{df[col].min()}, {df[col].max()}]")
    
    # =================== STEP 4: IMPUTE MISSING VALUES ===================
    print("\n=== Imputing Missing Values ===")
    
    # First, identify rare Make-Model combinations to handle them separately
    model_counts = df.groupby(['Make', 'Model']).size().reset_index(name='count')
    rare_models = model_counts[model_counts['count'] < 5]
    rare_model_pairs = set(map(tuple, rare_models[['Make', 'Model']].values))
    
    # Create a mask for rare models
    def is_rare_model(row):
        """Tell whether a row's (Make, Model) pair is in ``rare_model_pairs``.

        Rows with a missing 'Make' or 'Model' are never considered rare.
        ``rare_model_pairs`` is the set built in the enclosing scope.
        """
        make, model = row['Make'], row['Model']
        if not (pd.notna(make) and pd.notna(model)):
            return False
        return (make, model) in rare_model_pairs
    
    rare_model_mask = df.apply(is_rare_model, axis=1)
    print(f"Found {rare_model_mask.sum()} instances of rare models (fewer than 5 entries)")
    
    # Helper function to safely get mode of a series (handles empty series)
    def safe_mode(series):
        """Return the first mode of ``series``, or NaN if there is none.

        An empty or all-missing series yields NaN instead of raising.
        """
        has_data = (not series.empty) and (not series.isna().all())
        if not has_data:
            return np.nan

        modes = series.mode()
        if modes.empty:
            return np.nan
        return modes[0]
    
    # Helper function to safely get median of a series (handles empty series)
    def safe_median(series):
        """Return the median of ``series``, or NaN if it is empty or all-missing."""
        has_data = (not series.empty) and (not series.isna().all())
        return series.median() if has_data else np.nan
    
    # Columns to impute
    categorical_cols = ['Fuel type', 'Transmission', 'Drive type', 'Body type', 'Seats']
    categorical_cols = [col for col in categorical_cols if col in df.columns]
    
    numerical_cols = ['Engine size', 'Horsepower', 'Torque', 'Acceleration', 'Year']
    numerical_cols = [col for col in numerical_cols if col in df.columns]
    
    # Smarter imputation strategy:
    # 1. For non-rare models: Make-Model-Year → Make-Model → Make-Year → Make
    # 2. For rare models: Make-Year → Make
    
    # Group by Make-Model-Year for non-rare models
    for col in categorical_cols:
        print(f"Imputing {col}...")
        if col in df.columns and df[col].isna().any():
            # For non-rare models
            standard_mask = ~rare_model_mask & df[col].isna()
            if standard_mask.any():
                for make in df.loc[standard_mask, 'Make'].unique():
                    for model in df.loc[(standard_mask) & (df['Make'] == make), 'Model'].unique():
                        make_model_mask = (df['Make'] == make) & (df['Model'] == model) & standard_mask
                        if make_model_mask.any():
                            # First try Make-Model-Year
                            for year in df.loc[make_model_mask, 'Year'].unique():
                                year_mask = make_model_mask & (df['Year'] == year)
                                if year_mask.any():
                                    mode_value = safe_mode(df.loc[(df['Make'] == make) & 
                                                             (df['Model'] == model) & 
                                                             (df['Year'] == year) & 
                                                             ~df[col].isna(), col])
                                    if not pd.isna(mode_value):
                                        df.loc[year_mask, col] = mode_value
            
            # For remaining missing values in non-rare models, try Make-Model
            still_missing = ~rare_model_mask & df[col].isna()
            if still_missing.any():
                for make in df.loc[still_missing, 'Make'].unique():
                    for model in df.loc[(still_missing) & (df['Make'] == make), 'Model'].unique():
                        make_model_mask = (df['Make'] == make) & (df['Model'] == model) & still_missing
                        if make_model_mask.any():
                            mode_value = safe_mode(df.loc[(df['Make'] == make) & 
                                                     (df['Model'] == model) & 
                                                     ~df[col].isna(), col])
                            if not pd.isna(mode_value):
                                df.loc[make_model_mask, col] = mode_value
            
            # For rare models, use Make only
            rare_missing = rare_model_mask & df[col].isna()
            if rare_missing.any():
                for make in df.loc[rare_missing, 'Make'].unique():
                    make_mask = (df['Make'] == make) & rare_missing
                    if make_mask.any():
                        mode_value = safe_mode(df.loc[(df['Make'] == make) & ~df[col].isna(), col])
                        if not pd.isna(mode_value):
                            df.loc[make_mask, col] = mode_value
            
            # Final fallback for all: global mode
            still_missing = df[col].isna()
            if still_missing.any():
                mode_value = safe_mode(df.loc[~df[col].isna(), col])
                if not pd.isna(mode_value):
                    df.loc[still_missing, col] = mode_value
            
            print(f"  - Remaining missing in {col}: {df[col].isna().sum()}")
    
    # Similar approach for numerical columns but using median instead of mode
    for col in numerical_cols:
        print(f"Imputing {col}...")
        if col in df.columns and df[col].isna().any():
            # For non-rare models
            standard_mask = ~rare_model_mask & df[col].isna()
            if standard_mask.any():
                for make in df.loc[standard_mask, 'Make'].unique():
                    for model in df.loc[(standard_mask) & (df['Make'] == make), 'Model'].unique():
                        make_model_mask = (df['Make'] == make) & (df['Model'] == model) & standard_mask
                        if make_model_mask.any():
                            # First try Make-Model-Year
                            for year in df.loc[make_model_mask, 'Year'].unique():
                                if pd.notna(year):  # Skip NaN years
                                    year_mask = make_model_mask & (df['Year'] == year)
                                    if year_mask.any():
                                        median_value = safe_median(df.loc[(df['Make'] == make) & 
                                                               (df['Model'] == model) & 
                                                               (df['Year'] == year) & 
                                                               ~df[col].isna(), col])
                                        if not pd.isna(median_value):
                                            df.loc[year_mask, col] = median_value
            
            # For remaining missing values in non-rare models, try Make-Model
            still_missing = ~rare_model_mask & df[col].isna()
            if still_missing.any():
                for make in df.loc[still_missing, 'Make'].unique():
                    for model in df.loc[(still_missing) & (df['Make'] == make), 'Model'].unique():
                        make_model_mask = (df['Make'] == make) & (df['Model'] == model) & still_missing
                        if make_model_mask.any():
                            median_value = safe_median(df.loc[(df['Make'] == make) & 
                                                     (df['Model'] == model) & 
                                                     ~df[col].isna(), col])
                            if not pd.isna(median_value):
                                df.loc[make_model_mask, col] = median_value
            
            # For rare models, use Make only 
            rare_missing = rare_model_mask & df[col].isna()
            if rare_missing.any():
                for make in df.loc[rare_missing, 'Make'].unique():
                    make_mask = (df['Make'] == make) & rare_missing
                    if make_mask.any():
                        median_value = safe_median(df.loc[(df['Make'] == make) & ~df[col].isna(), col])
                        if not pd.isna(median_value):
                            df.loc[make_mask, col] = median_value
            
            # Final fallback for all: global median
            still_missing = df[col].isna()
            if still_missing.any():
                median_value = safe_median(df.loc[~df[col].isna(), col])
                if not pd.isna(median_value):
                    df.loc[still_missing, col] = median_value
                
            print(f"  - Remaining missing in {col}: {df[col].isna().sum()}")
    
    # Special handling for Condition column
    if 'Condition' in df.columns and df['Condition'].isna().any():
        print("Imputing Condition...")
        # Calculate the global median once for efficiency
        global_median = safe_median(df['Condition'].dropna())
        
        # Apply global median where missing
        df.loc[df['Condition'].isna(), 'Condition'] = global_median
        print(f"  - Remaining missing in Condition: {df['Condition'].isna().sum()}")
    
    # =================== STEP 5: ADD VALIDATION CHECKS ===================
    print("\n=== Running Validation Checks ===")
    
    # Validation: Check that diesel engines are typically >1500cc
    if 'Fuel type' in df.columns and 'Engine size' in df.columns:
        diesel_mask = df['Fuel type'] == 'diesel'
        diesel_engines = df.loc[diesel_mask & df['Engine size'].notna(), 'Engine size']
        if not diesel_engines.empty:
            min_diesel_size = diesel_engines.min()
            if min_diesel_size < 1500:
                print(f"WARNING: Found diesel engines smaller than 1500cc (minimum: {min_diesel_size})")
                # Flagging potential errors for diesel engine sizes
                suspicious_diesel = diesel_mask & (df['Engine size'] < 1500)
                print(f"  Found {suspicious_diesel.sum()} suspicious diesel engines < 1500cc")
            else:
                print(f"✅ All diesel engines are >1500cc (minimum: {min_diesel_size})")
    
    # Validation: Check that automatic transmission is properly identified
    if 'Transmission' in df.columns:
        auto_variants = df['Transmission'].value_counts().get('automatic', 0)
        print(f"✅ Found {auto_variants} cars with automatic transmission")
    
    # Validation: Check for valid price ranges
    if 'Price' in df.columns:
        price_stats = df['Price'].describe()
        print(f"Price range: {price_stats['min']:,.0f} to {price_stats['max']:,.0f}, "
              f"Mean: {price_stats['mean']:,.0f}")
        
        # Flag suspiciously low prices
        low_price = df['Price'] < price_stats['25%'] / 2
        if low_price.any():
            print(f"WARNING: Found {low_price.sum()} vehicles with suspiciously low prices")
    
    # Check if all Makes are populated
    if 'Make' in df.columns:
        missing_makes = df['Make'].isna().sum()
        if missing_makes > 0:
            print(f"WARNING: Still have {missing_makes} rows with missing Make")
        else:
            print("✅ All rows have Make populated")
    
    # Generate summary statistics for key columns after cleaning
    print("\n=== Data Summary After Cleaning ===")
    print(f"Original rows: {original_shape[0]}")
    print(f"Final rows: {len(df)}")
    
    print(f"\nMissing values remaining:")
    for col in df.columns:
        missing = df[col].isna().sum()
        if missing > 0:
            print(f"  - {col}: {missing} ({missing/len(df)*100:.1f}%)")
    
    # =================== STEP 6: SAVE THE CLEAN DATASET ===================
    print(f"\nSaving cleaned dataset to: {output_path}")
    df.to_csv(output_path, index=False)
    
    # Return the clean DataFrame for further use if needed
    return df, output_path

# Script entry point: run the full cleaning pipeline and show a quick report
if __name__ == "__main__":
    # Input CSV location -- change this to wherever your dataset lives
    file_path = r"C:\Users\Ricky\Desktop\For Fun Projects\SBT-Japan\datasets\merged_used_cars_cleaned.csv"

    # The cleaner hands back the cleaned frame together with the path it saved to
    cleaned_df, output_path = clean_used_cars_dataset(file_path)

    # Show the first few cleaned rows
    print("\n=== Sample of Cleaned Data ===")
    print(cleaned_df.head())

    # Spot-check positional rows 728-730 (previously problematic); only
    # attempted when the frame has at least 731 rows
    if len(cleaned_df) >= 731:
        print("\nCheck rows 728-730 after cleaning:")
        spot_check_rows = cleaned_df.iloc[728:731]
        print(spot_check_rows[['Make', 'Model']])

    print(f"\nCleaning complete! Dataset saved to: {output_path}")

Loading data from: C:\Users\Ricky\Desktop\For Fun Projects\SBT-Japan\datasets\merged_used_cars_cleaned.csv
Original dataset shape: (13467, 15)

=== Cleaning Make and Model ===
Found 329 rows with 'Nan' string in Make column
Extracting Make from Model for 329 rows

=== Cleaning Numerical and Categorical Columns ===

=== Checking for Outliers ===
Found 222 outliers in Engine size (1.66%)
  Range: [0.0, 60006.0], IQR bounds: [-2740.0, 7130.0]
  After capping: Range: [500.0, 9830.0]
Found 116 outliers in Mileage (0.90%)
  Range: [0.0, 1375000.0], IQR bounds: [-230000.0, 414000.0]
  After capping: Range: [0.0, 1375000.0]
Found 2139 outliers in Price (16.75%)
  Range: [2.0, 69783000.0], IQR bounds: [-1555622.5, 2084181.5]
  After capping: Range: [10002.0, 69783000.0]
Found 43 outliers in Horsepower (1.54%)
  Range: [46.0, 1841.0], IQR bounds: [-192.0, 585.0]
  After capping: Range: [46.0, 1841.0]
Found 5 outliers in Acceleration (0.18%)
  Range: [2.8, 450.0], IQR bounds: [-2.0, 19.0]
  After