In [1]:
import pandas as pd
import numpy as np

In [2]:
def MMM_imputation(df, column_name, imputation_method):
    """
    Impute missing values in a specified column using the chosen method.

    Parameters:
    - df: DataFrame to be processed.
    - column_name: Name of the column to impute.
    - imputation_method: Method of imputation ('mode', 'median', or 'mean').

    Returns:
    - None: The function modifies the DataFrame in place.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

    if imputation_method == "mode":
        mode_value = df[column_name].mode()
        if mode_value.empty:
            raise ValueError(f"Mode could not be computed for column '{column_name}'.")
        df[column_name].fillna(mode_value[0], inplace=True)
    elif imputation_method == "median":
        if not pd.api.types.is_numeric_dtype(df[column_name]):
            raise ValueError(f"Median imputation is not applicable for non-numeric column '{column_name}'.")
        median_value = df[column_name].median()
        df[column_name].fillna(median_value, inplace=True)
    elif imputation_method == "mean":
        if not pd.api.types.is_numeric_dtype(df[column_name]):
            raise ValueError(f"Mean imputation is not applicable for non-numeric column '{column_name}'.")
        mean_value = df[column_name].mean()
        df[column_name].fillna(mean_value, inplace=True)
    else:
        raise ValueError("Imputation method must be 'mode', 'median', or 'mean'.")


In [3]:
df = pd.read_excel("combined_data.xlsx")
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [4]:
df_edit = df.copy()
for i in df_edit.columns:
    print(f"{i}:unique count -> {df_edit[i].nunique()}, <> unique percentage -> {(df_edit[i].nunique()/len(df_edit))*100}% <>, null value percentage -> {(df_edit[i].isnull().sum()/len(df_edit))*100}%, Data types -> {df_edit[i].dtypes}")
    print("-------------------------------------------------------------------------------------")

it:unique count -> 1, <> unique percentage -> 0.01194885888397658% <>, null value percentage -> 0.0%, Data types -> int64
-------------------------------------------------------------------------------------
ft:unique count -> 5, <> unique percentage -> 0.0597442944198829% <>, null value percentage -> 0.0%, Data types -> object
-------------------------------------------------------------------------------------
bt:unique count -> 10, <> unique percentage -> 0.1194885888397658% <>, null value percentage -> 0.04779543553590632%, Data types -> object
-------------------------------------------------------------------------------------
km:unique count -> 4090, <> unique percentage -> 48.87083283546421% <>, null value percentage -> 0.0%, Data types -> object
-------------------------------------------------------------------------------------
transmission:unique count -> 2, <> unique percentage -> 0.02389771776795316% <>, null value percentage -> 0.0%, Data types -> object
----------------

In [5]:
"""
Drop columns based on below category due to lot of columns:- 
1) Duplicate columns - Transmission,Fuel Type,kms_driven,ownership,displacement,engine,seats.1,seating_capacity,wheel_size
2) 100% single value in entire column - it,trendingText.imgUrl,trendingText.heading,trendingText.desc,
3) Unnecessary columns - car_links,
4) Domain knowledge if there is some null values - max_torque, Value Configuration,Fuel Suppy System, BoreX Stroke, Compression Ratio, Front Tread, Rear Tread, Acceleration         
5) Null value over 80% - priceActual, priceSaving, priceFixedText
6) Spec and feature reduced due to high features:- 'power_folding3rd_row_seat','removable_convertible_top','smart_key_band','real_time_vehicle_tracking','headlamp_washers','wifi_connectivity','find_my_car_location','cassette_player','sos_emergency_assistance','heads_up_display','side_stepper','heated_wing_mirror','remote_horn_light_control','geo_fence_alert','remote_engine_start_stop','internal_storage','rear_folding_table','cd_changer','xenon_headlamps','touch_screen_size','compass','mirror_link','leddrls','active_noise_cancellation','drive_modes','cigarette_lighter','ledtaillights','steering_wheel_gearshift_paddles','tailgate_ajar','isofix_child_seat_mounts','seat_lumbar_support','audio_system_remote_control','cup_holders_rear','chrome_garnish','tinted_glass','glove_box_cooling','number_of_speaker','no_of_airbags','follow_me_home_headlamps','ebd','roof_rail','alloy_wheels','luggage_hook_and_net','smart_access_card_entry','trunk_light','centrally_mounted_fuel_tank','chrome_grille','wheel_covers','fabric_upholstery','cup_holders_front','vanity_mirror'

df_edit.drop(['it','priceActual','priceSaving','priceFixedText','trendingText.imgUrl','trendingText.heading','trendingText.desc','Fuel Type','Kms Driven','Transmission','Engine','Displacement','Max Torque','Value Configuration','Fuel Suppy System','BoreX Stroke','Compression Ratio','Front Tread','Rear Tread','Seats.1','car_links','Seating Capacity','Acceleration','Wheel Size','Ownership'],axis=1,inplace=True)
"""

df_edit.drop([
    # Duplicate columns
    'Transmission', 'Fuel Type', 'Kms Driven', 'Ownership', 'Displacement', 'Engine', 'Seats.1', 'Seating Capacity', 'Wheel Size',
    
    # Columns with 100% single value
    'it', 'trendingText.imgUrl', 'trendingText.heading', 'trendingText.desc',
    
    # Unnecessary columns
    'car_links',
    
    # Domain knowledge for nulls and for customer usability
    'Max Torque', 'Value Configuration', 'Fuel Suppy System', 'BoreX Stroke', 'Compression Ratio', 'Front Tread', 'Rear Tread', 'Acceleration',
    
    # Columns with >80% null values
    'priceActual', 'priceSaving', 'priceFixedText'
], axis=1, inplace=True)

df_edit.columns = df_edit.columns.str.lower().str.replace(" ", "_")

df_edit.drop(['power_folding3rd_row_seat','removable_convertible_top','smart_key_band','real_time_vehicle_tracking','headlamp_washers','wifi_connectivity','find_my_car_location','cassette_player','sos_emergency_assistance','heads_up_display','side_stepper','heated_wing_mirror','remote_horn_light_control','geo_fence_alert','remote_engine_start_stop','internal_storage','rear_folding_table','cd_changer','xenon_headlamps','touch_screen_size','compass','mirror_link','leddrls','active_noise_cancellation','drive_modes','cigarette_lighter','ledtaillights','steering_wheel_gearshift_paddles','tailgate_ajar','isofix_child_seat_mounts','seat_lumbar_support','audio_system_remote_control','cup_holders_rear','chrome_garnish','tinted_glass','glove_box_cooling','number_of_speaker','no_of_airbags','follow_me_home_headlamps','ebd','roof_rail','alloy_wheels','luggage_hook_and_net','smart_access_card_entry','trunk_light','centrally_mounted_fuel_tank','chrome_grille','wheel_covers','fabric_upholstery','cup_holders_front','vanity_mirror','remote_climate_control','dual_tone_dashboard','leather_wrap_gear_shift_selector','dual_tone_body_colour','rear_seat_centre_arm_rest','rear_spoiler','keyless_entry','digital_clock','leather_steering_wheel'], axis=1, inplace=True)

In [6]:
# Creating grouped columns in the dataframe `df_edit`

# Stability Features 
df_edit['stability_features'] = df_edit['eletronic_stability_control'] | df_edit['vehicle_stability_control_system'] | df_edit['traction_control']

# Blind Spot Assistance 
df_edit['blind_spot_assistance'] = df_edit['lane_watch_camera'] | df_edit['blind_spot_monitor']

# Indicators
df_edit['indicators'] = df_edit['outside_rear_view_mirror_turn_indicators'] | df_edit['lane_change_indicator']

# Folding Rear View Mirror
df_edit['Adjustable_rear_view_mirror'] = df_edit['electric_folding_rear_view_mirror'] | df_edit['manually_adjustable_exterior_rear_view_mirror'] | df_edit['power_adjustable_exterior_rear_view_mirror']

# Rear Window Wiper
df_edit['rear_window_wiper'] = df_edit['rear_window_wiper'] | df_edit['rear_window_washer']

# Fog Lamps
df_edit['fog_lamps'] = df_edit['fog_lights_front'] | df_edit['fog_lights_rear'] | df_edit['ledfog_lamps'] | df_edit['cornering_foglamps'] | df_edit['smoke_headlamps']

# Automatic Lamps/Lights 
df_edit['automatic_lamps'] = df_edit['automatic_driving_lights'] | df_edit['automatic_head_lamps']

# Meters 
df_edit['meters'] = df_edit['tachometer'] | df_edit['digital_odometer'] | df_edit['electronic_multi_tripmeter']

#Boot opening
df_edit['power_boot'] = df_edit['hands_free_tailgate'] | df_edit['power_boot']

#roof on car
df_edit['car_roof'] = df_edit['moon_roof'] | df_edit['sun_roof']

#Speakers
df_edit['speakers'] = df_edit['speakers_front'] | df_edit['speakers_rear']

#antenna
df_edit['antenna'] = df_edit['integrated_antenna'] | df_edit['power_antenna']

#player
df_edit['player'] = df_edit['cd_player'] | df_edit['dvd_player']

#Smartphone integrate (android auto and apple carplay)
df_edit['smartphone_integration'] = df_edit['android_auto'] | df_edit['apple_car_play']

#Anti theft device
df_edit['anti_theft_device'] = df_edit['anti_theft_device'] | df_edit['engine_immobilizer']

#adjustable seats
df_edit['adjustable_seats'] = df_edit['adjustable_seats'] | df_edit['height_adjustable_driver_seat']

#Smartphone integrate (android auto and apple carplay)
df_edit['impact_beams'] = df_edit['front_impact_beams'] | df_edit['side_impact_beams']

In [7]:
# Dropping original columns after grouping in df_edit

# Columns to drop for each group
stability_features_cols = ['eletronic_stability_control', 'vehicle_stability_control_system', 'traction_control']
blind_spot_assistance_cols = ['lane_watch_camera', 'blind_spot_monitor']
indicators_cols = ['outside_rear_view_mirror_turn_indicators', 'lane_change_indicator']
folding_rear_view_mirror_cols = ['electric_folding_rear_view_mirror', 'manually_adjustable_exterior_rear_view_mirror','power_adjustable_exterior_rear_view_mirror']
rear_window_wiper_cols = ['rear_window_wiper', 'rear_window_washer']
fog_lamps_cols = ['fog_lights_front', 'fog_lights_rear', 'ledfog_lamps', 'cornering_foglamps','smoke_headlamps']
automatic_lamps_cols = ['automatic_driving_lights', 'automatic_head_lamps']
meters_cols = ['tachometer', 'digital_odometer', 'electronic_multi_tripmeter']
power_boot_cols = ['hands_free_tailgate', 'power_boot']
car_roof_cols = ['moon_roof', 'sun_roof']
speakers_cols = ['speakers_front', 'speakers_rear']
antenna = ['integrated_antenna', 'power_antenna']
player = ['cd_player', 'dvd_player']
smartphone_integration = ['android_auto', 'apple_car_play']
anti_theft_device = ['anti_theft_device', 'engine_immobilizer']
adjustable_seats = ['adjustable_seats', 'height_adjustable_driver_seat']
impact_beams = ['front_impact_beams', 'side_impact_beams']

# Dropping the columns
df_edit.drop(columns=(stability_features_cols + blind_spot_assistance_cols + indicators_cols +
                      folding_rear_view_mirror_cols + rear_window_wiper_cols + fog_lamps_cols +
                      automatic_lamps_cols + meters_cols+power_boot_cols+car_roof_cols+speakers_cols+antenna+player+
                     smartphone_integration+anti_theft_device+adjustable_seats+impact_beams), inplace=True)


In [8]:
df_edit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8369 entries, 0 to 8368
Columns: 132 entries, ft to impact_beams
dtypes: bool(86), float64(4), int64(3), object(39)
memory usage: 3.6+ MB


In [9]:
df_edit.head(1)

Unnamed: 0,ft,bt,km,transmission,ownerno,owner,oem,model,modelyear,centralvariantid,variantname,price,registration_year,insurance_validity,seats,rto,engine_displacement,year_of_manufacture,power_steering,power_windows_front,air_conditioner,heater,adjustable_head_lights,centeral_locking,child_safety_locks,power_windows_rear,remote_trunk_opener,remote_fuel_lid_opener,low_fuel_warning_light,accessory_power_outlet,rear_seat_headrest,glove_compartment,day_night_rear_view_mirror,passenger_side_rear_view_mirror,halogen_headlamps,rear_seat_belts,door_ajar_warning,anti_lock_braking_system,multifunction_steering_wheel,navigation_system,engine_start_stop_button,gear_shift_indicator,adjustable_steering,outside_temperature_display,rear_window_defogger,power_door_locks,driver_air_bag,passenger_air_bag,seat_belt_warning,engine_check_warning,crash_sensor,rear_camera,speed_sensing_auto_door_lock,pretensioners_and_force_limiter_seatbelts,impact_sensing_auto_door_lock,radio,integrated2din_audio,usb_auxiliary_input,bluetooth,touch_screen,driving_experience_control_eco,rear_reading_lamp,rear_acvents,air_quality_control,height_adjustable_front_seat_belts,cruise_control,voice_control,leather_seats,battery_saver,anti_theft_alarm,hill_assist,brake_assist,rain_sensing_wiper,adjustable_headrest,ledheadlights,cornering_headlamps,side_air_bag_front,side_air_bag_rear,tyre_pressure_monitor,clutch_lock,anti_pinch_power_windows,knee_airbags,wireless_phone_charging,projector_headlamps,speed_alert,rear_entertainment_system,ventilated_seats,view360camera,steering_mounted_tripmeter,hill_descent_control,roof_carrier,mileage,max_power,torque,color,engine_type,no_of_cylinder,values_per_cylinder,turbo_charger,super_charger,length,width,height,wheel_base,kerb_weight,gross_weight,gear_box,drive_type,steering_type,turning_radius,front_brake_type,rear_brake_type,top_speed,tyre_type,no_door_numbers,cargo_volumn,alloy_wheel_size,ground_clearance_unladen,city,stability_features,blind_spot_assistance,indicators,Adjustable_rear_view_mirror,fog_lamps,automatic_lamps,meters,car_roof,speakers,antenna,player,smartphone_integration,impact_beams
0,Petrol,Hatchback,120000,Manual,3,3rd Owner,Maruti,Maruti Celerio,2015,3979,VXI,₹ 4 Lakh,2015,Third Party insurance,5 Seats,KA51,998 cc,2015.0,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,23.1 kmpl,67.04bhp@6000rpm,90Nm,White,K10B Engine,3.0,4.0,No,No,3715mm,1635mm,1565mm,2425mm,835kg,1250kg,5 Speed,FWD,Power,4.7 metres,Ventilated Disc,Drum,150 Kmph,"Tubeless, Radial",5.0,235-litres,,,bangalore,False,False,False,True,False,False,True,False,False,True,False,False,True


In [10]:
df_edit['ground_clearance_unladen'].unique()

array([nan, '190mm', '205', '209 mm', '155mm', '120mm', '192mm', '210mm',
       '160', '110mm', '185mm', '178mm', '116mm', '145mm', '200mm', '170',
       '184mm', '180mm', '140mm', '170mm', '210 mm', '91mm', '180',
       '175mm', '205mm', '165', '156mm'], dtype=object)

In [11]:
# Remove spaces and 'mm' in one line, then convert to numeric
df_edit['ground_clearance_unladen'] = (
    df_edit['ground_clearance_unladen']
    .str.replace(" ", "", regex=False)
    .str.replace("mm", "", regex=False)
    .pipe(pd.to_numeric, errors='coerce')
)

# Display unique values after replacement
print(df_edit['ground_clearance_unladen'].unique())

# Dictionary with average ground clearance values for each body type
ground_clearance_values = {
    "Hatchback": 165, "SUV": 210, "Sedan": 150, "MUV": 190, "Coupe": 130,
    "Minivans": 160, "Pickup Trucks": 220, "Convertibles": 130,
    "Hybrids": 150, "Wagon": 160
}

# Function to replace NaN values with average ground clearance based on body type
def replace_nan(row, values_dict):
    return values_dict.get(row['bt'], row['ground_clearance_unladen']) if pd.isnull(row['ground_clearance_unladen']) and pd.notnull(row['bt']) else row['ground_clearance_unladen']

# Apply the function to fill NaN values based on 'bt' column
df_edit['ground_clearance_unladen'] = df_edit.apply(lambda row: replace_nan(row, ground_clearance_values), axis=1)


[ nan 190. 205. 209. 155. 120. 192. 210. 160. 110. 185. 178. 116. 145.
 200. 170. 184. 180. 140.  91. 175. 165. 156.]


In [12]:
df_edit['gross_weight'].unique()

array(['1250kg', '1660Kg', nan, '1670kg', '1530Kg', '1340kg', '2450kg',
       '2160kg', '1335', '2730kg', '2055kg', '1350kg', '1730 Kg',
       '2165 Kg', '2510 kg', '1640kg', '2240kg', '1185kg', '2510kg',
       '1655kg', '1140kg', '2995kg', '1500kg', '1260', '1760kg', '1405Kg',
       '1585kg', '1860kg', '1430kg', '1750Kg', '2060kg', '1360kg',
       '1720kg', '1580kg', '1965kg', '2124kg', '1740kg', '2430kg',
       '1985kg', '2610Kg', '2505 kg', '1787kg', '1405kg', '1813kg',
       '1455kg', '1580 kg', '1620kg', '1915kg', '2220kg', '1680kg',
       '1758kg', '1705Kg', '1845kg', '1482kg', '1436kg', '2140kg',
       '1690Kg', '2280kg', '1520', '1505kg', '2300kg', '1490kg', '2320kg',
       '1530', '1520 Kg', '1890kg', '1660 kg', '1650', '1315Kg', '1980kg',
       '1315kg', '1525kg', '1210kg', '2135kg', '1415kg', '1510kg',
       '1950kg', '1315kgs', '2830', '1757kg', '2,010 kg', '1764kg',
       '2030kg', '2225kg', '1615kg', '1770kg', '1500Kg', '2300 kg',
       '1410', '1459kg', '17

In [13]:
# Clean the 'gross_weight' column
df_edit['gross_weight'] = df_edit['gross_weight'].replace({
    "Kg": "", 
    " ": "", 
    ",": "", 
    "s": "", 
    "mm": ""
}, regex=True)

# Function to replace ranges with average
def replace_range_with_average(value):
    if pd.isna(value):
        return np.nan
    try:
        # Handle ranges by splitting and calculating the average
        if '-' in value:
            start, end = map(int, value.split('-'))
            return (start + end) / 2
        # Handle valid numeric strings
        else:
            return float(value)
    except ValueError:
        return np.nan  # Handle invalid formats gracefully

# Apply the function to the 'gross_weight' column
df_edit['gross_weight'] = df_edit['gross_weight'].apply(replace_range_with_average)

# Convert to numeric values, handling errors gracefully
df_edit['gross_weight'] = pd.to_numeric(df_edit['gross_weight'], errors='coerce')

# Print unique values to verify
print(df_edit['gross_weight'].unique())

# Check data type of the 'gross_weight' column
print(df_edit['gross_weight'].dtypes)

[  nan 1660. 1530. 1335. 1730. 2165. 1260. 1405. 1750. 2610. 1705. 1690.
 1520. 1650. 1315. 2830. 1500. 1410. 1436. 1170. 2075. 2225. 1600. 2460.
 2510. 1820. 1700. 1570. 1680. 2735. 1760. 1185. 1895. 1710. 1640. 1800.
 1670. 1795. 2490. 2400. 1438. 1340. 2449. 1740. 2450. 2755. 2150. 2005.
 2200. 2430. 1250. 2445. 1545. 2345. 2715. 3200. 1480. 2105. 3150. 1320.
 2125. 1890. 1505. 1715. 1765. 1645. 2680. 1415. 2185. 1655. 1533. 3350.
 3490. 2215. 1373. 1461. 2470. 2500. 1755.]
float64


In [14]:
df_edit['top_speed'].unique()

array(['150 Kmph', nan, '150 kmph', '172km/hr', '190 Kmph', '180 Kmph',
       '165 Kmph', '172 Kmph', '175 kmph', '219kmph', '195 Kmph',
       '226km/hr', '250 kmph', '236 Kmph', '222 Kmph', '152 kmph',
       '240 Kmph', '250kmph', '176 Kmph', '170 Kmph', '191km/hr',
       '232 Kmph', '233 Kmph', '137km/hr', '220 Kmph', '215 Kmph',
       '154.19 kmph', '166 kmph', '183 Kmph', '226 Kmph', '160 kmph',
       '156 Kmph', '185kmph', '157 Kmph', '165km/hr', '159.4 kmph',
       '250 Kmph', '135 Kmph', '185 Kmph', '216 kmph', '200 Kmph',
       '160.2 kmph', '230 Kmph', '216km/hr', '195.68 kmph', '158.83 kmph',
       '210 Kmph', '168 Kmph', '155 Kmph', '177.22 kmph', '163 Kmph',
       '230', '164 Kmph', '182km/hr', '186.08 kmph', '172 kmph',
       '171.43 kmph', '155 kmph', '178.55 kmph', '182 Kmph', '186km/hr',
       '189 kmph', '165 Km', '175 Kmph', '227km/hr', '164.26 kmph',
       '194 Kmph', '242km/hr', '158km/hr', '160 Kmph', '190 kmph',
       '157 kmph', '180 kmph', '225 Kmp

In [15]:
df_edit['mileage'].unique()

array(['23.1 kmpl', '17 kmpl', '23.84 kmpl', '19.1 kmpl', '23.65 kmpl',
       '17.1 kmpl', '20.63 kmpl', '18.15 kmpl', '20.28 kmpl', '21.4 kmpl',
       '18.9 kmpl', '18.2 kmpl', '15.1 kmpl', nan, '21.27 kmpl',
       '17.33 kmpl', '17.8 kmpl', '18.48 kmpl', '22.38 kmpl',
       '16.42 kmpl', '7.81 kmpl', '17.4 kmpl', '20.3 kmpl', '13.45 kmpl',
       '15 kmpl', '17.11 kmpl', '20.14 kmpl', '20.51 kmpl', '19.34 kmpl',
       '12.6 kmpl', '15.56 kmpl', '11.5 kmpl', '19.87 kmpl', '18.6 kmpl',
       '22.54 kmpl', '16.3 kmpl', '18.53 kmpl', '22.05 kmpl',
       '12.05 kmpl', '16.5 kmpl', '19.7 kmpl', '11.3 kmpl', '14.08 kmpl',
       '12.07 kmpl', '22 kmpl', '21.5 kmpl', '15.3 kmpl', '26.68 kmpl',
       '21.14 kmpl', '15.9 kmpl', '19.67 kmpl', '14.6 kmpl', '28.4 kmpl',
       '21.76 kmpl', '26.21 kmpl', '19 kmpl', '19.98 kmpl', '18.5 kmpl',
       '20 kmpl', '16.8 kmpl', '17.2 kmpl', '21.13 kmpl', '24 kmpl',
       '25.32 kmpl', '16.55 kmpl', '19.2 kmpl', '24.04 kmpl',
       '27.03 kmpl

In [16]:
# List of patterns to replace in 'top_speed' and 'mileage'
top_speed_patterns = ["Kmph", "km/hr", "km/Hour", "kmph", "Km/Hour", "km/h", "km"]
mileage_patterns = ["kmpl", "km/kg", "Kmpl", "Km/kg"]

# Step 1: Clean 'top_speed' and 'mileage' columns by replacing multiple patterns
for pattern in top_speed_patterns:
    df_edit['top_speed'] = df_edit['top_speed'].str.replace(pattern, "", regex=False)

for pattern in mileage_patterns:
    df_edit['mileage'] = df_edit['mileage'].str.replace(pattern, "", regex=False)

# Remove any spaces
df_edit['top_speed'] = df_edit['top_speed'].str.replace(" ", "", regex=False)
df_edit['mileage'] = df_edit['mileage'].str.replace(" ", "", regex=False)

# Convert 'top_speed' and 'mileage' to numeric values
df_edit['top_speed'] = pd.to_numeric(df_edit['top_speed'], errors='coerce')
df_edit['mileage'] = pd.to_numeric(df_edit['mileage'], errors='coerce')

# Step 2: Group by 'mileage' and calculate the mean of 'top_speed'
top_speed_value = df_edit.groupby('mileage')['top_speed'].mean().to_dict()

# Step 3: Define the function to replace NaN in 'top_speed'
def replace_nan(row, values_dict):
    if pd.isnull(row['top_speed']) and pd.notnull(row['mileage']):
        return values_dict.get(row['mileage'], row['top_speed'])
    return row['top_speed']

# Step 4: Apply the function to the DataFrame
df_edit['top_speed'] = df_edit.apply(
    lambda row: replace_nan(row, top_speed_value),
    axis=1
)


In [17]:
# Remove 'R' from the alloy_wheel_size column
df_edit['alloy_wheel_size'] = df_edit['alloy_wheel_size'].str.replace('R', "", regex=False)

# Step 1: Calculate the mode of alloy_wheel_size for each body type (bt)
# Using 'mode' and handling the case of multiple modes by choosing the first mode
alloy_wheel_size_value = df_edit.groupby('bt')['alloy_wheel_size'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None).to_dict()

# Step 2: Define the function to replace NaN values in alloy_wheel_size based on body type (bt)
def replace_nan_alloy(row, values_dict):
    # If alloy_wheel_size is NaN and bt is not NaN, replace it with the mode for that bt
    if pd.isnull(row['alloy_wheel_size']) and pd.notnull(row['bt']):
        return values_dict.get(row['bt'], row['alloy_wheel_size'])
    return row['alloy_wheel_size']

# Step 3: Apply the function to update the alloy_wheel_size column
df_edit['alloy_wheel_size'] = df_edit.apply(lambda row: replace_nan_alloy(row, alloy_wheel_size_value), axis=1)

# Check the unique values in the updated alloy_wheel_size column
print("Unique values in alloy_wheel_size after update:", df_edit['alloy_wheel_size'].unique())

# Check the proportion of null values in the alloy_wheel_size and bt columns
null_alloy_percentage = df_edit['alloy_wheel_size'].isnull().sum() / len(df_edit)
null_bt_percentage = df_edit['bt'].isnull().sum() / len(df_edit)

print(f"Proportion of null values in alloy_wheel_size: {null_alloy_percentage:.4f}")
print(f"Proportion of null values in bt: {null_bt_percentage:.4f}")

Unique values in alloy_wheel_size after update: ['14' '16' '17' '15' '19' '13' '18' '12' '20' '21' nan None]
Proportion of null values in alloy_wheel_size: 0.0006
Proportion of null values in bt: 0.0005


In [18]:
rto_value = df_edit.groupby('city')['rto'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None).to_dict()

def replace_rto(row, values_dict):
    if pd.isnull(row['rto']) and pd.notnull(row['city']):
        return values_dict.get(row['city'], row['rto'])
    return row['rto']

df_edit['rto'] = df_edit.apply(
    lambda row: replace_rto(row, rto_value),  # Use the correct function name here
    axis=1
)

In [19]:
# Create a dictionary for replacements
replacements = {
    "FWD ": "FWD",
    "Front Wheel Drive": "FWD",
    "2WD ": "2WD",
    "2 WD": "2WD",
    "4 WD": "4WD",
    "Rear Wheel Drive with ESP": "RWD",
    "RWD(with MTT)": "4WD",
    "All Wheel Drive": "AWD",
    "AWD INTEGRATED MANAGEMENT": "AWD",
    "Two Wheel Drive": "2WD",
    "Permanent all-wheel drive quattro": "AWD",
    "4x2": "2WD",
    "4X4": "4WD",
    "4X2": "2WD",  # Duplicate entry, already covered in the previous line
    "4x4": "4WD"
}

# Use the replace method with the dictionary, ensuring it applies the replacements correctly
df_edit['drive_type'] = df_edit['drive_type'].replace(replacements, regex=False)

# Check unique values after replacement
df_edit['drive_type'].unique()

array(['FWD', '2WD', 'AWD', nan, 'RWD', '4WD'], dtype=object)

In [20]:
df_edit['turning_radius'] = df_edit['turning_radius'].str.replace(r'[^0-9.\-\,]', '', regex=True)
df_edit['turning_radius'] = df_edit['turning_radius'].str.replace("6250", "6.25", regex=False)
df_edit['turning_radius'] = pd.to_numeric(df_edit['turning_radius'], errors='coerce')

# Dictionary with turning radius for each body type
turning_radius = {
    "Hatchback": 4.5,          # Hatchbacks are compact with a smaller turning radius
    "SUV": 5.8,                # SUVs are larger and require a larger turning radius
    "Sedan": 5.5,              # Sedans have a moderate turning radius
    "MUV": 5.8,                # Multi-Utility Vehicles (MUVs) are similar to SUVs
    "Coupe": 5.0,              # Coupes are generally compact with a moderate turning radius
    "Minivans": 6.0,           # Minivans are larger with a bigger turning radius
    "Pickup Trucks": 6.4,      # Pickup trucks tend to have a wide turning radius
    "Convertibles": 5.2,       # Convertibles are similar to sedans or coupes in size
    "Hybrids": 5.4,            # Hybrids vary but tend to have a moderate turning radius
    "Wagon": 5.6               # Wagons are mid-sized with a moderate turning radius
}

def replace_nan(row, values_dict):
    """
    Replace NaN turning radius with the corresponding value from the dictionary 
    based on the body type, if body type is not NaN.
    """
    # Check if turning_radius is NaN and body type (bt) is available
    if pd.isnull(row['turning_radius']) and pd.notnull(row.get('bt')):
        # Return the turning radius for the body type, if exists in the dictionary
        return values_dict.get(row['bt'], row['turning_radius'])
    return row['turning_radius']

# Apply the function to the DataFrame to replace NaN values in 'turning_radius'
df_edit['turning_radius'] = df_edit.apply(
    lambda row: replace_nan(row, turning_radius),
    axis=1
)


In [21]:
df_edit['turbo_charger'] = df_edit['turbo_charger'].str.lower()
df_edit['turbo_charger'] = df_edit['turbo_charger'].str.replace("turbo", "yes", regex=False)
df_edit['turbo_charger'] = df_edit['turbo_charger'].str.replace("twin", "yes", regex=False)

In [22]:
df_edit['turbo_charger'].unique()

array(['no', 'yes', nan], dtype=object)

In [23]:
df_edit['super_charger'] = df_edit['super_charger'].str.lower()
df_edit['super_charger'].unique()

array(['no', nan, 'yes'], dtype=object)

In [25]:
print(df_edit['variantname'].nunique())
print("-------------------------------------------------------------------------------------")
print(df_edit['centralvariantid'].nunique(),df_edit['centralvariantid'].dtypes)
print("-------------------------------------------------------------------------------------")
print(df_edit['bt'].nunique())

2157
-------------------------------------------------------------------------------------
2713 int64
-------------------------------------------------------------------------------------
10


In [26]:
df_edit.head(1)

Unnamed: 0,ft,bt,km,transmission,ownerno,owner,oem,model,modelyear,centralvariantid,variantname,price,registration_year,insurance_validity,seats,rto,engine_displacement,year_of_manufacture,power_steering,power_windows_front,air_conditioner,heater,adjustable_head_lights,centeral_locking,child_safety_locks,power_windows_rear,remote_trunk_opener,remote_fuel_lid_opener,low_fuel_warning_light,accessory_power_outlet,rear_seat_headrest,glove_compartment,day_night_rear_view_mirror,passenger_side_rear_view_mirror,halogen_headlamps,rear_seat_belts,door_ajar_warning,anti_lock_braking_system,multifunction_steering_wheel,navigation_system,engine_start_stop_button,gear_shift_indicator,adjustable_steering,outside_temperature_display,rear_window_defogger,power_door_locks,driver_air_bag,passenger_air_bag,seat_belt_warning,engine_check_warning,crash_sensor,rear_camera,speed_sensing_auto_door_lock,pretensioners_and_force_limiter_seatbelts,impact_sensing_auto_door_lock,radio,integrated2din_audio,usb_auxiliary_input,bluetooth,touch_screen,driving_experience_control_eco,rear_reading_lamp,rear_acvents,air_quality_control,height_adjustable_front_seat_belts,cruise_control,voice_control,leather_seats,battery_saver,anti_theft_alarm,hill_assist,brake_assist,rain_sensing_wiper,adjustable_headrest,ledheadlights,cornering_headlamps,side_air_bag_front,side_air_bag_rear,tyre_pressure_monitor,clutch_lock,anti_pinch_power_windows,knee_airbags,wireless_phone_charging,projector_headlamps,speed_alert,rear_entertainment_system,ventilated_seats,view360camera,steering_mounted_tripmeter,hill_descent_control,roof_carrier,mileage,max_power,torque,color,engine_type,no_of_cylinder,values_per_cylinder,turbo_charger,super_charger,length,width,height,wheel_base,kerb_weight,gross_weight,gear_box,drive_type,steering_type,turning_radius,front_brake_type,rear_brake_type,top_speed,tyre_type,no_door_numbers,cargo_volumn,alloy_wheel_size,ground_clearance_unladen,city,stability_features,blind_spot_assistance,indicators,Adjustable_rear_view_mirror,fog_lamps,automatic_lamps,meters,car_roof,speakers,antenna,player,smartphone_integration,impact_beams
0,Petrol,Hatchback,120000,Manual,3,3rd Owner,Maruti,Maruti Celerio,2015,3979,VXI,₹ 4 Lakh,2015,Third Party insurance,5 Seats,KA51,998 cc,2015.0,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,23.1,67.04bhp@6000rpm,90Nm,White,K10B Engine,3.0,4.0,no,no,3715mm,1635mm,1565mm,2425mm,835kg,,5 Speed,FWD,Power,4.7,Ventilated Disc,Drum,150.0,"Tubeless, Radial",5.0,235-litres,14,165.0,bangalore,False,False,False,True,False,False,True,False,False,True,False,False,True


In [27]:
# Cleaning 'kerb_weight' column
df_edit['kerb_weight'] = df_edit['kerb_weight'].str.replace(r"kg|s|mm|,| |Kg", "", regex=True).str.strip()
# Apply the function to the 'kerb Weight' column
df_edit['kerb_weight'] = df_edit['kerb_weight'].apply(replace_range_with_average)

In [28]:
df_edit['cargo_volumn'] = df_edit['cargo_volumn'].str.extract(r'(\d+\.?\d*)', expand=False)
df_edit['cargo_volumn'] = pd.to_numeric(df_edit['cargo_volumn'], errors='coerce')

In [29]:
# Calculate the percentage of null values for each column
null_percentage = (df_edit.isnull().sum() / len(df_edit)) * 100

# Identify columns with null percentage less than 5%
columns_to_drop = null_percentage[null_percentage < 5].index

# Drop rows with null values in those columns
df_edit = df_edit.dropna(subset=columns_to_drop)

In [30]:
for i in df_edit.columns:
    print(f"{i}:null value percentage -> {(df_edit[i].isnull().sum()/len(df_edit))*100}%, Data types -> {df_edit[i].dtypes}")

ft:null value percentage -> 0.0%, Data types -> object
bt:null value percentage -> 0.0%, Data types -> object
km:null value percentage -> 0.0%, Data types -> object
transmission:null value percentage -> 0.0%, Data types -> object
ownerno:null value percentage -> 0.0%, Data types -> int64
owner:null value percentage -> 0.0%, Data types -> object
oem:null value percentage -> 0.0%, Data types -> object
model:null value percentage -> 0.0%, Data types -> object
modelyear:null value percentage -> 0.0%, Data types -> int64
centralvariantid:null value percentage -> 0.0%, Data types -> int64
variantname:null value percentage -> 0.0%, Data types -> object
price:null value percentage -> 0.0%, Data types -> object
registration_year:null value percentage -> 0.0%, Data types -> object
insurance_validity:null value percentage -> 0.0%, Data types -> object
seats:null value percentage -> 0.0%, Data types -> object
rto:null value percentage -> 0.0%, Data types -> object
engine_displacement:null value pe

In [31]:
# List of additional columns to fill NaN values based on mode by 'model'
additional_columns_to_fill = ['turbo_charger', 'super_charger','drive_type', 'cargo_volumn']

# Step 1: Calculate the mode of each column for each model value
mode_values_additional = {
    col: df_edit.groupby('model')[col].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None).to_dict()
    for col in additional_columns_to_fill if col in df_edit.columns  # Ensure the column exists in the dataframe
}

# Step 2: Define the replace_nan function for each column
def replace_nan(row, col, values_dict):
    if pd.isnull(row[col]) and pd.notnull(row['model']):
        return values_dict.get(row['model'], row[col])
    return row[col]

# Step 3: Apply the function to update each column
for col in mode_values_additional:
    df_edit[col] = df_edit.apply(
        lambda row: replace_nan(row, col, mode_values_additional[col]),
        axis=1
    )

# Step 1: Calculate the percentage where gross_weight is not null
df_non_null = df_edit[df_edit['gross_weight'].notna()]
df_non_null['percentage_weight_calculation'] = ((df_non_null['gross_weight'] - df_non_null['kerb_weight']) / df_non_null['kerb_weight']) * 100

# Step 2: Compute the mean percentage
mean_percentage = df_non_null['percentage_weight_calculation'].mean()

# Step 3: Fill the null values in gross_weight using the mean percentage
df_edit.loc[df_edit['gross_weight'].isna(), 'gross_weight'] = df_edit['kerb_weight'] * (1 + mean_percentage / 100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_non_null['percentage_weight_calculation'] = ((df_non_null['gross_weight'] - df_non_null['kerb_weight']) / df_non_null['kerb_weight']) * 100


In [32]:
# List of columns to fill NaN values with mode
columns_to_fill_final = ['turbo_charger', 'super_charger', 'top_speed','cargo_volumn']

# Fill null values with mode for each column, ensuring the column exists
for column in columns_to_fill_final:
    if column in df_edit.columns:  # Check if the column exists
        mode_value = df_edit[column].mode()[0]  # Get the mode value
        df_edit[column].fillna(mode_value, inplace=True)  # Fill NaNs with the mode value

In [33]:
# Calculate the percentage of null values for each column
null_percentage = (df_edit.isnull().sum() / len(df_edit)) * 100

# Identify columns with null percentage less than 5%
columns_to_drop = null_percentage[null_percentage < 5].index

# Drop rows with null values in those columns
df_edit = df_edit.dropna(subset=columns_to_drop)

In [34]:
null_percentage = (df_edit.isnull().sum() / len(df_edit)) * 100
columns_with_null_BT10to30 = null_percentage[(null_percentage > 0) & (null_percentage <= 100)]
print(columns_with_null_BT10to30)
print(columns_with_null_BT10to30.shape[0])

Series([], dtype: float64)
0


In [35]:
df_edit.to_excel('All_imputed_data.xlsx', index=False)