In [1]:
import pandas as pd
import numpy as np

In [2]:
def MMM_imputation(df, column_name, imputation_method):
    """
    Impute missing values in a specified column using the chosen method.

    Parameters:
    - df: DataFrame to be processed.
    - column_name: Name of the column to impute.
    - imputation_method: Method of imputation ('mode', 'median', or 'mean').

    Returns:
    - None: The function modifies the DataFrame in place.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

    if imputation_method == "mode":
        mode_value = df[column_name].mode()
        if mode_value.empty:
            raise ValueError(f"Mode could not be computed for column '{column_name}'.")
        df[column_name].fillna(mode_value[0], inplace=True)
    elif imputation_method == "median":
        if not pd.api.types.is_numeric_dtype(df[column_name]):
            raise ValueError(f"Median imputation is not applicable for non-numeric column '{column_name}'.")
        median_value = df[column_name].median()
        df[column_name].fillna(median_value, inplace=True)
    elif imputation_method == "mean":
        if not pd.api.types.is_numeric_dtype(df[column_name]):
            raise ValueError(f"Mean imputation is not applicable for non-numeric column '{column_name}'.")
        mean_value = df[column_name].mean()
        df[column_name].fillna(mean_value, inplace=True)
    else:
        raise ValueError("Imputation method must be 'mode', 'median', or 'mean'.")


In [3]:
df = pd.read_excel("combined_data.xlsx")

In [4]:
df_edit = df.copy()
df_edit.drop('Transmission',axis=1,inplace=True)#drop the Duplicate column 
df_edit.columns = df_edit.columns.str.lower().str.replace(" ", "_")
pd.set_option('display.max_columns',None)
df_edit.head(1)

Unnamed: 0,it,ft,bt,km,transmission,ownerno,owner,oem,model,modelyear,centralvariantid,variantname,price,priceactual,pricesaving,pricefixedtext,trendingtext.imgurl,trendingtext.heading,trendingtext.desc,registration_year,insurance_validity,fuel_type,seats,kms_driven,rto,ownership,engine_displacement,year_of_manufacture,power_steering,power_windows_front,air_conditioner,heater,adjustable_head_lights,manually_adjustable_exterior_rear_view_mirror,centeral_locking,child_safety_locks,power_windows_rear,remote_trunk_opener,remote_fuel_lid_opener,low_fuel_warning_light,accessory_power_outlet,vanity_mirror,rear_seat_headrest,cup_holders_front,digital_odometer,electronic_multi_tripmeter,fabric_upholstery,glove_compartment,digital_clock,wheel_covers,power_antenna,chrome_grille,day_night_rear_view_mirror,passenger_side_rear_view_mirror,halogen_headlamps,rear_seat_belts,door_ajar_warning,side_impact_beams,front_impact_beams,adjustable_seats,centrally_mounted_fuel_tank,engine_immobilizer,anti_theft_device,fog_lights_front,anti_lock_braking_system,cd_player,trunk_light,multifunction_steering_wheel,navigation_system,smart_access_card_entry,engine_start_stop_button,gear_shift_indicator,luggage_hook_and_net,adjustable_steering,tachometer,leather_steering_wheel,outside_temperature_display,height_adjustable_driver_seat,power_adjustable_exterior_rear_view_mirror,electric_folding_rear_view_mirror,rear_window_wiper,rear_window_washer,rear_window_defogger,alloy_wheels,integrated_antenna,outside_rear_view_mirror_turn_indicators,roof_rail,power_door_locks,driver_air_bag,passenger_air_bag,seat_belt_warning,keyless_entry,engine_check_warning,crash_sensor,ebd,follow_me_home_headlamps,rear_camera,speed_sensing_auto_door_lock,pretensioners_and_force_limiter_seatbelts,impact_sensing_auto_door_lock,no_of_airbags,radio,speakers_front,speakers_rear,integrated2din_audio,usb_auxiliary_input,bluetooth,touch_screen,number_of_speaker,glove_box_cooling,driving_experience_control_eco,tinted_glass,rear_spoiler,chrome_garnish,vehicle_stability_control_system,rear_reading_lamp,rear_seat_centre_arm_rest,cup_holders_rear,rear_acvents,air_quality_control,height_adjustable_front_seat_belts,cruise_control,voice_control,audio_system_remote_control,leather_seats,fog_lights_rear,traction_control,seat_lumbar_support,battery_saver,lane_change_indicator,sun_roof,automatic_driving_lights,anti_theft_alarm,automatic_head_lamps,isofix_child_seat_mounts,hill_assist,tailgate_ajar,brake_assist,steering_wheel_gearshift_paddles,ledtaillights,cigarette_lighter,rain_sensing_wiper,drive_modes,active_noise_cancellation,adjustable_headrest,hands_free_tailgate,dual_tone_dashboard,leather_wrap_gear_shift_selector,dual_tone_body_colour,leddrls,ledheadlights,cornering_headlamps,cornering_foglamps,side_air_bag_front,side_air_bag_rear,tyre_pressure_monitor,clutch_lock,anti_pinch_power_windows,knee_airbags,apple_car_play,android_auto,mirror_link,wireless_phone_charging,compass,moon_roof,projector_headlamps,speed_alert,eletronic_stability_control,touch_screen_size,xenon_headlamps,cd_changer,power_boot,rear_folding_table,smoke_headlamps,dvd_player,internal_storage,rear_entertainment_system,remote_engine_start_stop,ventilated_seats,ledfog_lamps,view360camera,geo_fence_alert,steering_mounted_tripmeter,remote_climate_control,remote_horn_light_control,heated_wing_mirror,side_stepper,blind_spot_monitor,hill_descent_control,heads_up_display,sos_emergency_assistance,cassette_player,find_my_car_location,wifi_connectivity,headlamp_washers,real_time_vehicle_tracking,roof_carrier,smart_key_band,lane_watch_camera,removable_convertible_top,power_folding3rd_row_seat,mileage,engine,max_power,torque,seats.1,color,engine_type,displacement,max_torque,no_of_cylinder,values_per_cylinder,value_configuration,fuel_suppy_system,borex_stroke,compression_ratio,turbo_charger,super_charger,length,width,height,wheel_base,front_tread,rear_tread,kerb_weight,gross_weight,gear_box,drive_type,seating_capacity,steering_type,turning_radius,front_brake_type,rear_brake_type,top_speed,acceleration,tyre_type,no_door_numbers,cargo_volumn,wheel_size,alloy_wheel_size,ground_clearance_unladen,car_links,city
0,0,Petrol,Hatchback,120000,Manual,3,3rd Owner,Maruti,Maruti Celerio,2015,3979,VXI,₹ 4 Lakh,,,,https://stimg.cardekho.com/used-cars/common/ic...,Trending Car!,High chances of sale in next 6 days,2015,Third Party insurance,Petrol,5 Seats,"1,20,000 Kms",KA51,Third Owner,998 cc,2015.0,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,23.1 kmpl,998 CC,67.04bhp@6000rpm,90Nm,5.0,White,K10B Engine,998.0,90Nm@3500rpm,3.0,4.0,DOHC,MPFi,73 X 82 mm,11.0:1,No,No,3715mm,1635mm,1565mm,2425mm,1420mm,1410mm,835kg,1250kg,5 Speed,FWD,5.0,Power,4.7 metres,Ventilated Disc,Drum,150 Kmph,15.05 Seconds,"Tubeless, Radial",5.0,235-litres,,,,https://www.cardekho.com/used-car-details/used...,bangalore


In [5]:
pd.set_option('display.max_rows',None)
df_edit.dtypes

it                                                 int64
ft                                                object
bt                                                object
km                                                object
transmission                                      object
ownerno                                            int64
owner                                             object
oem                                               object
model                                             object
modelyear                                          int64
centralvariantid                                   int64
variantname                                       object
price                                             object
priceactual                                       object
pricesaving                                      float64
pricefixedtext                                   float64
trendingtext.imgurl                               object
trendingtext.heading           

In [6]:
(df_edit.isnull().sum()/len(df_edit))*100

it                                                 0.000000
ft                                                 0.000000
bt                                                 0.047795
km                                                 0.000000
transmission                                       0.000000
ownerno                                            0.000000
owner                                              0.000000
oem                                                0.000000
model                                              0.000000
modelyear                                          0.000000
centralvariantid                                   0.000000
variantname                                        0.000000
price                                              0.000000
priceactual                                       80.045406
pricesaving                                      100.000000
pricefixedtext                                   100.000000
trendingtext.imgurl                     

In [7]:
null_percentage = (df_edit.isnull().sum() / len(df_edit)) * 100

# Filter columns with a null percentage greater than 0 (i.e., columns that have missing values)
columns_with_null_E100 = null_percentage[null_percentage == 100]
print(columns_with_null_E100)
columns_with_null_E100.shape[0]

pricesaving       100.0
pricefixedtext    100.0
dtype: float64


2

In [8]:
#drop "priceFixedText", "priceSaving", "priceActual" columns due to priceActual is 80% 
df_edit.drop(['pricefixedtext','pricesaving','priceactual'], axis=1, inplace=True)

In [9]:
null_percentage = (df_edit.isnull().sum() / len(df_edit)) * 100
columns_with_null_GT80 = null_percentage[null_percentage > 80]
print(columns_with_null_GT80)
print(columns_with_null_GT80.shape[0])

ground_clearance_unladen    93.846338
dtype: float64
1


In [10]:
null_percentage = (df_edit.isnull().sum() / len(df_edit)) * 100
columns_with_null_BT50to80 = null_percentage[(null_percentage > 50) & (null_percentage <= 80)]
print(columns_with_null_BT50to80)
print(columns_with_null_BT50to80.shape[0])

borex_stroke         71.262994
compression_ratio    77.356912
gross_weight         54.259768
dtype: float64
3


In [11]:
for col in columns_with_null_BT50to80.index:
    unique_values = df_edit[col].unique()
    print(f"Unique values in column {col}: {unique_values}")

Unique values in column borex_stroke: ['73 X 82 mm' '79 X 76.5 mm' '77 X 85.8 mm' nan '69.6 x 82 mm'
 '74 x 85 mm' '73.0 X 89.4 mm' '84.0 x 90.0 mm' '69 x 72 mm' '74 X 85 mm'
 '81.0mm X 87.2mm' '82 x 94.6 mm' '68.5 x 72.0 mm' '83 x 92 mm' '76x82.5'
 '71.5 x 81.5 mm' '69.6 X 82mm' '82.5 x 84.2 mm' '82 x 93.2 mm'
 '76.5 x 86.9 mm' '84 x 90 mm' '71 x 75.6 mm' '69.6 X 82 mm' '74.5 x 76.4'
 '77 X 80.5 mm' '69.6 X 82' '76.0 X 82.5 mm' '73.5 X 88.3 mm' '74 x 85'
 '73 x 82mm' '75.0 x 55.0 mm' '81.5 x 95.8 mm' '74.5x76.4 mm'
 '82.5 x 92.8' '82x 78.8 mm' '73 x 72 mm' '75.5 X 83.5 mm' '73 X 71.5 mm'
 '75 x 90 mm' '77.0 X 85.44 mm' '73 X 82mm' '79.5 x 95.5 mm'
 '82.0 x 78.8 mm' '71 X 75.6 mm' '74.5 X 76.4mm' '76.5 X 75.6 mm'
 '77.2mm X 84.5mm' '77.0mm X 85.44mm' '83 X 92.35 mm' '72.2 x 81.3'
 '78.3 x 86.4 mm' '76.5 X 86.9 mm' '85 x 96 mm' '92.9 x 86.0 mm'
 '83.0 x 92.0 mm' '77x85.8' '77.2 X 84.5 mm' '82.5 x 92.8 mm'
 '79.5 x 80.5 mm' '3.39 in X 3.39 mm' '77.0 x 80.5 mm' '74.0 x 75.5 mm'
 '71.5 x 8

In [12]:
null_percentage = (df_edit.isnull().sum() / len(df_edit)) * 100
columns_with_null_BT30to50 = null_percentage[(null_percentage > 30) & (null_percentage <= 50)]
print(columns_with_null_BT30to50)
print(columns_with_null_BT30to50.shape[0])

front_tread         42.514040
rear_tread          42.657426
top_speed           44.951607
acceleration        41.964392
wheel_size          35.643446
alloy_wheel_size    35.643446
dtype: float64
6


In [13]:
null_percentage = (df_edit.isnull().sum() / len(df_edit)) * 100
columns_with_null_BT10to30 = null_percentage[(null_percentage > 10) & (null_percentage <= 30)]
print(columns_with_null_BT10to30)
print(columns_with_null_BT10to30.shape[0])

rto                    10.682280
value_configuration    26.478671
fuel_suppy_system      20.635679
turbo_charger          12.426813
super_charger          23.443661
drive_type             18.640220
turning_radius         19.118174
cargo_volumn           19.034532
dtype: float64
8


In [14]:
null_percentage = (df_edit.isnull().sum() / len(df_edit)) * 100
columns_with_null_B10 = null_percentage[(null_percentage > 0) & (null_percentage <= 10)]
print(columns_with_null_B10)
print(columns_with_null_B10.shape[0])

bt                     0.047795
registration_year      0.621341
insurance_validity     0.047795
seats                  0.071693
kms_driven             0.023898
ownership              0.382363
engine_displacement    0.047795
year_of_manufacture    0.238977
mileage                3.429322
engine                 0.047795
max_power              0.716932
torque                 0.716932
seats.1                0.071693
color                  0.035847
engine_type            3.524913
displacement           0.047795
max_torque             0.716932
no_of_cylinder         0.370415
values_per_cylinder    0.609392
length                 0.943960
width                  0.991755
height                 0.943960
wheel_base             1.947664
kerb_weight            0.704983
gear_box               1.266579
seating_capacity       0.071693
steering_type          3.046959
front_brake_type       1.147090
rear_brake_type        1.147090
tyre_type              0.920062
no_door_numbers        0.131437
dtype: f

In [15]:
df_edit['bt'].unique()

array(['Hatchback', 'SUV', 'Sedan', 'MUV', 'Coupe', 'Minivans',
       'Pickup Trucks', 'Convertibles', 'Hybrids', nan, 'Wagon'],
      dtype=object)

In [16]:
df_edit['variantname'].unique()

array(['VXI', '1.5 Petrol Titanium BSIV', '1.2 Revotron XZ', ...,
       'XZA Plus P Dark Edition AMT', 'X-Line DCT', 'C 200 CGI Elegance'],
      dtype=object)

In [17]:
df_edit['centralvariantid'].unique()

array([3979, 6087, 2983, ..., 8568, 9709, 4672], dtype=int64)

In [18]:
columns_with_null = null_percentage[null_percentage > 0]
columns_with_null.shape[0]

49

In [19]:
columns_with_null

bt                           0.047795
registration_year            0.621341
insurance_validity           0.047795
seats                        0.071693
kms_driven                   0.023898
rto                         10.682280
ownership                    0.382363
engine_displacement          0.047795
year_of_manufacture          0.238977
mileage                      3.429322
engine                       0.047795
max_power                    0.716932
torque                       0.716932
seats.1                      0.071693
color                        0.035847
engine_type                  3.524913
displacement                 0.047795
max_torque                   0.716932
no_of_cylinder               0.370415
values_per_cylinder          0.609392
value_configuration         26.478671
fuel_suppy_system           20.635679
borex_stroke                71.262994
compression_ratio           77.356912
turbo_charger               12.426813
super_charger               23.443661
length      

In [20]:
df_edit['ground_clearance_unladen'].replace(" ","",inplace = True)
df_edit['ground_clearance_unladen'].replace("mm","",inplace = True)
# Removing spaces
df_edit['ground_clearance_unladen'] = df_edit['ground_clearance_unladen'].str.replace(" ", "", regex=False)

# Removing 'mm'
df_edit['ground_clearance_unladen'] = df_edit['ground_clearance_unladen'].str.replace("mm", "", regex=False)

# Display unique values after replacement
unique_values = df_edit['ground_clearance_unladen'].unique()
print(unique_values)

[nan '190' '205' '209' '155' '120' '192' '210' '160' '110' '185' '178'
 '116' '145' '200' '170' '184' '180' '140' '91' '175' '165' '156']


In [21]:
df_edit['ground_clearance_unladen'] = pd.to_numeric(df_edit['ground_clearance_unladen'], errors='coerce')

In [22]:
df_edit['ground_clearance_unladen'].dtypes

dtype('float64')

In [23]:
len(df_edit['ground_clearance_unladen'].unique())

23

In [24]:
df_edit['ground_clearance_unladen'].isnull().sum()

7854

In [25]:
# Dictionary with average ground clearance values for each body type
ground_clearance_values = {
    "Hatchback": 165,
    "SUV": 210,
    "Sedan": 150,
    "MUV": 190,
    "Coupe": 130,
    "Minivans": 160,
    "Pickup Trucks": 220,
    "Convertibles": 130,
    "Hybrids": 150,
    "Wagon": 160
}

def replace_nan(row, values_dict):
    # Check if 'Ground Clearance Unladen' is NaN and 'bt' is not NaN
    if pd.isnull(row['ground_clearance_unladen']) and pd.notnull(row['bt']):
        # Return the value from the dictionary based on 'bt', or NaN if 'bt' is not in the dictionary
        return values_dict.get(row['bt'], row['ground_clearance_unladen'])
    # Return the existing value if 'Ground Clearance Unladen' is not NaN
    return row['ground_clearance_unladen']


# Apply the function to the DataFrame
df_edit['ground_clearance_unladen'] = df_edit.apply(
    lambda row: replace_nan(row, ground_clearance_values),
    axis=1
)

In [26]:
df_edit['ground_clearance_unladen'].isnull().sum()

4

In [27]:
len(df_edit['ground_clearance_unladen'].unique())

26

In [28]:
len(df_edit['gross_weight'].unique())

331

In [29]:
df_edit['gross_weight'].unique()

array(['1250kg', '1660Kg', nan, '1670kg', '1530Kg', '1340kg', '2450kg',
       '2160kg', '1335', '2730kg', '2055kg', '1350kg', '1730 Kg',
       '2165 Kg', '2510 kg', '1640kg', '2240kg', '1185kg', '2510kg',
       '1655kg', '1140kg', '2995kg', '1500kg', '1260', '1760kg', '1405Kg',
       '1585kg', '1860kg', '1430kg', '1750Kg', '2060kg', '1360kg',
       '1720kg', '1580kg', '1965kg', '2124kg', '1740kg', '2430kg',
       '1985kg', '2610Kg', '2505 kg', '1787kg', '1405kg', '1813kg',
       '1455kg', '1580 kg', '1620kg', '1915kg', '2220kg', '1680kg',
       '1758kg', '1705Kg', '1845kg', '1482kg', '1436kg', '2140kg',
       '1690Kg', '2280kg', '1520', '1505kg', '2300kg', '1490kg', '2320kg',
       '1530', '1520 Kg', '1890kg', '1660 kg', '1650', '1315Kg', '1980kg',
       '1315kg', '1525kg', '1210kg', '2135kg', '1415kg', '1510kg',
       '1950kg', '1315kgs', '2830', '1757kg', '2,010 kg', '1764kg',
       '2030kg', '2225kg', '1615kg', '1770kg', '1500Kg', '2300 kg',
       '1410', '1459kg', '17

In [30]:
df_edit['gross_weight'] = df_edit['gross_weight'].str.replace("Kg", "", regex=False)
df_edit['gross_weight'] = df_edit['gross_weight'].str.replace(" ", "", regex=False)
df_edit['gross_weight'] = df_edit['gross_weight'].str.replace(",", "", regex=False)
df_edit['gross_weight'] = df_edit['gross_weight'].str.replace("s", "", regex=False)
df_edit['gross_weight'] = df_edit['gross_weight'].str.replace("mm", "", regex=False)

In [31]:
def replace_range_with_average(value):
    if pd.isna(value):
        return np.nan
    try:
        if '-' in value:
            start, end = map(int, value.split('-'))
            return (start + end) / 2
        else:
            return float(value)  # Convert valid numeric strings to float
    except ValueError:
        return np.nan  # Handle invalid formats gracefully

# Apply the function to the 'Gross Weight' column
df_edit['gross_weight'] = df_edit['gross_weight'].apply(replace_range_with_average)

print(df_edit['gross_weight'].unique())

[  nan 1660. 1530. 1335. 1730. 2165. 1260. 1405. 1750. 2610. 1705. 1690.
 1520. 1650. 1315. 2830. 1500. 1410. 1436. 1170. 2075. 2225. 1600. 2460.
 2510. 1820. 1700. 1570. 1680. 2735. 1760. 1185. 1895. 1710. 1640. 1800.
 1670. 1795. 2490. 2400. 1438. 1340. 2449. 1740. 2450. 2755. 2150. 2005.
 2200. 2430. 1250. 2445. 1545. 2345. 2715. 3200. 1480. 2105. 3150. 1320.
 2125. 1890. 1505. 1715. 1765. 1645. 2680. 1415. 2185. 1655. 1533. 3350.
 3490. 2215. 1373. 1461. 2470. 2500. 1755.]


In [32]:
df_edit['gross_weight'] = pd.to_numeric(df_edit['gross_weight'], errors='coerce')

In [33]:
df_edit['gross_weight'].dtypes

dtype('float64')

In [34]:
df_edit['kerb_weight'].unique()

array(['835kg', '1242Kg', '1012kg', '1180', '1230Kg', '1551 kgs', '1070',
       '1440', '1105', '870kg', '1200', '1066 Kg', '1900', '1562 kgs',
       '1675', '1120', '1315', '1049kg', '1600kg', '875-905', '2200kg',
       '1050', '845', '1585', '1595kg', '1940', '1650', '1140', '885kg',
       '1170 Kg', '1655kg', '1955 kg', '1375', '1066kg', '1515kg',
       '880kg', '1329kg', '1735kg', '762kg', '1990', '1250kg', '1535',
       '1980', '705kg', '2535kg', '1825kg', '1060', '2345kg', '1970',
       '750', '1305Kg', '1095kg', '815', '1206kg', '1225Kg', '1320',
       '1700', '960Kg', '850kg', '1720', '1105kg', '1360kg', '1376kg',
       '980kg', '1015kg', '1260', '1380', '1192kg', '1025kg', '1040',
       '1135', '1865', '890kg', '860kg', '730', '1652kg', '1835', '1345',
       '1820', '925kg', '910-935', '760kg', '1086', '910', '1179Kg',
       '1525', '1730', '900', '1300', '1465', '2020', '1350kg', '840kg',
       '1445kg', '1580', '1608kg', '960', '1100', '1830kg', '755', '1020',
 

In [35]:
df_edit['kerb_weight'] = df_edit['kerb_weight'].str.replace("kg", "", regex=False)
df_edit['kerb_weight'] = df_edit['kerb_weight'].str.replace(" ", "", regex=False)
df_edit['kerb_weight'] = df_edit['kerb_weight'].str.replace(",", "", regex=False)
df_edit['kerb_weight'] = df_edit['kerb_weight'].str.replace("s", "", regex=False)
df_edit['kerb_weight'] = df_edit['kerb_weight'].str.replace("mm", "", regex=False)

In [36]:
# Apply the function to the 'Gross Weight' column
df_edit['kerb_weight'] = df_edit['kerb_weight'].apply(replace_range_with_average)

In [37]:
mean_ratio = (df_edit[df_edit['gross_weight'].notnull()]['gross_weight'] / 
              df_edit[df_edit['gross_weight'].notnull()]['kerb_weight']).mean()

In [38]:
def replace_nan(row, mean_ratio):
    if pd.isnull(row['gross_weight']) and pd.notnull(row['kerb_weight']):
        return row['kerb_weight'] * (mean_ratio)
    return row['gross_weight']

# Apply the function to each row in the DataFrame
df_edit['gross_weight'] = df_edit.apply(
    lambda row: replace_nan(row, mean_ratio),
    axis=1
)

In [39]:
df_edit['gross_weight'].isnull().sum()/len(df_edit)

0.03955072290596248

In [40]:
# Drop the 'BoreX Stroke', 'Compression Ratio' columns, as these are measurable attributes that cannot be assumed without compromising the quality and features of the product
df_edit.drop(['borex_stroke','compression_ratio'], axis=1, inplace=True) 

In [41]:
df_edit['front_tread'].unique()

array(['1420mm', nan, '1400mm', '1479mm', '1440mm', '1515mm', '1617mm',
       '1505mm', '1480mm', '1558mm', '1530', '1560', '1624mm', '1463 mm',
       '1315mm', '1595mm', '1564mm', '1295mm', '1510 mm', '1588mm',
       '1605mm', '1540 mm', '1569mm', '1627mm', '1501mm', '1655mm',
       '1651mm', '1450mm', '1430', '1530mm', '1541 mm', '1495mm',
       '1490mm', '1539mm', '1470mm', '1463mm', '1536', '1386mm', '1457mm',
       '1546mm', '1460mm', '1560mm', '1,505 mm', '1475mm', '1492mm',
       '1527', '1644mm', '1621 mm', '1565mm', '1510mm', '1615mm',
       '1519mm', '1577mm', '1585mm', '1485mm', '1666 mm', '1559mm',
       '1555mm', '1531', '1520mm', '1435mm', '1552mm', '1520', '1,539 mm',
       '1620 mm', '1600mm', '1621mm', '1540mm', '1601mm', '1,400 mm',
       '1490', '1,522 mm', '1316', '1470 mm', '1547', '1544 mm', '1620mm',
       '1571mm', '1544mm', '1572mm', '1496mm', '1641mm', '1553mm',
       '1521 mm', '1648mm', '1549mm', '1205mm', '1535mm', '1576mm',
       '1561mm', '1

In [42]:
df_edit['front_tread'] = df_edit['front_tread'].str.replace("mm", "", regex=False)
df_edit['front_tread'] = df_edit['front_tread'].str.replace(",", "", regex=False)
df_edit['front_tread'] = df_edit['front_tread'].str.replace("Mm", "", regex=False)
df_edit['front_tread'] = df_edit['front_tread'].str.replace(" ", "", regex=False)
df_edit['front_tread'] = pd.to_numeric(df_edit['front_tread'], errors='coerce')

In [43]:
df_edit['rear_tread'].unique()

array(['1410mm', nan, '1420mm', '1493mm', '1445mm', '1525mm', '1596mm',
       '1503mm', '1465mm', '1581mm', '1530', '1567', '1637mm', '1481 mm',
       '1300mm', '1597mm', '1551mm', '1290mm', '1520 mm', '1588 mm',
       '1630mm', '1540 mm', '1515mm', '1582mm', '1618mm', '1501mm',
       '1675mm', '1681mm', '1530 mm', '1440mm', '1440', '1530mm',
       '1581 mm', '1505mm', '1478mm', '1534mm', '1480mm', '1463mm',
       '1490mm', '1385mm', '1585mm', '1535', '1368mm', '1500mm', '1541mm',
       '1454mm', '1567mm', '1614mm', '1,503 mm', '1470mm', '1484mm',
       '1525', '1650mm', '1617 mm', '1456mm', '1565mm', '1510mm',
       '1620mm', '1485mm', '1524mm', '1570mm', '1604mm', '1535mm',
       '1495mm', '1686 mm', '1605mm', '1526mm', '1516', '1520mm',
       '1425mm', '1435mm', '1549mm', '1520', '1,528 mm', '1666 mm',
       '1600mm', '1540mm', '1510 mm', '1626mm', '1,385 mm', '1480',
       '1,522 mm', '1318', '1480 mm', '1545', '1583 mm', '1636mm',
       '1575mm', '1583mm', '1555mm', 

In [44]:
df_edit['rear_tread'] = df_edit['rear_tread'].str.replace("mm", "", regex=False)
df_edit['rear_tread'] = df_edit['rear_tread'].str.replace(",", "", regex=False)
df_edit['rear_tread'] = df_edit['rear_tread'].str.replace("Mm", "", regex=False)
df_edit['rear_tread'] = df_edit['rear_tread'].str.replace(" ", "", regex=False)
df_edit['rear_tread']= pd.to_numeric(df_edit['rear_tread'], errors='coerce')

In [45]:
MMM_imputation(df_edit, 'front_tread', 'mean')
MMM_imputation(df_edit, 'rear_tread', 'mean')

In [46]:
df_edit['wheel_size'].unique()

array([nan, '16', '14', '17', '15', '19', '13', '18', 'R18', '12', 'R16',
       'R17', '20', 'R15', 'R19', 'R14', '21', 'R20'], dtype=object)

In [47]:
# Step 1: Calculate the mode of wheel_size for each body type (bt)
wheel_size_value = df_edit.groupby('bt')['wheel_size'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None).to_dict()

# Step 2: Define the replace_nan function
def replace_nan(row, values_dict):
    if pd.isnull(row['wheel_size']) and pd.notnull(row['bt']):
        return values_dict.get(row['bt'], row['wheel_size'])
    return row['wheel_size']

# Step 3: Apply the function to update the wheel_size column
df_edit['wheel_size'] = df_edit.apply(
    lambda row: replace_nan(row, wheel_size_value),
    axis=1
)

# Check the unique values in the updated wheel_size column
print(df_edit['wheel_size'].unique())

# Check the proportion of null values in the wheel_size and bt columns
print(df_edit['wheel_size'].isnull().sum() / len(df_edit))
print(df_edit['bt'].isnull().sum() / len(df_edit))

['14' '16' '17' '15' '19' '13' '18' 'R18' '12' 'R16' 'R17' '20' 'R15'
 'R19' 'R14' '21' 'R20' nan None]
0.000597442944198829
0.0004779543553590632


In [48]:
df_edit['wheel_size']=df_edit['wheel_size'].str.replace('R',"",regex=False)

In [49]:
df_edit['alloy_wheel_size']=df_edit['alloy_wheel_size'].str.replace('R',"",regex=False)
df_edit['alloy_wheel_size'].unique()

array([nan, '16', '14', '17', '15', '19', '13', '18', '12', '20', '21'],
      dtype=object)

In [50]:
# Step 1: Calculate the mode of alloy_wheel_size for each body type (bt)
alloy_wheel_size_value = df_edit.groupby('bt')['alloy_wheel_size'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None).to_dict()

# Step 2: Define the replace_nan function for alloy_wheel_size
def replace_nan_alloy(row, values_dict):
    if pd.isnull(row['alloy_wheel_size']) and pd.notnull(row['bt']):
        return values_dict.get(row['bt'], row['alloy_wheel_size'])
    return row['alloy_wheel_size']

# Step 3: Apply the function to update the alloy_wheel_size column
df_edit['alloy_wheel_size'] = df_edit.apply(
    lambda row: replace_nan_alloy(row, alloy_wheel_size_value),
    axis=1
)

# Check the unique values in the updated alloy_wheel_size column
print(df_edit['alloy_wheel_size'].unique())

# Check the proportion of null values in the alloy_wheel_size and bt columns
print(df_edit['alloy_wheel_size'].isnull().sum() / len(df_edit))
print(df_edit['bt'].isnull().sum() / len(df_edit))

['14' '16' '17' '15' '19' '13' '18' '12' '20' '21' nan None]
0.000597442944198829
0.0004779543553590632


In [51]:
df_edit['top_speed'].unique()

array(['150 Kmph', nan, '150 kmph', '172km/hr', '190 Kmph', '180 Kmph',
       '165 Kmph', '172 Kmph', '175 kmph', '219kmph', '195 Kmph',
       '226km/hr', '250 kmph', '236 Kmph', '222 Kmph', '152 kmph',
       '240 Kmph', '250kmph', '176 Kmph', '170 Kmph', '191km/hr',
       '232 Kmph', '233 Kmph', '137km/hr', '220 Kmph', '215 Kmph',
       '154.19 kmph', '166 kmph', '183 Kmph', '226 Kmph', '160 kmph',
       '156 Kmph', '185kmph', '157 Kmph', '165km/hr', '159.4 kmph',
       '250 Kmph', '135 Kmph', '185 Kmph', '216 kmph', '200 Kmph',
       '160.2 kmph', '230 Kmph', '216km/hr', '195.68 kmph', '158.83 kmph',
       '210 Kmph', '168 Kmph', '155 Kmph', '177.22 kmph', '163 Kmph',
       '230', '164 Kmph', '182km/hr', '186.08 kmph', '172 kmph',
       '171.43 kmph', '155 kmph', '178.55 kmph', '182 Kmph', '186km/hr',
       '189 kmph', '165 Km', '175 Kmph', '227km/hr', '164.26 kmph',
       '194 Kmph', '242km/hr', '158km/hr', '160 Kmph', '190 kmph',
       '157 kmph', '180 kmph', '225 Kmp

In [52]:
df_edit['top_speed'] = df_edit['top_speed'].str.replace("Kmph", "", regex=False)
df_edit['top_speed'] = df_edit['top_speed'].str.replace("km/hr", "", regex=False)
df_edit['top_speed'] = df_edit['top_speed'].str.replace(" ", "", regex=False)
df_edit['top_speed'] = df_edit['top_speed'].str.replace("km/Hour", "", regex=False)
df_edit['top_speed'] = df_edit['top_speed'].str.replace("kmph", "", regex=False)
df_edit['top_speed'] = df_edit['top_speed'].str.replace("kmph", "", regex=False)
df_edit['top_speed'] = df_edit['top_speed'].str.replace("Km/Hour", "", regex=False)
df_edit['top_speed'] = df_edit['top_speed'].str.replace("km/h", "", regex=False)
df_edit['top_speed'] = df_edit['top_speed'].str.replace("km", "", regex=False)
df_edit['top_speed'] = pd.to_numeric(df_edit['top_speed'], errors='coerce')

In [53]:
df_edit['mileage'].unique()

array(['23.1 kmpl', '17 kmpl', '23.84 kmpl', '19.1 kmpl', '23.65 kmpl',
       '17.1 kmpl', '20.63 kmpl', '18.15 kmpl', '20.28 kmpl', '21.4 kmpl',
       '18.9 kmpl', '18.2 kmpl', '15.1 kmpl', nan, '21.27 kmpl',
       '17.33 kmpl', '17.8 kmpl', '18.48 kmpl', '22.38 kmpl',
       '16.42 kmpl', '7.81 kmpl', '17.4 kmpl', '20.3 kmpl', '13.45 kmpl',
       '15 kmpl', '17.11 kmpl', '20.14 kmpl', '20.51 kmpl', '19.34 kmpl',
       '12.6 kmpl', '15.56 kmpl', '11.5 kmpl', '19.87 kmpl', '18.6 kmpl',
       '22.54 kmpl', '16.3 kmpl', '18.53 kmpl', '22.05 kmpl',
       '12.05 kmpl', '16.5 kmpl', '19.7 kmpl', '11.3 kmpl', '14.08 kmpl',
       '12.07 kmpl', '22 kmpl', '21.5 kmpl', '15.3 kmpl', '26.68 kmpl',
       '21.14 kmpl', '15.9 kmpl', '19.67 kmpl', '14.6 kmpl', '28.4 kmpl',
       '21.76 kmpl', '26.21 kmpl', '19 kmpl', '19.98 kmpl', '18.5 kmpl',
       '20 kmpl', '16.8 kmpl', '17.2 kmpl', '21.13 kmpl', '24 kmpl',
       '25.32 kmpl', '16.55 kmpl', '19.2 kmpl', '24.04 kmpl',
       '27.03 kmpl

In [54]:
df_edit['mileage'] = df_edit['mileage'].str.replace("kmpl", "", regex=False)
df_edit['mileage'] = df_edit['mileage'].str.replace(" ", "", regex=False)
df_edit['mileage'] = df_edit['mileage'].str.replace("km/kg", "", regex=False)
df_edit['mileage'] = df_edit['mileage'].str.replace("Kmpl", "", regex=False)
df_edit['mileage'] = df_edit['mileage'].str.replace("Km/kg", "", regex=False)
df_edit['mileage'] = df_edit['mileage'].str.replace("Kmpl", "", regex=False)
df_edit['mileage'] = pd.to_numeric(df_edit['mileage'], errors='coerce')

In [55]:
# Step 1: Group by 'mileage' and calculate the mean of 'top_speed'
top_speed_value = df_edit.groupby('mileage')['top_speed'].mean().to_dict()

# Step 2: Define the function to replace NaN in 'top_speed'
def replace_nan(row, values_dict):
    if pd.isnull(row['top_speed']) and pd.notnull(row['mileage']):
        return values_dict.get(row['mileage'], row['top_speed'])
    return row['top_speed']

# Step 3: Apply the function to the DataFrame
df_edit['top_speed'] = df_edit.apply(
    lambda row: replace_nan(row, top_speed_value),
    axis=1
)

In [56]:
#drop "acceleration" column,assumption based on acceleration may be change due to body type, weight, areodynamics  
df_edit.drop('acceleration', axis=1, inplace=True)

In [57]:
df_edit['value_configuration'].unique()

array(['DOHC', nan, 'SOHC', 'DOHC ', 'undefined', 'DOHC with VIS',
       '16 Modules 48 Cells', 'iDSI', '23 Modules 69 Cells',
       'DOHC with VGT', '16-valve DOHC layout', 'DOHC with TIS', 'VTEC',
       'SOHC '], dtype=object)

In [58]:
df_edit['value_configuration'] = df_edit['value_configuration'].str.replace("DOHC with VIS", "DOHC", regex=False)
df_edit['value_configuration'] = df_edit['value_configuration'].str.replace("DOHC with TIS", "DOHC", regex=False)
df_edit['value_configuration'] = df_edit['value_configuration'].str.replace("DOHC ", "DOHC", regex=False)
df_edit['value_configuration'] = df_edit['value_configuration'].str.replace("16-valve DOHC layout", "DOHC", regex=False)
df_edit['value_configuration'] = df_edit['value_configuration'].str.replace("SOHC ", "SOHC", regex=False)
df_edit['value_configuration'] = df_edit['value_configuration'].str.replace("SOHC ", "SOHC", regex=False)
df_edit['value_configuration'] = df_edit['value_configuration'].str.replace("16-valve DOHClayout", "DOHC", regex=False)
df_edit['value_configuration'] = df_edit['value_configuration'].str.replace("DOHCwith VGT", "DOHC", regex=False)
df_edit['value_configuration'] = df_edit['value_configuration'].replace("undefined", None)

In [59]:
df_edit['turbo_charger'].unique()

array(['No', 'Yes', 'NO', nan, 'Twin', 'YES', 'no', 'twin', 'yes',
       'Turbo'], dtype=object)

In [60]:
df_edit['turbo_charger'] = df_edit['turbo_charger'].str.lower()
df_edit['turbo_charger'] = df_edit['turbo_charger'].str.replace("turbo", "yes", regex=False)
df_edit['turbo_charger'] = df_edit['turbo_charger'].str.replace("twin", "yes", regex=False)

In [61]:
df_edit['super_charger'] = df_edit['super_charger'].str.lower()
df_edit['super_charger'].unique()

array(['no', nan, 'yes'], dtype=object)

In [62]:
df_edit['drive_type'].unique()

array(['FWD', 'FWD ', '4X2', 'AWD', nan, 'RWD', '4x2', '2WD', '4WD',
       '2 WD', 'Front Wheel Drive', 'Rear Wheel Drive with ESP', '4X4',
       'Two Wheel Drive', '2WD ', 'All Wheel Drive', '4x4',
       'AWD INTEGRATED MANAGEMENT', 'RWD(with MTT)',
       'Permanent all-wheel drive quattro', '4 WD'], dtype=object)

In [63]:
# Create a dictionary for replacements
replacements = {
    "FWD ": "FWD",
    "Front Wheel Drive": "FWD",
    "2WD ": "2WD",
    "2 WD": "2WD",
    "4 WD": "4WD",
    "Rear Wheel Drive with ESP": "RWD",
    "RWD(with MTT)": "4WD",
    "All Wheel Drive": "AWD",
    "AWD INTEGRATED MANAGEMENT": "AWD",
    "Two Wheel Drive": "2WD",
    "Permanent all-wheel drive quattro": "AWD",
    "4x2": "2WD",
    "4X4": "4WD",
    "4X2": "2WD",
    "4x4": "4WD"
}

# Use the replace method with the dictionary
df_edit['drive_type'] = df_edit['drive_type'].replace(replacements, regex=False)

In [64]:
df_edit['drive_type'].unique()

array(['FWD', '2WD', 'AWD', nan, 'RWD', '4WD'], dtype=object)

In [65]:
df_edit['turning_radius'].unique()

array(['4.7 metres', '5.3 metres', '4.9 meters', '5.2 meters', nan,
       '4.6 metres', '5.4 metres', '4.8 metres', '5.1 meters',
       '5.6 metres', '5.1', '4.8', '5.2m', '6.15 metres ', '4.4 meters',
       '6 metres', '5.8 metres', '5.2 metres', '5.61 metres', '5.6',
       '5.9 m', '5.95 metres', '4.6', '4.6m', '6.2 metres', '6.0 metres',
       '4.9 metres', '5.1m', '5 m', '4.8 Meters', '5.5 metres',
       '5.55 metres', '5.3metres', '4.97 metres', '5.3 meters',
       '5.05 metres', '5.65 metres', '5.3 m', '4.5 metres', '4.9',
       '4.6metres', '5.55', '4.5', '5.20 m', '4.7 m', '6.4 meters', '5.8',
       '5.9metres', '5.2meters', '5.5 m', '5.7 metres', '5.4', '4.9 m',
       '5.75 metres', '5.35 metres', '5.05', '5.0metres', '5.4 m', '5.5m',
       '4.7m', '5.2', '5.05 meters', '4.85', '5.3', '5.55 m',
       '5.25 metres', '5.0m', '5.9 metres', '5.0', '4.8metres',
       '4.7 Metre', '5.9 meters', '4.35mm', '4.2', '4.9metres', '4.1 m',
       '5.39', '5.4 meters', '5.4m', 

In [66]:
df_edit['turning_radius'] = df_edit['turning_radius'].str.replace(r'[^0-9.\-\,]', '', regex=True)
df_edit['turning_radius'] = df_edit['turning_radius'].str.replace("6250", "6.25", regex=False)
df_edit['turning_radius'] = pd.to_numeric(df_edit['turning_radius'], errors='coerce')

In [67]:
df_edit['cargo_volumn'].unique()

array(['235-litres', '352-litres', '242-litres', '407-litres',
       '353-litres', '408-litres', '265-litres', '350', '510-litres',
       '339-litres', '256', '354-litres', nan, '520 litres', '268', '475',
       '530-litres', '235', '540-litres', '480-litres', '407',
       '180-liters', '209 litres', '520-litres', '295-litres',
       '420-litres', '211litres', '242', '433', '285', '680-litres', '73',
       '330-litres', '279', '313', '460-litres', '400-litres', '425',
       '268-litres', '520', '207-litres', '590', '236-liters', '311',
       '506-litres', '460', '256-liters', '243-litres', '225-litre',
       '300', '380-litres', '259 Litres', '155 L', '175-litres', '405',
       '215-litres', '494', '476', '587', '400', '720-litres', '445',
       '460 litres', '565-litres', '475-litres', '214', '295 Lit',
       '257-litres', '260-litres', '84', '650-litres', '460 liters',
       '513', '480', '550 Litre', '204-liters', '510', '280',
       '280-liters', '450-litres', '392 Li

In [68]:
df_edit['cargo_volumn'] = df_edit['cargo_volumn'].str.extract(r'(\d+\.?\d*)', expand=False)

In [69]:
df_edit['cargo_volumn'] = pd.to_numeric(df_edit['cargo_volumn'], errors='coerce')

In [70]:
null_percentage = (df_edit.isnull().sum() / len(df_edit)) * 100
columns_with_null_B10 = null_percentage[(null_percentage > 0) & (null_percentage <= 10)]
print(columns_with_null_B10)
print(columns_with_null_B10.shape[0])

bt                          0.047795
registration_year           0.621341
insurance_validity          0.047795
seats                       0.071693
kms_driven                  0.023898
ownership                   0.382363
engine_displacement         0.047795
year_of_manufacture         0.238977
mileage                     3.429322
engine                      0.047795
max_power                   0.716932
torque                      0.716932
seats.1                     0.071693
color                       0.035847
engine_type                 3.524913
displacement                0.047795
max_torque                  0.716932
no_of_cylinder              0.370415
values_per_cylinder         0.609392
length                      0.943960
width                       0.991755
height                      0.943960
wheel_base                  1.947664
kerb_weight                 7.288804
gross_weight                3.955072
gear_box                    1.266579
seating_capacity            0.071693
s

In [71]:
# List of columns to fill NaN values based on mode by 'model'
columns_to_fill = [
    'bt', 'registration_year', 'seats', 'kms_driven', 'engine_displacement',
    'year_of_manufacture', 'mileage', 'engine', 'max_power', 'torque',
    'color', 'engine_type', 'displacement', 'max_torque', 'no_of_cylinder',
    'values_per_cylinder', 'length', 'width', 'height', 'wheel_base',
    'kerb_weight', 'gear_box', 'seating_capacity', 'steering_type',
    'front_brake_type', 'rear_brake_type', 'tyre_type', 'no_door_numbers'
]

# Step 1: Calculate the mode of each column for each model value
mode_values = {
    col: df_edit.groupby('model')[col].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None).to_dict()
    for col in columns_to_fill
}

# Step 2: Define the replace_nan function for each column
def replace_nan(row, col, values_dict):
    if pd.isnull(row[col]) and pd.notnull(row['model']):
        return values_dict.get(row['model'], row[col])
    return row[col]

# Step 3: Apply the function to update each column
for col in columns_to_fill:
    df_edit[col] = df_edit.apply(
        lambda row: replace_nan(row, col, mode_values[col]),
        axis=1
    )


In [72]:
# List of columns to fill NaN values based on mode by 'model'
columns_to_fill = [
    'value_configuration', 'fuel_suppy_system', 'turbo_charger', 'super_charger',
    'drive_type', 'turning_radius', 'cargo_volumn'
]

# Step 1: Calculate the mode of each column for each model value
mode_values = {
    col: df_edit.groupby('model')[col].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None).to_dict()
    for col in columns_to_fill
}

# Step 2: Define the replace_nan function for each column
def replace_nan(row, col, values_dict):
    if pd.isnull(row[col]) and pd.notnull(row['model']):
        return values_dict.get(row['model'], row[col])
    return row[col]

# Step 3: Apply the function to update each column
for col in columns_to_fill:
    df_edit[col] = df_edit.apply(
        lambda row: replace_nan(row, col, mode_values[col]),
        axis=1
    )

In [73]:
rto_value = df_edit.groupby('city')['rto'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None).to_dict()

def replace_rto(row, values_dict):
    if pd.isnull(row['rto']) and pd.notnull(row['city']):
        return values_dict.get(row['city'], row['rto'])
    return row['rto']

df_edit['rto'] = df_edit.apply(
    lambda row: replace_rto(row, rto_value),  # Use the correct function name here
    axis=1
)

In [74]:
# Step 1: Calculate the percentage where gross_weight is not null
df_non_null = df_edit[df_edit['gross_weight'].notna()]
df_non_null['percentage_weight_calculation'] = ((df_non_null['gross_weight'] - df_non_null['kerb_weight']) / df_non_null['kerb_weight']) * 100

# Step 2: Compute the mean percentage
mean_percentage = df_non_null['percentage_weight_calculation'].mean()

# Step 3: Fill the null values in gross_weight using the mean percentage
df_edit.loc[df_edit['gross_weight'].isna(), 'gross_weight'] = df_edit['kerb_weight'] * (1 + mean_percentage / 100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_non_null['percentage_weight_calculation'] = ((df_non_null['gross_weight'] - df_non_null['kerb_weight']) / df_non_null['kerb_weight']) * 100


In [75]:
columns_to_fill = [
    'bt', 'registration_year', 'insurance_validity', 'seats',
    'ownership', 'year_of_manufacture', 'mileage', 'max_power',
    'torque', 'seats.1', 'engine_type', 'max_torque', 'no_of_cylinder',
    'values_per_cylinder', 'value_configuration', 'fuel_suppy_system',
    'turbo_charger', 'super_charger', 'length', 'width', 'height',
    'wheel_base', 'kerb_weight', 'gross_weight', 'gear_box', 'drive_type',
    'seating_capacity', 'steering_type', 'turning_radius', 'front_brake_type',
    'rear_brake_type', 'top_speed', 'tyre_type', 'no_door_numbers',
    'cargo_volumn', 'wheel_size', 'alloy_wheel_size', 'ground_clearance_unladen'
]

# Fill null values with mode for each column
for column in columns_to_fill:
    mode_value = df_edit[column].mode()[0]  # Get the mode value
    df_edit[column].fillna(mode_value, inplace=True)  # Fill NaNs with the mode value

In [76]:
#drop duplicate columns
df_edit.drop(['owner','fuel_type','kms_driven','ownership','engine_displacement','year_of_manufacture','engine','seats.1','seating_capacity','alloy_wheel_size'], axis=1, inplace=True)

In [79]:
df_edit.dtypes

it                                                 int64
ft                                                object
bt                                                object
km                                                object
transmission                                      object
ownerno                                            int64
oem                                               object
model                                             object
modelyear                                          int64
centralvariantid                                   int64
variantname                                       object
price                                             object
trendingtext.imgurl                               object
trendingtext.heading                              object
trendingtext.desc                                 object
registration_year                                 object
insurance_validity                                object
seats                          

In [77]:
df_edit.isnull().sum()/len(df_edit)

it                                               0.0
ft                                               0.0
bt                                               0.0
km                                               0.0
transmission                                     0.0
ownerno                                          0.0
oem                                              0.0
model                                            0.0
modelyear                                        0.0
centralvariantid                                 0.0
variantname                                      0.0
price                                            0.0
trendingtext.imgurl                              0.0
trendingtext.heading                             0.0
trendingtext.desc                                0.0
registration_year                                0.0
insurance_validity                               0.0
seats                                            0.0
rto                                           

In [81]:
df_edit.to_excel('All_imputed_data.xlsx', index=False)