In [42]:
import pandas as pd
import numpy as np
import re

In [43]:
# Load the raw cars dataset into `df`.
# The file is decoded as Latin-1 rather than pandas' default UTF-8.
df = pd.read_csv(
    "/content/cars_dataset_2025.csv",
    encoding="latin-1",
)

In [44]:
# Size of the freshly loaded frame as (rows, columns).
df.shape

(1218, 11)

In [45]:
# Preview the first rows to see the raw, unit-suffixed text values.
df.head()

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque
0,FERRARI,SF90 STRADALE,V8,3990 cc,963 hp,340 km/h,2.5 sec,"$1,100,000",plug in hyrbrid,2,800 Nm
1,ROLLS ROYCE,PHANTOM,V12,6749 cc,563 hp,250 km/h,5.3 sec,"$460,000",Petrol,5,900 Nm
2,Ford,KA+,1.2L Petrol,"1,200 cc",70-85 hp,165 km/h,10.5 sec,"$12,000-$15,000",Petrol,5,100 - 140 Nm
3,MERCEDES,GT 63 S,V8,"3,982 cc",630 hp,250 km/h,3.2 sec,"$161,000",Petrol,4,900 Nm
4,AUDI,AUDI R8 Gt,V10,"5,204 cc",602 hp,320 km/h,3.6 sec,"$253,290",Petrol,2,560 Nm


In [46]:
# Per-column dtype and non-null counts (reveals which columns have missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Company Names              1218 non-null   object
 1   Cars Names                 1218 non-null   object
 2   Engines                    1218 non-null   object
 3   CC/Battery Capacity        1215 non-null   object
 4   HorsePower                 1218 non-null   object
 5   Total Speed                1218 non-null   object
 6   Performance(0 - 100 )KM/H  1212 non-null   object
 7   Cars Prices                1218 non-null   object
 8   Fuel Types                 1218 non-null   object
 9   Seats                      1218 non-null   object
 10  Torque                     1217 non-null   object
dtypes: object(11)
memory usage: 104.8+ KB


In [47]:
# List the original column labels before any renaming.
df.columns

Index(['Company Names', 'Cars Names', 'Engines', 'CC/Battery Capacity',
       'HorsePower', 'Total Speed', 'Performance(0 - 100 )KM/H', 'Cars Prices',
       'Fuel Types', 'Seats', 'Torque'],
      dtype='object')

In [48]:
# Give the two punctuation-heavy columns short snake_case names explicitly.
# The result is rebound to `df` instead of using inplace=True, so the cell
# does not mutate the previously displayed frame object in place.
df = df.rename(
    columns={
        "CC/Battery Capacity": "battery_capacity",
        "Performance(0 - 100 )KM/H": "performance",
    }
)

In [49]:
# Normalise every column label: trim surrounding whitespace, lower-case it,
# and swap spaces for underscores so labels are snake_case.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)
# Echo the resulting labels as the cell output.
df.columns

Index(['company_names', 'cars_names', 'engines', 'battery_capacity',
       'horsepower', 'total_speed', 'performance', 'cars_prices', 'fuel_types',
       'seats', 'torque'],
      dtype='object')

In [50]:
# Remove rows that are duplicated across all columns (default: first occurrence kept).
# NOTE(review): this runs before the whitespace-stripping cell below, so rows that
# differ only by leading/trailing spaces are not treated as duplicates here.
df = df.drop_duplicates()

In [51]:
# Row/column count after de-duplication.
df.shape

(1214, 11)

In [52]:
# Trim leading/trailing whitespace from the values of every object-dtype column.
object_columns = df.select_dtypes(include="object").columns
for name in object_columns:
    df[name] = df[name].str.strip()


In [59]:
# Handle ranges
def handle_range(val):
    """Collapse a free-text numeric value or range into a single float.

    Every unsigned number found in the text is extracted and the arithmetic
    mean of those numbers is returned. A single value therefore comes back
    unchanged and a range such as "70-85" becomes its midpoint (77.5).

    Parameters
    ----------
    val : object
        Raw cell value; it is converted with ``str()`` before parsing.

    Returns
    -------
    float
        Mean of the numbers found, or NaN when ``val`` is missing or
        contains no digits.
    """
    if pd.isna(val):
        return np.nan

    # Drop commas so a thousands-separated figure like "12,000" is one number.
    cleaned = str(val).replace(",", "")

    # Hyphens are not part of the pattern, so "a-b" yields two positive numbers.
    tokens = re.findall(r"\d+\.?\d*", cleaned)

    return np.mean([float(tok) for tok in tokens]) if tokens else np.nan

In [58]:
# Clean numeric columns
#
# Each column below holds numbers as text with a unit or currency marker.
# The listed markers are removed (literal match, in order), then handle_range
# turns the remaining text into one float (ranges become their mean).
unit_markers = {
    "battery_capacity": ["cc"],
    "horsepower": ["hp"],
    "total_speed": ["km/h"],
    "performance": ["sec"],
    "cars_prices": ["USD", "usd", "$"],
    "torque": ["Nm"],
}

for column, markers in unit_markers.items():
    # astype(str) makes .str usable on every row, including missing values.
    as_text = df[column].astype(str)
    for marker in markers:
        as_text = as_text.str.replace(marker, "", regex=False)
    df[column] = as_text.apply(handle_range)

# Rebuild a contiguous 0..n-1 index, discarding the old labels.
df = df.reset_index(drop=True)

# Persist the cleaned frame without writing the index column.
df.to_csv("cars_dataset_2025_cleaned.csv", index=False)

print("cars_dataset_2025_cleaned.csv")
df.head()


cars_dataset_2025_cleaned.csv


Unnamed: 0,company_names,cars_names,engines,battery_capacity,horsepower,total_speed,performance,cars_prices,fuel_types,seats,torque
0,FERRARI,SF90 STRADALE,V8,3990.0,963.0,340.0,2.5,1100000.0,plug in hyrbrid,2,800.0
1,ROLLS ROYCE,PHANTOM,V12,6749.0,563.0,250.0,5.3,460000.0,Petrol,5,900.0
2,Ford,KA+,1.2L Petrol,1200.0,77.5,165.0,10.5,13500.0,Petrol,5,120.0
3,MERCEDES,GT 63 S,V8,3982.0,630.0,250.0,3.2,161000.0,Petrol,4,900.0
4,AUDI,AUDI R8 Gt,V10,5204.0,602.0,320.0,3.6,253290.0,Petrol,2,560.0


In [57]:
# Inspect the last rows of the cleaned frame to spot-check the parsed numeric values.
df.tail()

Unnamed: 0,company_names,cars_names,engines,battery_capacity,horsepower,total_speed,performance,cars_prices,fuel_types,seats,torque
1209,Toyota,Crown Signia,2.5L Hybrid I4,2487.0,240.0,180.0,7.6,45795.0,Hybrid (Gas + Electric),5,239.0
1210,Toyota,4Runner (6th Gen),2.4L Turbo I4 (i-FORCE MAX Hybrid),2393.0,326.0,180.0,6.8,50000.0,Hybrid,7,630.0
1211,Toyota,Corolla Cross,2.0L Gas / 2.0L Hybrid,1987.0,182.5,190.0,8.6,27172.5,Gas / Hybrid,5,200.0
1212,Toyota,C-HR+,1.8L / 2.0L Hybrid,1892.5,169.0,180.0,9.2,33000.0,Hybrid,5,197.5
1213,Toyota,RAV4 (6th Gen),2.5L Hybrid / Plug-in Hybrid,2487.0,260.5,200.0,7.05,36000.0,Hybrid / Plug-in,5,310.5
