In [1]:
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Load Train & Test Datasets

In [2]:
# Load training and testing data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [3]:
# Display first few rows
train_df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


## Check Data Structure & Missing Values

In [4]:
# Check column details
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  New_Price          824 non-null    object 
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 611.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1234 entries, 0 to 1233
Data columns (total 12 columns):
 #

In [5]:
# Check missing values
print(train_df.isnull().sum())
print(test_df.isnull().sum())

Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  42
New_Price            5195
Price                   0
dtype: int64
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 0
Engine                 10
Power                  10
Seats                  11
New_Price            1052
dtype: int64


## Data Cleaning

In [6]:
# Fill missing values in 'Seats' with the most common value
train_df["Seats"].fillna(train_df["Seats"].mode()[0], inplace=True)
test_df["Seats"].fillna(test_df["Seats"].mode()[0], inplace=True)

# Convert 'Mileage' to numeric (extracting only the number)
train_df["Mileage"] = train_df["Mileage"].str.split().str[0].astype(float)
test_df["Mileage"] = test_df["Mileage"].str.split().str[0].astype(float)

# Convert 'Engine' to numeric
train_df["Engine"] = train_df["Engine"].str.replace(" CC", "").astype(float)
test_df["Engine"] = test_df["Engine"].str.replace(" CC", "").astype(float)

# Convert 'Power' to numeric (handling 'null' values)
train_df["Power"] = train_df["Power"].str.replace(" bhp", "").replace("null", None).astype(float)
test_df["Power"] = test_df["Power"].str.replace(" bhp", "").replace("null", None).astype(float)

# Drop 'New_Price' as it has too many missing values
train_df.drop(["New_Price"], axis=1, inplace=True)
test_df.drop(["New_Price"], axis=1, inplace=True)

## Encode Categorical Features (Fuel_Type, Transmission, Owner_Type)

In [7]:
# Convert categorical data to numeric using One-Hot Encoding
train_df = pd.get_dummies(train_df, columns=['Fuel_Type', 'Transmission', 'Owner_Type'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Fuel_Type', 'Transmission', 'Owner_Type'], drop_first=True)

## Save Cleaned Data

In [8]:
# Save cleaned datasets
train_df.to_csv("cleaned_train.csv", index=False)
test_df.to_csv("cleaned_test.csv", index=False)

print("Data cleaning complete. Ready for modeling!")

Data cleaning complete. Ready for modeling!


In [9]:
print(train_df.isnull().sum())
print(test_df.isnull().sum())

Name                           0
Location                       0
Year                           0
Kilometers_Driven              0
Mileage                        2
Engine                        36
Power                        143
Seats                          0
Price                          0
Fuel_Type_Diesel               0
Fuel_Type_Electric             0
Fuel_Type_LPG                  0
Fuel_Type_Petrol               0
Transmission_Manual            0
Owner_Type_Fourth & Above      0
Owner_Type_Second              0
Owner_Type_Third               0
dtype: int64
Name                          0
Location                      0
Year                          0
Kilometers_Driven             0
Mileage                       0
Engine                       10
Power                        32
Seats                         0
Fuel_Type_Diesel              0
Fuel_Type_LPG                 0
Fuel_Type_Petrol              0
Transmission_Manual           0
Owner_Type_Fourth & Above     0
Owner_Type