In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [2]:
print("Loading Car Dekho Dataset...")
df = pd.read_csv('car_data.csv')

# Display first few rows
df.head()

Loading Car Dekho Dataset...


Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [3]:
# Basic info
print("Dataset Information:")
print(df.info())

# Statistical summary
print("\nStatistical Summary:")
print(df.describe())

# Check missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Price distribution
print("\nPrice Statistics:")
print(f"Min Price: â‚¹{df['Selling_Price'].min()} Lakhs")
print(f"Max Price: â‚¹{df['Selling_Price'].max()} Lakhs")
print(f"Mean Price: â‚¹{df['Selling_Price'].mean():.2f} Lakhs")
print(f"Median Price: â‚¹{df['Selling_Price'].median():.2f} Lakhs")

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB
None

Statistical Summary:
              Year  Selling_Price  Present_Price     Kms_Driven       Owner
count   301.000000     301.000000     301.000000     301.000000  301.000000
mean   2013.627907       4.661296       7.628472   36947.205980    0.043189
std       2.891554       5.082812       8.644115   38886.883882    0.2479

In [4]:
print("Cleaning data...")

# Drop rows with missing values
df = df.dropna()

# Remove Owner column as it is useless plus each row in it has 0 value
df = df.drop('Owner', axis=1)
df.head()


Cleaning data...


Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual


In [5]:
# Cell 6: Feature Engineering (FIXED - No Data Leakage!)
print("Creating new features...")

# 1. Car Age (very important!)
df['Car_Age'] = 2024 - df['Year']
print("âœ“ Created Car_Age feature")

# 2. Kilometers per year (good feature)
df['Kms_Per_Year'] = df['Kms_Driven'] / (df['Car_Age'] + 1)
print("âœ“ Created Kms_Per_Year feature")

# 3. Age category
df['Age_Category'] = pd.cut(df['Car_Age'], bins=[0, 3, 7, 15, 100], 
                             labels=['New', 'Mid', 'Old', 'Very Old'])
print("âœ“ Created Age_Category feature")

# 4. High mileage flag
df['High_Mileage'] = (df['Kms_Driven'] > df['Kms_Driven'].median()).astype(int)
print("âœ“ Created High_Mileage flag")

# 5. Present to Selling Price Ratio (this is OK - doesn't leak target)
# This shows the general market value, not the exact selling price
df['Price_Ratio'] = df['Present_Price'] / (df['Present_Price'].mean())
print("âœ“ Created Price_Ratio feature")

# Drop Year column (we have Car_Age now)
df = df.drop(['Year'], axis=1)
print("âœ“ Dropped Year column")

print(f"\nâœ“ Feature engineering complete! Shape: {df.shape}")
df.head()

Creating new features...
âœ“ Created Car_Age feature
âœ“ Created Kms_Per_Year feature
âœ“ Created Age_Category feature
âœ“ Created High_Mileage flag
âœ“ Created Price_Ratio feature
âœ“ Dropped Year column

âœ“ Feature engineering complete! Shape: (301, 12)


Unnamed: 0,Car_Name,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Car_Age,Kms_Per_Year,Age_Category,High_Mileage,Price_Ratio
0,ritz,3.35,5.59,27000,Petrol,Dealer,Manual,10,2454.545455,Old,0,0.732781
1,sx4,4.75,9.54,43000,Diesel,Dealer,Manual,11,3583.333333,Old,1,1.250578
2,ciaz,7.25,9.85,6900,Petrol,Dealer,Manual,7,862.5,Mid,0,1.291215
3,wagon r,2.85,4.15,5200,Petrol,Dealer,Manual,13,371.428571,Old,0,0.544015
4,swift,4.6,6.87,42450,Diesel,Dealer,Manual,10,3859.090909,Old,1,0.900574


In [6]:
# Cell 7: Correlation Analysis
print("Analyzing correlations with Selling_Price...")

# Select only numerical columns
numerical_cols = df.select_dtypes(include=[np.number])

# Calculate correlations
correlation = numerical_cols.corr()['Selling_Price'].sort_values(ascending=False)

print("\nCorrelation with Selling_Price:")
print(correlation)

print("\nðŸ”¥ Top Features:")
print(correlation.head(6))

Analyzing correlations with Selling_Price...

Correlation with Selling_Price:
Selling_Price    1.000000
Price_Ratio      0.878983
Present_Price    0.878983
Kms_Per_Year     0.134574
High_Mileage     0.119304
Kms_Driven       0.029187
Car_Age         -0.236141
Name: Selling_Price, dtype: float64

ðŸ”¥ Top Features:
Selling_Price    1.000000
Price_Ratio      0.878983
Present_Price    0.878983
Kms_Per_Year     0.134574
High_Mileage     0.119304
Kms_Driven       0.029187
Name: Selling_Price, dtype: float64


In [7]:
# Cell 8: Encoding Categorical Variables
print("Encoding categorical variables...")

# Drop Car_Name (too many unique values, not useful for prediction)
if 'Car_Name' in df.columns:
    df = df.drop(['Car_Name'], axis=1)
    print("âœ“ Dropped Car_Name column")

# One-Hot Encoding for categorical columns
categorical_cols = ['Fuel_Type', 'Seller_Type', 'Transmission']

df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print(f"âœ“ Encoding complete! New shape: {df_encoded.shape}")
print(f"\nNew columns: {df_encoded.columns.tolist()}")
df_encoded.head()

Encoding categorical variables...
âœ“ Dropped Car_Name column
âœ“ Encoding complete! New shape: (301, 12)

New columns: ['Selling_Price', 'Present_Price', 'Kms_Driven', 'Car_Age', 'Kms_Per_Year', 'Age_Category', 'High_Mileage', 'Price_Ratio', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol', 'Seller_Type_Individual', 'Transmission_Manual']


Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Car_Age,Kms_Per_Year,Age_Category,High_Mileage,Price_Ratio,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,3.35,5.59,27000,10,2454.545455,Old,0,0.732781,False,True,False,True
1,4.75,9.54,43000,11,3583.333333,Old,1,1.250578,True,False,False,True
2,7.25,9.85,6900,7,862.5,Mid,0,1.291215,False,True,False,True
3,2.85,4.15,5200,13,371.428571,Old,0,0.544015,False,True,False,True
4,4.6,6.87,42450,10,3859.090909,Old,1,0.900574,True,False,False,True


In [8]:
# Cell 9: Remove Outliers
print("Removing outliers...")

initial_rows = len(df_encoded)

# Remove extreme selling prices (1st and 99th percentile)
Q1 = df_encoded['Selling_Price'].quantile(0.01)
Q3 = df_encoded['Selling_Price'].quantile(0.99)
df_encoded = df_encoded[(df_encoded['Selling_Price'] >= Q1) & (df_encoded['Selling_Price'] <= Q3)]

# Remove extreme kilometers
Q1_km = df_encoded['Kms_Driven'].quantile(0.01)
Q3_km = df_encoded['Kms_Driven'].quantile(0.99)
df_encoded = df_encoded[(df_encoded['Kms_Driven'] >= Q1_km) & (df_encoded['Kms_Driven'] <= Q3_km)]

print(f"âœ“ Removed {initial_rows - len(df_encoded)} outliers")
print(f"âœ“ Final shape: {df_encoded.shape}")

Removing outliers...
âœ“ Removed 12 outliers
âœ“ Final shape: (289, 12)
