# **Data Exploration**

In [1]:
import pandas as pd

# Load the dataset (path is resolved relative to the current working directory)
df = pd.read_csv('CAR DETAILS FROM CAR DEKHO.csv')

# Display basic information: column names, dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare -- wrapping it in print() would emit an extra "None" line.
df.info()

# Check for missing values (count of nulls per column)
print(df.isnull().sum())

# Basic statistics of numerical features
print(df.describe())

# Check the first few rows of the data
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB
None
name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64
              year  selling_price      km_driven
count  4340.000000   4.340000e+03    4340.000000
mean   2013.090783   5.041273e+05   66215.777419
std       4.215344   5.785487e+05   46644.102194
min    1992.000000   2.000000e+04       1.0000

# **Feature Engineering**

In [2]:
# Reshape the raw columns in a single chain:
#   * 'name' is discarded, as it is unlikely to help in prediction
#   * 'year' is replaced by 'age', computed as 2024 minus the year
df = (
    df.drop(columns=['name'])
      .assign(age=lambda frame: 2024 - frame['year'])
      .drop(columns=['year'])
)

# Turn each categorical column into dummy/indicator columns; drop_first=True
# leaves out the first level of every category.
categorical_cols = ['fuel', 'seller_type', 'transmission', 'owner']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Preview the transformed dataset
print(df.head())


   selling_price  km_driven  age  fuel_Diesel  fuel_Electric  fuel_LPG  \
0          60000      70000   17        False          False     False   
1         135000      50000   17        False          False     False   
2         600000     100000   12         True          False     False   
3         250000      46000    7        False          False     False   
4         450000     141000   10         True          False     False   

   fuel_Petrol  seller_type_Individual  seller_type_Trustmark Dealer  \
0         True                    True                         False   
1         True                    True                         False   
2        False                    True                         False   
3         True                    True                         False   
4        False                    True                         False   

   transmission_Manual  owner_Fourth & Above Owner  owner_Second Owner  \
0                 True                       Fal

# **Model Selection and Training**

In [3]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# One seed shared by the train/test split and the stochastic estimators, so a
# fresh "Restart Kernel -> Run All" reproduces the same models and metrics.
RANDOM_STATE = 42

# Split dataset into features and target variable
X = df.drop('selling_price', axis=1)
y = df['selling_price']

# Split data into training and testing sets (20% of the rows held out)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Initialize models.
# The random forest and gradient boosting estimators are randomized, so they
# are seeded explicitly; without random_state their scores change run to run.
rf_model = RandomForestRegressor(random_state=RANDOM_STATE)
gb_model = GradientBoostingRegressor(random_state=RANDOM_STATE)
lr_model = LinearRegression()

# Train the models on the training split only
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
lr_model.fit(X_train, y_train)


# **Model Evaluation**

In [4]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Define a function to evaluate models
def evaluate_model(model, X_test, y_test):
    """Print the MAE and RMSE of ``model`` on the given test data.

    Parameters
    ----------
    model
        Object exposing ``predict``; its class name is used as the report label.
    X_test
        Features passed to ``model.predict``.
    y_test
        True target values the predictions are compared against.

    Returns
    -------
    None
        The metrics are only printed, followed by blank lines as a separator.
    """
    y_pred = model.predict(X_test)
    abs_err = mean_absolute_error(y_test, y_pred)
    # RMSE is the square root of the mean squared error.
    squared_err = mean_squared_error(y_test, y_pred)
    root_err = np.sqrt(squared_err)

    report_lines = (
        f"Model: {model.__class__.__name__}",
        f"Mean Absolute Error: {abs_err}",
        f"Root Mean Squared Error: {root_err}",
        "\n",
    )
    for line in report_lines:
        print(line)

# Report test-set metrics for every fitted model:
# random forest first, then gradient boosting, then linear regression.
for fitted_model in (rf_model, gb_model, lr_model):
    evaluate_model(fitted_model, X_test, y_test)


Model: RandomForestRegressor
Mean Absolute Error: 170564.55712671299
Root Mean Squared Error: 391742.1217109978


Model: GradientBoostingRegressor
Mean Absolute Error: 175676.80444416497
Root Mean Squared Error: 399595.27277472924


Model: LinearRegression
Mean Absolute Error: 221706.36930930085
Root Mean Squared Error: 426786.6899872593




# **Cross-Validation for Robustness**

In [5]:
from sklearn.model_selection import cross_val_score

# Settings shared by both runs: 5-fold cross-validation on the full X / y,
# scored with negated mean absolute error.
cv_options = {'cv': 5, 'scoring': 'neg_mean_absolute_error'}

# RandomForest: the scorer returns negated MAE, so flip the sign of the mean
rf_cv_scores = cross_val_score(rf_model, X, y, **cv_options)
print(f"Random Forest Cross-Validation MAE: {-rf_cv_scores.mean()}")

# GradientBoosting: same procedure
gb_cv_scores = cross_val_score(gb_model, X, y, **cv_options)
print(f"Gradient Boosting Cross-Validation MAE: {-gb_cv_scores.mean()}")


Random Forest Cross-Validation MAE: 166722.4983987383
Gradient Boosting Cross-Validation MAE: 177829.1085754659
