# **Golden Project: Car Price Prediction**

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Read the raw car listings; every later step in this cell derives from `df`.
df = pd.read_csv('car_price.csv')

# Data Exploration
# Three quick views of the raw frame: a sample of rows, summary statistics
# for the numeric columns, and a per-column count of missing values.
# Each heading and its table are emitted on separate lines via sep="\n".
print("First few rows of the dataset:", df.head(), sep="\n")

print("\nSummary of the dataset:", df.describe(), sep="\n")

print("\nMissing values in each column:", df.isna().sum(), sep="\n")

# Feature Engineering
# Reference year used to turn the model 'year' into an age. It is a fixed,
# named constant (not the current date) so that re-running the notebook later
# reproduces exactly the same feature values.
REFERENCE_YEAR = 2024

# Columns that are one-hot encoded below.
CATEGORICAL_COLUMNS = ['fuel', 'seller_type', 'transmission', 'owner']

# 1. car_age: years between REFERENCE_YEAR and the 'year' column.
# 2. log_km_driven: log(1 + km_driven) to compress the long right tail.
#    np.log1p is the numerically accurate form of np.log(x + 1).
#    NOTE: the raw 'km_driven' column is intentionally left in place, so the
#    feature matrix contains both the raw and the log-transformed mileage.
df = df.assign(
    car_age=REFERENCE_YEAR - df['year'],
    log_km_driven=np.log1p(df['km_driven']),
)

# 3. One-hot encode the categorical columns; drop_first=True removes one level
#    per column so the indicator columns are not perfectly collinear.
df = pd.get_dummies(df, columns=CATEGORICAL_COLUMNS, drop_first=True)

# 4. 'name' (the free-text car model string) is not used as a feature here for
#    simplicity; it could be label-encoded if it is needed later. 'year' is
#    dropped because 'car_age' already carries the same information.
#    Reassignment is used instead of inplace=True so the step reads as a plain
#    transformation of `df`.
df = df.drop(columns=['name', 'year'])

# Define features (X) and target variable (y)
X = df.drop(columns=['selling_price'])
y = df['selling_price']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Selection and Training
# Baseline: a 100-tree random forest with a fixed seed. `fit` returns the
# estimator itself, so construction and training are chained.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

# Model Evaluation
# MAE: mean absolute difference between actual and predicted selling price.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'\nMean Absolute Error: {mae}')

# RMSE: square root of the mean squared error, which penalises large misses
# more heavily than MAE does.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(f'Root Mean Squared Error: {rmse}')

# 5-fold cross-validation over the full feature matrix. The scorer is the
# *negated* MAE (scikit-learn scorers are "higher is better"), so every value
# printed here is <= 0.
cv_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print(f'\nCross-validation scores (Negative MAE): {cv_scores}')
print(f'Mean Cross-validation Score: {np.mean(cv_scores)}')



First few rows of the dataset:
                       name  year  selling_price  km_driven    fuel  \
0             Maruti 800 AC  2007          60000      70000  Petrol   
1  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol   
2      Hyundai Verna 1.6 SX  2012         600000     100000  Diesel   
3    Datsun RediGO T Option  2017         250000      46000  Petrol   
4     Honda Amaze VX i-DTEC  2014         450000     141000  Diesel   

  seller_type transmission         owner  
0  Individual       Manual   First Owner  
1  Individual       Manual   First Owner  
2  Individual       Manual   First Owner  
3  Individual       Manual   First Owner  
4  Individual       Manual  Second Owner  

Summary of the dataset:
              year  selling_price      km_driven
count  4340.000000   4.340000e+03    4340.000000
mean   2013.090783   5.041273e+05   66215.777419
std       4.215344   5.785487e+05   46644.102194
min    1992.000000   2.000000e+04       1.000000
25%    2011.00

In [3]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Hyperparameters shared by the hold-out model and the cross-validated
# pipeline, so the two evaluations describe the same estimator.
GBR_PARAMS = {'n_estimators': 100, 'random_state': 42}

# Split the *unscaled* features first. Fitting the scaler before splitting
# would let the mean/variance of the test rows leak into preprocessing.
# This cell relies on `X` and `y` produced by the feature-engineering cell.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling features: statistics are learned from the training rows only and
# then applied unchanged to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-only statistics. Kept so the
# `X_scaled` name stays available; it is no longer used for model selection.
X_scaled = scaler.transform(X)

# Train a Gradient Boosting model
model = GradientBoostingRegressor(**GBR_PARAMS)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Model Evaluation
mae = mean_absolute_error(y_test, y_pred)
print(f'\nMean Absolute Error: {mae}')

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error: {rmse}')

# Cross-validation for model robustness.
# The scaler lives inside a pipeline and is cross-validated on the raw `X`,
# so it is re-fit on the training portion of every fold instead of being fit
# once on all rows. cross_val_score clones the pipeline per fold, leaving the
# fitted `model` above untouched.
# NOTE(review): tree ensembles are largely insensitive to per-feature scaling,
# so the reported numbers are expected to be essentially unchanged; the point
# is that the evaluation stays leak-free if the estimator is swapped later.
cv_pipeline = make_pipeline(StandardScaler(), GradientBoostingRegressor(**GBR_PARAMS))
cv_scores = cross_val_score(cv_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
print(f'\nCross-validation scores (Negative MAE): {cv_scores}')
print(f'Mean Cross-validation Score: {cv_scores.mean()}')



Mean Absolute Error: 175772.53803690788
Root Mean Squared Error: 400198.3959106852

Cross-validation scores (Negative MAE): [-176532.4726987  -187321.0524059  -174721.89809838 -158019.92909149
 -193293.12593755]
Mean Cross-validation Score: -177977.6956464046
