## Import libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

## Load dataset

In [2]:
# Mount Google Drive at /content/drive so the dataset path used in the next
# cell resolves. The google.colab module is presumably only importable inside
# a Colab runtime -- this cell will need replacing when run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the car-price CSV on the mounted Google Drive.
fp = '/content/drive/MyDrive/DSA ICT Data Science/Data/CaseStudy/car_prediction_data (1).csv'

# Read the raw data into a DataFrame.
car = pd.read_csv(filepath_or_buffer=fp)

# Selling_Price is the target: the remaining features are used to predict it.
# Preview the first five rows.
car.head(n=5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


## Basic Data Understanding

In [4]:
# Quick structural overview of the dataset.
# A notebook cell only echoes its *final* expression, so a bare `car.shape`
# on an earlier line is evaluated and silently discarded; print it explicitly.
print("Shape:", car.shape)

# Column dtypes and non-null counts (info() writes straight to stdout).
car.info()

# Summary statistics of the numeric columns; as the last expression this is
# rendered as the cell's output.
car.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


## Check Missing Values

In [5]:
# Count missing entries per column (isna is the canonical name; isnull is its alias).
car.isna().sum()

Unnamed: 0,0
Car_Name,0
Year,0
Selling_Price,0
Present_Price,0
Kms_Driven,0
Fuel_Type,0
Seller_Type,0
Transmission,0
Owner,0


## Feature engineering

In [6]:
# Reference year for the age feature. A fixed constant (rather than the current
# date) keeps Car_Age reproducible from one run to the next.
REFERENCE_YEAR = 2024

# Build the engineered frame without in-place mutation so the cell is
# idempotent: re-running it recomputes Car_Age and, thanks to errors='ignore',
# does not raise KeyError when Car_Name has already been removed.
car = (
    car
    .assign(Car_Age=REFERENCE_YEAR - car['Year'])  # age of the car in years
    .drop(columns='Car_Name', errors='ignore')     # model name is not used as a feature
)

# NOTE(review): Year is still present and is perfectly collinear with Car_Age
# (Car_Age = REFERENCE_YEAR - Year). Consider dropping one of the two; it is
# left in place here so the downstream feature set is unchanged.

## Encoding categorical variables

In [7]:
# Every column still stored with object dtype is treated as categorical.
cat_cols = car.select_dtypes(include='object').columns

# One-hot encode those columns; drop_first=True omits the first level of each
# category so the dummy columns are not redundant with one another.
car = pd.get_dummies(
    data=car,
    columns=cat_cols,
    drop_first=True,
)

## Train–test split

In [8]:
# Target vector: the selling price we want to predict.
y = car['Selling_Price']

# Feature matrix: every remaining column.
X = car.drop(columns='Selling_Price')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Scaling

In [9]:
# Learn the scaling statistics from the training split only, so nothing about
# the test split influences the transformation.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same fitted transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Linear Regression

In [10]:
# Baseline model: ordinary least squares on the scaled training features.
# fit() returns the estimator itself, so construction and fitting can be chained.
lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for the held-out rows.
y_pred_lr = lr_model.predict(X_test_scaled)

# Evaluation metrics on the test split.
lr_r2 = r2_score(y_test, y_pred_lr)
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))

print("Linear Regression")
print("R2:", lr_r2)
print("MAE:", lr_mae)
print("RMSE:", lr_rmse)

Linear Regression
R2: 0.8489813024899069
MAE: 1.2162256821297053
RMSE: 1.8651552135513807


## Random Forest

In [11]:
# Random forest with 300 trees capped at depth 15; the seed fixes the
# bootstrap/feature sampling so results are repeatable.
rfr_model = RandomForestRegressor(n_estimators=300, max_depth=15, random_state=42)
rfr_model.fit(X_train_scaled, y_train)

# Predictions for the held-out rows.
y_pred_rfr = rfr_model.predict(X_test_scaled)

# Evaluation metrics on the test split.
rfr_r2 = r2_score(y_test, y_pred_rfr)
rfr_mae = mean_absolute_error(y_test, y_pred_rfr)
rfr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rfr))

print("\nRandom Forest Regressor")
print("R2:", rfr_r2)
print("MAE:", rfr_mae)
print("RMSE:", rfr_rmse)


Random Forest Regressor
R2: 0.9649806211215927
MAE: 0.5970759562841516
RMSE: 0.8981602652097331


## Save model AND scaler

In [12]:
# Persist the fitted random forest together with the scaler it was trained
# behind: the model was fit on scaled features, so both are needed at
# inference time.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically, even if pickling raises; passing a bare open(...) to
# pickle.dump leaves the handle open until it happens to be garbage collected.
with open("random_forest_model.pkl", "wb") as model_file:
    pickle.dump(rfr_model, model_file)

with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Model and scaler saved successfully!")

Model and scaler saved successfully!
