## Import Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

## Load Preprocessed Data

In [2]:
df = pd.read_csv("processed_data.csv")

In [3]:
# Save brands details for further use to filter out cars in web application.
brands = {i:list(j.car.unique())for i,j in df.groupby('brand')}
df.drop("brand",axis=1,inplace=True)   # remove brands from data

### Feature Type Analysis

In [4]:
numerical_features = list(df.describe().columns)
categorical_feature = list(df.describe(include='O').columns)
numerical_features.remove('price')
print(f"{numerical_features = }\n{categorical_feature = }")

numerical_features = ['kms_driven', 'age']
categorical_feature = ['car', 'ownership', 'fuel_type']


## Feature Scaling

In [5]:
# Perform Scaling (Standardization or normalization based on user input)
option = int(input('Select the Feature Scaling method :\nStandardization --- 1\nNormalization --- 2\n'))
if option == 2:
    scaler = MinMaxScaler()
    print('Applied MinMax Scaler')
elif option == 1:
    scaler = StandardScaler()
    print('Applied Standard Scaler')

df[numerical_features] = scaler.fit_transform(df[numerical_features])

Select the Feature Scaling method :
Standardization --- 1
Normalization --- 2
 1


Applied Standard Scaler


### Saving the Scaler 

In [6]:
# Ensures the directory exists; creates it if missing
artifact_dir = os.path.join('..','artifacts')
os.makedirs(artifact_dir, exist_ok=True)

sclaler_path = os.path.join(artifact_dir,'scaler.pkl')
pickle.dump(scaler, open(sclaler_path,'wb'))

brands_path = os.path.join(artifact_dir,'brands.pkl')
pickle.dump(brands, open(brands_path,'wb'))

### Categorical Features Encoding

In [7]:
df = pd.get_dummies(df,dtype=int)
df.head()

Unnamed: 0,kms_driven,price,age,car_800,car_Accent,car_Alto,car_Amaze,car_Ameo,car_Baleno,car_Beat,...,car_Xylo,car_Zen,car_Zest,car_i10,car_i20,ownership_First Owner,ownership_Second Owner,ownership_Third Owner,fuel_type_Diesel,fuel_type_Petrol
0,-1.316777,1283000.0,-1.791035,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,-1.013419,777000.0,-1.034672,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,-0.049796,515000.0,-0.53043,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,-0.494071,766000.0,-0.782551,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,-0.630117,758000.0,-0.782551,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


## Model Training

In [8]:
# Dependent features and targets
x = df.drop('price',axis=1)
y = np.log(df.price)

# Data Split into train-test
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=.8)

# Creating a Linear Regression model instance
model = LinearRegression()

# Fitting the model with training data
model.fit(x_train,y_train)

train_accuracy = model.score(x_train,y_train)*100   # Calculating the model's accuracy on the training data (R² Score)

# Calculating the R² Score (test accuracy) on the test set
y_predicted = model.predict(x_test)
test_accuracy = r2_score(y_test,y_predicted)*100

rmse = np.sqrt(mean_squared_error(y_test,y_predicted))

# Printing the evaluation metrics
print(f'Training Accuracy : {train_accuracy:.2f} %\nTest Accuracy : {test_accuracy:.2f} %\n\
Root Mean Squared Error : {rmse:.2f}')

Training Accuracy : 90.48 %
Test Accuracy : 90.86 %
Root Mean Squared Error : 0.21


### Cross Validation

In [9]:
# Performing cross-validation and collecting the scores
cv = KFold()
cross_validation_scores = cross_val_score(model,x,y)
avg_cross_cal_score = np.mean(cross_validation_scores)*100
print(f'Cross Validation Score : {avg_cross_cal_score:.2f}')

Cross Validation Score : 89.60


### Final Model Training

In [10]:
#final_model training on whole data
final_model = LinearRegression()
final_model.fit(x,y)
final_score = final_model.score(x,y)*100
print(f"Final Model Accuracy : {final_score:.2f}")

Final Model Accuracy : 90.57


## Saving Trained Model

In [11]:
# Folder path for saving the trained model
model_dir = os.path.join("..","model")

# Ensures the directory exists; creates it if missing
os.makedirs(model_dir,exist_ok=True)   

model_path = os.path.join(model_dir,'saved_model.pkl')
pickle.dump(final_model, open(model_path,'wb'))

-----
