# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load Dataset


In [None]:
# Read the car listings CSV (path is relative to the working directory)
# and preview the first rows.
DATASET_PATH = "Datasets/car_price_dataset.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

# Preprocessing

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Summary statistics (pandas summarises the numeric columns by default)
df.describe()

In [None]:
# Column dtypes and non-null counts, to decide which columns need encoding or imputation
df.info()

In [6]:
# Replace 'Year' with 'Car_Age' feature
current_year = 2025  # hard-coded reference year used to turn 'Year' into an age
df['Car_Age'] = current_year - df['Year']
X = df.drop(columns=['Price', 'Year'])
y = df['Price']

# Columns that need encoding
categorical_features = ['Brand', 'Model', 'Fuel_Type', 'Transmission']
# Columns that need scaling.
# 'Car_Age' has to be listed here: ColumnTransformer discards every column that
# is not assigned to a transformer (remainder='drop' is its default), so leaving
# it out silently removes the engineered age feature from every model.
numerical_features = ['Engine_Size', 'Mileage', 'Doors', 'Owner_Count', 'Car_Age']

# Preprocessing using ColumnTransformer:
#   numeric columns     -> mean imputation, then standardisation
#   categorical columns -> most-frequent imputation, then one-hot encoding
#                          (first level dropped to avoid a redundant dummy column)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(drop='first'))
        ]), categorical_features)
    ]
)

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training

In [None]:
# Baseline: shared preprocessing followed by ordinary least squares
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

# Train on the training split; the fitted pipeline is shown as the cell output
model.fit(X_train, y_train)

# Predictions

In [None]:
# Score the held-out test set with the baseline model
y_pred = model.predict(X_test)

# Side-by-side view of true and predicted prices (first rows only)
df_results = pd.DataFrame(
    data={
        'Actual': y_test,
        'Predicted': y_pred,
    }
)
df_results.head()

# Model Evaluation

In [None]:
# Test-set metrics for the baseline model; RMSE is derived from MSE
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Report in the order MAE, MSE, RMSE, R^2
print(f"Mean Absolute Error: {mae: .2f}")
print(f"Mean Squared Error: {mse: .2f}")
print(f"Root Mean Squared Error: {rmse: .2f}")
print(f"R^2 Score: {r2: .4f}")

# Visualize Predictions

In [None]:
# Actual vs predicted prices for the baseline model
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, ax=ax)
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual vs Predicted Prices")
plt.show()

# Improving Linear Regression

1.   **Feature Engineering:** Adding new features
2.   **Polynomial Regression:** Adding non-linearity
3.   **Regularization:** Ridge and Lasso Regression

#### Polynomial Regression

In [11]:
# Apply Polynomial Regression to numerical features
preprocessor = ColumnTransformer(
    transformers=[
                    ('num', Pipeline([
                        ('imputer', SimpleImputer(strategy='mean')),
                        ('scaler', StandardScaler()),
                        ('poly', PolynomialFeatures(degree=2, include_bias=False))
                    ]), numerical_features),
                    ('cat', Pipeline([
                        ('imputer', SimpleImputer(strategy='most_frequent')),
                        ('onehot', OneHotEncoder(drop='first'))
                    ]), categorical_features)
])

In [None]:
# Polynomial-feature preprocessing followed by ordinary least squares
poly_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

# Train on the training split; the fitted pipeline is shown as the cell output
poly_model.fit(X_train, y_train)

In [None]:
# Score the held-out test set with the polynomial model
poly_y_pred = poly_model.predict(X_test)

# Side-by-side view of true and predicted prices (first rows only)
df_results = pd.DataFrame(
    data={
        'Actual': y_test,
        'Predicted': poly_y_pred,
    }
)
df_results.head()

#### Regularization: Ridge

In [14]:
# Numeric branch: impute the column mean, then standardise
ridge_num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: impute the most frequent level -> one-hot, first level dropped
ridge_cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first')),
])

# Rebinds `preprocessor` without the polynomial step for the Ridge model
preprocessor = ColumnTransformer(
    transformers=[
        ('num', ridge_num_pipeline, numerical_features),
        ('cat', ridge_cat_pipeline, categorical_features),
    ]
)

In [None]:
# Shared preprocessing followed by Ridge regression with alpha=10
ridge_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', Ridge(alpha=10)),
    ]
)

# Train on the training split; the fitted pipeline is shown as the cell output
ridge_model.fit(X_train, y_train)

In [None]:
# Score the held-out test set with the Ridge model
ridge_y_pred = ridge_model.predict(X_test)

# Side-by-side view of true and predicted prices (first rows only)
df_results = pd.DataFrame(
    data={
        'Actual': y_test,
        'Predicted': ridge_y_pred,
    }
)
df_results.head()

#### Regularization: Lasso

In [17]:
# Numeric branch: impute the column mean, then standardise
lasso_num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: impute the most frequent level -> one-hot, first level dropped
lasso_cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first')),
])

# Rebinds `preprocessor` for the Lasso model
preprocessor = ColumnTransformer(
    transformers=[
        ('num', lasso_num_pipeline, numerical_features),
        ('cat', lasso_cat_pipeline, categorical_features),
    ]
)

In [None]:
# Shared preprocessing followed by Lasso regression with alpha=10
lasso_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', Lasso(alpha=10)),
    ]
)

# Train on the training split; the fitted pipeline is shown as the cell output
lasso_model.fit(X_train, y_train)

In [None]:
# Score the held-out test set with the Lasso model
lasso_y_pred = lasso_model.predict(X_test)

# Side-by-side view of true and predicted prices (first rows only)
df_results = pd.DataFrame(
    data={
        'Actual': y_test,
        'Predicted': lasso_y_pred,
    }
)
df_results.head()

# Improved Models Evaluation

In [None]:
# R^2 on the same test split for each improved model, in the order
# polynomial, ridge, lasso
poly_r2, ridge_r2, lasso_r2 = (
    r2_score(y_test, predictions)
    for predictions in (poly_y_pred, ridge_y_pred, lasso_y_pred)
)

print(f"Polynomial Features Model R^2 Score: {poly_r2: .4f}")
print(f"Ridge R^2 Score: {ridge_r2: .4f}")
print(f"Lasso R^2 Score: {lasso_r2: .4f}")

In [None]:
# One actual-vs-predicted scatter per improved model, stacked vertically
y_pred_arr = [poly_y_pred, ridge_y_pred, lasso_y_pred]
models = ["Polynomial Features", "Ridge", "Lasso"]

plt.figure(figsize=(10, 30))
for panel, (label, predicted) in enumerate(zip(models, y_pred_arr), start=1):
    # Subplot positions are 1-based, hence start=1
    plt.subplot(3, 1, panel)
    sns.scatterplot(x=y_test, y=predicted, alpha=0.6)
    plt.xlabel("Actual Prices")
    plt.ylabel("Predicted Prices")
    plt.title(f"Actual vs Predicted Prices {label}")
plt.show()