# Lab Question
1. Linear Regression and Feature Engineering
Using any dataset of your choice (e.g., housing prices, car data, salary data, etc.), complete the
following tasks:

(a) Data Preparation: Load the dataset, identify numeric and categorical columns, and handle
missing values appropriately.

(b) Feature Engineering:
• Encode categorical features using one-hot encoding.
• Create at least two new engineered features (e.g., ratios, interaction terms, polynomial
features).
• Remove features with correlation greater than 0.8.

(c) Model Building: Build two linear regression models:
• Model 1: Uses original raw features.
• Model 2: Uses engineered features (scaled, encoded, and newly created).

(d) Evaluation: For each model, compute $R^2$ and MSE.

(e) Interpretation: Provide short answers describing:
• Which model performed better and why.
• Which engineered features contributed most.
• How scaling or encoding affected performance.

In [None]:
 import kagglehub
import os

# Kaggle API credentials (KAGGLE_USERNAME and KAGGLE_KEY) must already be
# stored as secrets in Colab's secret manager when this cell runs.

# kagglehub fetches the dataset (using the credentials from secrets) and
# hands back the local directory that holds the downloaded files.
dataset_handle = "amjadzhour/car-price-prediction"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/amjadzhour/car-price-prediction?dataset_version_number=1...


100%|██████████| 19.4k/19.4k [00:00<00:00, 22.0MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/amjadzhour/car-price-prediction/versions/1





In [None]:
import os

# Reuse the download directory returned by the previous cell and list what it contains.
dataset_path = path
dataset_files = os.listdir(dataset_path)
print(dataset_files)

['Car_Price_Prediction.csv']


In [None]:
import pandas as pd
import os

# Locate the single CSV inside the download directory and load it into a DataFrame.
csv_name = 'Car_Price_Prediction.csv'
file_path = os.path.join(dataset_path, csv_name)
df = pd.read_csv(file_path)


In [None]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,Make,Model,Year,Engine Size,Mileage,Fuel Type,Transmission,Price
0,Honda,Model B,2015,3.9,74176,Petrol,Manual,30246.207931
1,Ford,Model C,2014,1.7,94799,Electric,Automatic,22785.747684
2,BMW,Model B,2006,4.1,98385,Electric,Manual,25760.290347
3,Honda,Model B,2015,2.6,88919,Electric,Automatic,25638.003491
4,Honda,Model C,2004,3.4,138482,Petrol,Automatic,21021.386657


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(1000, 8)

In [None]:
# Count missing values per column to decide whether any imputation is needed.
df.isna().sum()

Unnamed: 0,0
Make,0
Model,0
Year,0
Engine Size,0
Mileage,0
Fuel Type,0
Transmission,0
Price,0


In [None]:
# Column dtypes: the non-numeric (object) columns are the categorical ones to encode.
df.dtypes

Unnamed: 0,0
Make,object
Model,object
Year,int64
Engine Size,float64
Mileage,int64
Fuel Type,object
Transmission,object
Price,float64


In [None]:
import numpy as np

# Identify the categorical (non-numeric) columns of the raw frame.
categorical_cols = df.select_dtypes(exclude='number').columns.tolist()

# One-hot encode the categorical columns into a NEW frame so `df` stays untouched
# and this cell can be re-run safely. `df_encoded` was previously never created
# anywhere in the notebook, so this cell failed on a fresh kernel.
# drop_first=True drops one level per category, which avoids perfectly collinear
# dummy columns in the linear model.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Get a list of all boolean columns (recent pandas emits dummies as bool)
boolean_cols = df_encoded.select_dtypes(include='bool').columns

# Convert boolean columns to integer (1 and 0)
df_encoded[boolean_cols] = df_encoded[boolean_cols].astype(int)

print("DataFrame after converting boolean columns to 0/1:")
display(df_encoded.head())

DataFrame after converting boolean columns to 0/1:


Unnamed: 0,Year,Engine Size,Mileage,Price,Make_BMW,Make_Ford,Make_Honda,Make_Toyota,Model_Model B,Model_Model C,Model_Model D,Model_Model E,Fuel Type_Electric,Fuel Type_Petrol,Transmission_Manual
0,2015,3.9,74176,30246.207931,0,0,1,0,1,0,0,0,0,1,1
1,2014,1.7,94799,22785.747684,0,1,0,0,0,1,0,0,1,0,0
2,2006,4.1,98385,25760.290347,1,0,0,0,1,0,0,0,1,0,1
3,2015,2.6,88919,25638.003491,0,0,1,0,1,0,0,0,1,0,0
4,2004,3.4,138482,21021.386657,0,0,1,0,0,1,0,0,0,1,0


In [None]:
import numpy as np

# NOTE(review): the recorded output of this cell lists 'Car_Age', which is only
# created by the feature-engineering cell further down. Run that cell first (or
# move it above this one); otherwise a top-to-bottom run filters a frame that
# does not yet contain the engineered features.

CORR_THRESHOLD = 0.8   # drop one feature of any pair whose |correlation| exceeds this
TARGET_COL = 'Price'   # the target must never be screened or dropped

# Absolute pairwise correlations between PREDICTORS only. Including the target
# here could cause 'Price' itself to be dropped whenever an earlier column
# correlates strongly with it.
corr_matrix = df_encoded.drop(columns=TARGET_COL).corr().abs()

# Keep the strict upper triangle so each pair is inspected once and the
# diagonal (self-correlation of 1.0) is ignored.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
uppertri_corr = corr_matrix.where(upper_mask)

# A column is dropped when it is highly correlated with any column before it.
to_drop = [
    column
    for column in uppertri_corr.columns
    if (uppertri_corr[column] > CORR_THRESHOLD).any()
]

print(f"Original DataFrame shape: {df_encoded.shape}")
print(f"Features to drop due to high correlation (> {CORR_THRESHOLD}): {to_drop}")

# Drop features into a new frame; df_encoded is left intact and 'Price' is kept.
df_filtered = df_encoded.drop(columns=to_drop)

print(f"DataFrame shape after dropping highly correlated features: {df_filtered.shape}")
print("DataFrame after removing highly correlated features (first 5 rows):")
display(df_filtered.head())

Original DataFrame shape: (1000, 17)
Features to drop due to high correlation (> 0.8): ['Car_Age']
DataFrame shape after dropping highly correlated features: (1000, 16)
DataFrame after removing highly correlated features (first 5 rows):


Unnamed: 0,Year,Engine Size,Mileage,Price,Make_BMW,Make_Ford,Make_Honda,Make_Toyota,Model_Model B,Model_Model C,Model_Model D,Model_Model E,Fuel Type_Electric,Fuel Type_Petrol,Transmission_Manual,Mileage_Per_Engine_Size
0,2015,3.9,74176,30246.207931,0,0,1,0,1,0,0,0,0,1,1,19019.487179
1,2014,1.7,94799,22785.747684,0,1,0,0,0,1,0,0,1,0,0,55764.117647
2,2006,4.1,98385,25760.290347,1,0,0,0,1,0,0,0,1,0,1,23996.341463
3,2015,2.6,88919,25638.003491,0,0,1,0,1,0,0,0,1,0,0,34199.615385
4,2004,3.4,138482,21021.386657,0,0,1,0,0,1,0,0,0,1,0,40730.0


In [None]:
# 1. Engineer 'Car_Age' feature
# A fixed reference year (rather than today's date) keeps reruns reproducible.
# NOTE(review): Car_Age is an exact linear function of 'Year', so the pair is
# perfectly correlated and a 0.8 correlation cutoff will always discard one of
# them. Consider a different second engineered feature.
current_year = 2024  # Assuming current year is 2024
df_encoded['Car_Age'] = current_year - df_encoded['Year']

# 2. Engineer 'Mileage_Per_Engine_Size' feature
# Verify the divisor instead of assuming it: a zero engine size would produce
# inf in the ratio, which later breaks the correlation filter and model fit.
engine_size = df_encoded['Engine Size']
assert (engine_size > 0).all(), (
    "Engine Size must be strictly positive to compute Mileage_Per_Engine_Size"
)
df_encoded['Mileage_Per_Engine_Size'] = df_encoded['Mileage'] / engine_size

print("DataFrame after engineering new features:")
display(df_encoded.head())

DataFrame after engineering new features:


Unnamed: 0,Year,Engine Size,Mileage,Price,Make_BMW,Make_Ford,Make_Honda,Make_Toyota,Model_Model B,Model_Model C,Model_Model D,Model_Model E,Fuel Type_Electric,Fuel Type_Petrol,Transmission_Manual,Car_Age,Mileage_Per_Engine_Size
0,2015,3.9,74176,30246.207931,0,0,1,0,1,0,0,0,0,1,1,9,19019.487179
1,2014,1.7,94799,22785.747684,0,1,0,0,0,1,0,0,1,0,0,10,55764.117647
2,2006,4.1,98385,25760.290347,1,0,0,0,1,0,0,0,1,0,1,18,23996.341463
3,2015,2.6,88919,25638.003491,0,0,1,0,1,0,0,0,1,0,0,9,34199.615385
4,2004,3.4,138482,21021.386657,0,0,1,0,0,1,0,0,0,1,0,20,40730.0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Target is 'Price'; every other column of the filtered frame is a predictor.
# NOTE(review): the lab brief asks Model 1 to use the original raw features, but
# df_filtered already contains encoded and engineered columns. Confirm intent.
y = df_filtered['Price']
X = df_filtered.drop(columns='Price')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each partition.
for label, part in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(label, part.shape)

Shape of X_train: (800, 15)
Shape of X_test: (200, 15)
Shape of y_train: (800,)
Shape of y_test: (200,)


In [None]:
# Instantiate the Linear Regression model (unfitted here; fitting is done in a separate cell)
model = LinearRegression()


Linear Regression model trained successfully.


In [None]:
# Fit the linear regression on the unscaled training split.
model.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out test rows
y_pred = model.predict(X_test)

# Test-set score reported by the model, shown as a percentage
100 * model.score(X_test, y_test)

81.70804851050565

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate features and target from the filtered DataFrame
features_unscaled = df_filtered.drop('Price', axis=1)
y_scaled = df_filtered['Price']

# Row positions of the training partition. test_size and random_state MUST match
# the train_test_split call applied to X_scaled afterwards: the shuffle depends
# only on the number of rows and the seed, so the same rows end up in training.
train_pos, test_pos = train_test_split(
    np.arange(len(features_unscaled)), test_size=0.2, random_state=42
)

# Fit the scaler on training rows only. Fitting on every row (as before) lets
# the test set's mean/variance leak into preprocessing.
scaler = StandardScaler()
scaler.fit(features_unscaled.iloc[train_pos])

# Apply the training-set statistics to all rows
X_scaled_data = scaler.transform(features_unscaled)

# Convert the scaled features back to a DataFrame for better readability
X_scaled = pd.DataFrame(
    X_scaled_data,
    columns=features_unscaled.columns,
    index=features_unscaled.index,
)

print("DataFrame with scaled features (first 5 rows):")
display(X_scaled.head())

DataFrame with scaled features (first 5 rows):


Unnamed: 0,Year,Engine Size,Mileage,Make_BMW,Make_Ford,Make_Honda,Make_Toyota,Model_Model B,Model_Model C,Model_Model D,Model_Model E,Fuel Type_Electric,Fuel Type_Petrol,Transmission_Manual,Mileage_Per_Engine_Size
0,0.686031,1.076274,-0.387368,-0.465344,-0.538816,2.012587,-0.479596,1.927947,-0.507801,-0.495308,-0.474858,-0.693889,1.421671,0.978237,-0.656672
1,0.526933,-1.072952,-0.040282,-0.465344,1.855921,-0.496873,-0.479596,-0.518686,1.969276,-0.495308,-0.474858,1.441153,-0.703398,-1.022247,0.417647
2,-0.745852,1.271658,0.02007,2.148948,-0.538816,-0.496873,-0.479596,1.927947,-0.507801,-0.495308,-0.474858,1.441153,-0.703398,0.978237,-0.511161
3,0.686031,-0.193723,-0.139243,-0.465344,-0.538816,2.012587,-0.479596,1.927947,-0.507801,-0.495308,-0.474858,1.441153,-0.703398,-1.022247,-0.212844
4,-1.064048,0.587813,0.694904,-0.465344,-0.538816,2.012587,-0.479596,-0.518686,1.969276,-0.495308,-0.474858,-0.693889,1.421671,-1.022247,-0.021912


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Partition the standardized features and target: 20% held out, fixed seed
scaled_parts = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = scaled_parts

# Report the size of each partition.
for label, part in (
    ("Shape of X_train_scaled:", X_train_scaled),
    ("Shape of X_test_scaled:", X_test_scaled),
    ("Shape of y_train_scaled:", y_train_scaled),
    ("Shape of y_test_scaled:", y_test_scaled),
):
    print(label, part.shape)

Shape of X_train_scaled: (800, 15)
Shape of X_test_scaled: (200, 15)
Shape of y_train_scaled: (800,)
Shape of y_test_scaled: (200,)


In [None]:
# Create a fresh Linear Regression estimator and fit it on the scaled training split
# (fit returns the estimator itself, so construction and training can be chained).
scaled_model = LinearRegression().fit(X_train_scaled, y_train_scaled)

print("Linear Regression model trained successfully on scaled data.")

Linear Regression model trained successfully on scaled data.


In [None]:
# Predict prices for the held-out rows of the scaled test split
y_pred_scaled = scaled_model.predict(X_test_scaled)

# Test-set score of the scaled-data model, shown as a percentage
100 * scaled_model.score(X_test_scaled, y_test_scaled)



81.70804851049907

In [None]:
# Compute the evaluation metrics here: r2 / mse / r2_scaled / mse_scaled were
# never defined in any cell, so this report failed on a fresh kernel.
# Each model is scored on its own held-out test split using the predictions
# produced in the earlier prediction cells.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2_scaled = r2_score(y_test_scaled, y_pred_scaled)
mse_scaled = mean_squared_error(y_test_scaled, y_pred_scaled)

print("Metrics for Model trained on Unscaled Data:")
print(f"  R-squared (R2): {r2:.2f}")
print(f"  Mean Squared Error (MSE): {mse:.2f}")
print("\nMetrics for Model trained on Scaled Data:")
print(f"  R-squared (R2): {r2_scaled:.2f}")
print(f"  Mean Squared Error (MSE): {mse_scaled:.2f}")

Metrics for Model trained on Unscaled Data:
  R-squared (R2): 0.82
  Mean Squared Error (MSE): 5005900.77

Metrics for Model trained on Scaled Data:
  R-squared (R2): 0.82
  Mean Squared Error (MSE): 5005900.77
