In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming df is your dataframe
train_data=pd.read_csv("train_preprocessed.csv")
test_data=pd.read_csv("test_preprocessed.csv")

df=train_data.combine_first(test_data)
# Log-transform skewed numerical features
df['log_horsepower'] = np.log(df['horsepower'])
df['log_enginesize'] = np.log(df['enginesize'])
df['log_curbweight'] = np.log(df['curbweight'])

# Interaction terms
df['horsepower_enginesize'] = df['horsepower'] * df['enginesize']
df['curbweight_carheight'] = df['curbweight'] * df['carheight']
df['wheelbase_length_width'] = df['wheelbase'] * df['carlength'] * df['carwidth']
df['engine_complex'] = df['boreratio'] * df['stroke'] * df['compressionratio']

# Polynomial features
df['horsepower_squared'] = df['horsepower'] ** 2
df['enginesize_squared'] = df['enginesize'] ** 2
df['carheight_squared'] = df['carheight'] ** 2
df['carwidth_squared'] = df['carwidth'] ** 2
df['curbweight_squared'] = df['curbweight'] ** 2

# Fuel efficiency metric
df['average_mpg'] = (df['citympg'] + df['highwaympg']) / 2

# One-hot encode categorical variables
df = pd.get_dummies(df, columns=['drivewheel', 'enginelocation', 'symboling'], drop_first=True)

# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight', 'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg', 'brandavg', 'log_horsepower', 'log_enginesize', 'log_curbweight', 'horsepower_enginesize', 'curbweight_carheight', 'wheelbase_length_width', 'engine_complex', 'horsepower_squared', 'enginesize_squared', 'carheight_squared', 'carwidth_squared', 'curbweight_squared', 'average_mpg']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# The df now contains the engineered features


In [13]:
df.corr(numeric_only=True)['price'].sort_values(ascending=False)

price                     1.000000
horsepower_enginesize     0.918372
enginesize                0.898726
brandavg                  0.891223
enginesize_squared        0.878693
log_enginesize            0.870582
horsepower_squared        0.846073
curbweight_squared        0.845796
horsepower                0.832665
curbweight                0.828523
cylindernumber            0.804266
log_curbweight            0.801185
log_horsepower            0.792875
curbweight_carheight      0.792581
carwidth_squared          0.752913
carwidth                  0.748764
wheelbase_length_width    0.722638
carlength                 0.701202
drivewheel_1              0.651069
wheelbase                 0.612687
boreratio                 0.566276
enginelocation_1          0.330171
symboling_3               0.188225
carheight                 0.187869
carheight_squared         0.187385
symboling_0               0.148647
symboling_-1              0.146695
stroke                    0.079915
engine_complex      