In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# ---------------- Step 1: Load Dataset ----------------
# A forward slash is accepted as a path separator on Windows, Linux and macOS.
# A backslash is a separator on Windows only, so it would make the path
# unresolvable on other platforms.
data = pd.read_csv("data/CarPrice_Assignment.csv")  # adjust path if needed
print("Step 1: Dataset loaded successfully (first 5 rows):\n", data.head(5), "\n")
print("Step 1: Dataset shape (rows, columns):", data.shape, "\n")

# ---------------- Step 2: Drop unnecessary columns ----------------
# 'car_ID' and 'CarName' are removed here; neither appears in the feature
# lists used by the later steps.
data = data.drop(columns=['car_ID', 'CarName'])
print("Step 2: Dropped 'car_ID' and 'CarName' (first 5 rows):\n", data.head(5), "\n")

# ---------------- Step 3: Define numeric and categorical features ----------------
# Name of the column to predict.
target = 'price'

# Columns routed to the numeric preprocessing branch.
num_features = [
    'symboling',
    'wheelbase',
    'carlength',
    'carwidth',
    'carheight',
    'curbweight',
    'enginesize',
    'boreratio',
    'stroke',
    'compressionratio',
    'horsepower',
    'peakrpm',
    'citympg',
    'highwaympg',
]

# Columns routed to the categorical preprocessing branch.
cat_features = [
    'fueltype',
    'aspiration',
    'doornumber',
    'carbody',
    'drivewheel',
    'enginelocation',
    'enginetype',
    'cylindernumber',
    'fuelsystem',
]

# Echo the configuration so the run log records which columns were used.
print("Step 3: Numeric features:", num_features, "\n")
print("Step 3: Categorical features:", cat_features, "\n")
print("Step 3: Target variable:", target, "\n")

# ---------------- Step 4: Create preprocessing pipelines ----------------
# Numeric branch: fill missing values with the column mean, then standardize.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored instead of
# raising, and the encoder emits a dense array.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its branch. set_output returns the estimator
# itself, so the configured transformer can be bound in one expression;
# transform results then come back as pandas DataFrames.
preprocessor = ColumnTransformer([
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
]).set_output(transform="pandas")

# ---------------- Step 5: Apply preprocessing ----------------
# Pick the train/test rows BEFORE fitting, so the imputer fill values, the
# scaler mean/std and the one-hot categories are learned from training rows
# only. Fitting on every row would leak test-set statistics into the features.
# Without stratification the shuffle is driven by the row count and
# random_state, so splitting row positions selects the same rows that
# splitting X and y directly would.
train_rows, test_rows = train_test_split(
    np.arange(len(data)), test_size=0.2, random_state=42
)
preprocessor.fit(data.iloc[train_rows])

# X still covers every row, but is transformed with the training-fitted
# statistics; the train/test frames are sliced from it in Step 7.
X = preprocessor.transform(data)
print("Step 5: Preprocessed input features (first 5 rows):\n", X.head(5), "\n")

# Double brackets keep the target as a one-column DataFrame.
y = data[[target]]
print("Step 5: Target variable (first 5 rows):\n", y.head(5), "\n")

# ---------------- Step 6: Feature engineering ----------------
# Engineered features must be derived from predictors only: a ratio built
# from the target would place the value being predicted inside X. This
# power-to-weight style ratio uses the raw (unscaled) columns; the + 1 in the
# denominator guards against a zero curb weight.
X['horsepower_per_curbweight'] = data['horsepower'] / (data['curbweight'] + 1)
print("Step 6: After feature engineering (first 5 rows):\n", X.head(5), "\n")

# ---------------- Step 7: Train-test split ----------------
# Reuse the row positions chosen in Step 5 so features and target stay aligned
# with the rows the preprocessor was fitted on.
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

print("Step 7: X_train sample (first 5 rows):\n", X_train.head(5), "\n")
print("Step 7: y_train sample (first 5 rows):\n", y_train.head(5), "\n")
print("Step 7: X_test sample (first 5 rows):\n", X_test.head(5), "\n")
print("Step 7: y_test sample (first 5 rows):\n", y_test.head(5), "\n")


Step 1: Dataset loaded successfully (first 5 rows):
    car_ID  symboling                   CarName fueltype aspiration doornumber  \
0       1          3        alfa-romero giulia      gas        std        two   
1       2          3       alfa-romero stelvio      gas        std        two   
2       3          1  alfa-romero Quadrifoglio      gas        std        two   
3       4          2               audi 100 ls      gas        std       four   
4       5          2                audi 100ls      gas        std       four   

       carbody drivewheel enginelocation  wheelbase  ...  enginesize  \
0  convertible        rwd          front       88.6  ...         130   
1  convertible        rwd          front       88.6  ...         130   
2    hatchback        rwd          front       94.5  ...         152   
3        sedan        fwd          front       99.8  ...         109   
4        sedan        4wd          front       99.4  ...         136   

   fuelsystem  boreratio  s