In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import statsmodels.formula.api as smf

import warnings

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("CarPrice_Assignment.csv")

In [3]:
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [5]:
df.drop_duplicates(inplace = True)

In [6]:
df.shape

(205, 26)

In [7]:
df.nunique()

car_ID              205
symboling             6
CarName             147
fueltype              2
aspiration            2
doornumber            2
carbody               5
drivewheel            3
enginelocation        2
wheelbase            53
carlength            75
carwidth             44
carheight            49
curbweight          171
enginetype            7
cylindernumber        7
enginesize           44
fuelsystem            8
boreratio            38
stroke               37
compressionratio     32
horsepower           59
peakrpm              23
citympg              29
highwaympg           30
price               189
dtype: int64

In [8]:
df.drop(columns=["car_ID"], inplace=True)

In [9]:
# Extract brand from CarName and fix common typos seen in this dataset
df["CarBrand"] = df["CarName"].str.split().str[0].str.lower()
brand_map = {
    "toyouta":"toyota", "Nissan":"nissan", "maxda":"mazda",
    "vokswagen":"volkswagen", "vw":"volkswagen", "porcshce":"porsche"
}
df["CarBrand"] = df["CarBrand"].replace(brand_map)
df.drop(columns=["CarName"], inplace=True)

In [10]:
df.head()

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,CarBrand
0,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0,alfa-romero
1,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0,alfa-romero
2,1,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0,alfa-romero
3,2,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0,audi
4,2,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0,audi


In [11]:
# 3) Define target and features
X = df.drop(columns=["price"])
y = df["price"].values

In [12]:
# 4) Identify column types
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()

In [13]:
# 5) Preprocess: scale numeric, one‑hot categorical
from sklearn.preprocessing import OneHotEncoder
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_cols),
    ]
)

In [14]:
df.head()

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,CarBrand
0,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0,alfa-romero
1,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0,alfa-romero
2,1,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0,alfa-romero
3,2,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0,audi
4,2,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0,audi


In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [16]:
# 7) Pipeline: preprocessing + linear regression
model = Pipeline(steps=[
    ("prep", preprocess),
    ("linreg", LinearRegression())
])

model.fit(X_train, y_train)

0,1,2
,steps,"[('prep', ...), ('linreg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [None]:
import math

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
mae  = mean_absolute_error(y_test, y_pred)
r2   = r2_score(y_test, y_pred)

print(f"Test RMSE: {rmse:.2f}")
print(f"Test MAE : {mae:.2f}")
print(f"Test R²  : {r2:.3f}")

Test RMSE: 2730.67
Test MAE : 1850.56
Test R²  : 0.892


In [18]:
# 9) 5‑fold cross‑validation R²
cv_r2 = cross_val_score(model, X, y, cv=5, scoring="r2")
print(f"CV R²: mean={cv_r2.mean():.3f}, std={cv_r2.std():.3f}")

CV R²: mean=-0.072, std=0.820


In [19]:
# Build a modeling DataFrame including the engineered brand
df_sm = df.copy()

# Example formula: add numeric features + categorical with C()
formula = """
price ~ wheelbase + carlength + carwidth + curbweight + enginesize + horsepower
       + citympg + highwaympg
       + C(fueltype) + C(aspiration) + C(doornumber) + C(carbody)
       + C(drivewheel) + C(enginelocation) + C(enginetype)
       + C(cylindernumber) + C(fuelsystem) + C(CarBrand)
"""

ols_model = smf.ols(formula=formula, data=df_sm).fit()
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.959
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     65.63
Date:                Sun, 19 Oct 2025   Prob (F-statistic):           1.43e-81
Time:                        11:34:11   Log-Likelihood:                -1804.1
No. Observations:                 205   AIC:                             3718.
Df Residuals:                     150   BIC:                             3901.
Df Model:                          54                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

In [20]:
# Example: build one new sample in the same raw schema as X (before preprocessing)
new_car = {
    "symboling": 3,
    "fueltype": "gas",
    "aspiration": "std",
    "doornumber": "two",
    "carbody": "hatchback",
    "drivewheel": "fwd",
    "enginelocation": "front",
    "wheelbase": 94.5,
    "carlength": 171.2,
    "carwidth": 65.5,
    "carheight": 52.8,
    "curbweight": 2337,
    "enginetype": "ohc",
    "cylindernumber": "four",
    "enginesize": 130,
    "fuelsystem": "mpfi",
    "boreratio": 3.47,
    "stroke": 2.68,
    "compressionratio": 9.0,
    "horsepower": 111,
    "peakrpm": 5000,
    "citympg": 21,
    "highwaympg": 27,
    "CarBrand": "alfa-romero"   # engineered earlier from CarName
}
new_X = pd.DataFrame([new_car])

# Use the fitted pipeline (preprocess + model)
pred_price = model.predict(new_X)
print(f"Predicted price: {pred_price[0]:.2f}")


Predicted price: 13438.15
