## Car Fuel Efficiency

In [17]:
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Question 1. Missing values (1 point)


'engine_displacement'

'horsepower'

'vehicle_weight'

'model_year'

In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("car_fuel_efficiency.csv")

In [4]:
# Display the loaded frame as the cell output to eyeball the columns and a few rows.
df

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


In [5]:
# Summarise dtypes and non-null counts per column; a non-null count below the
# number of entries marks a column that contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   num_cylinders        9222 non-null   float64
 2   horsepower           8996 non-null   float64
 3   vehicle_weight       9704 non-null   float64
 4   acceleration         8774 non-null   float64
 5   model_year           9704 non-null   int64  
 6   origin               9704 non-null   object 
 7   fuel_type            9704 non-null   object 
 8   drivetrain           9704 non-null   object 
 9   num_doors            9202 non-null   float64
 10  fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 834.1+ KB


### Question 2. Median for horsepower (1 point)


49

99

149

199

In [6]:
# Median of the horsepower column (Question 2); pandas skips missing entries by default.
df["horsepower"].median()

149.0

### Building a model after filling missing values with 0 or with the mean

In [7]:
# Report how many NaN entries each column holds (header line, then the per-column counts).
print("\nMissing values per column:", df.isna().sum(), sep="\n")


Missing values per column:
engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64


In [None]:
# Column the regression model is trained to predict.
target = "fuel_efficiency_mpg"

# Numeric columns used as model inputs. According to the missing-value counts,
# num_cylinders, horsepower, acceleration and num_doors all contain NaNs, so
# they have to be imputed before a model can be fitted.
features = [
    'engine_displacement',
    'num_cylinders',
    'horsepower',
    'vehicle_weight',
    'acceleration',
    'model_year',
    'num_doors',
]

# Rows without a target value cannot be used for training or evaluation:
# discard them and renumber the index from 0.
rows_with_target = df.dropna(subset=[target])
df = rows_with_target.reset_index(drop=True)

In [11]:
# Split Data (60/20/20)


def split_data(data, seed=42):
    """Partition ``data`` into train, validation and test subsets (60/20/20).

    Both splits use ``seed`` as their random state. Returns a
    ``(train, val, test)`` tuple whose members each have a fresh 0-based index.
    """
    # First carve off 20% of all rows as the held-out test set.
    remainder, test_part = train_test_split(data, test_size=0.2, random_state=seed)
    # A quarter of the remaining 80% equals another 20% of the original rows.
    train_part, val_part = train_test_split(remainder, test_size=0.25, random_state=seed)
    return tuple(
        part.reset_index(drop=True)
        for part in (train_part, val_part, test_part)
    )

# Partition the frame with a fixed seed, then report the row count of each subset.
train, val, test = split_data(df, 42)
print("\nSplit sizes:")
print("Train:", train.shape[0], "Validation:", val.shape[0], "Test:", test.shape[0])


Split sizes:
Train: 5822 Validation: 1941 Test: 1941


In [12]:
# Prepare X, y Data

def prepare_xy(data, fill_method, train_means=None):
    """Build the feature matrix X and target vector y, imputing missing features.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns named in the notebook-level ``features``
        list and the notebook-level ``target`` column.
    fill_method : {'zero', 'mean'}
        'zero' replaces missing feature values with 0; 'mean' replaces them
        with the per-column values given in ``train_means``.
    train_means : optional
        Fill values handed to ``DataFrame.fillna`` (in this notebook, the
        column means of the training split). Required when ``fill_method``
        is 'mean'; ignored for 'zero'.

    Returns
    -------
    tuple
        ``(X, y)`` where X is an imputed copy of ``data[features]`` and y is
        ``data[target].values``.

    Raises
    ------
    ValueError
        If ``fill_method`` is neither 'zero' nor 'mean', or if 'mean' is
        requested without ``train_means``.
    """
    # Validate the arguments up front so a bad call fails with a clear message
    # instead of an opaque pandas error from fillna(None).
    if fill_method not in ('zero', 'mean'):
        raise ValueError("Invalid fill method: use 'zero' or 'mean'")
    if fill_method == 'mean' and train_means is None:
        raise ValueError("fill_method='mean' requires train_means")

    # Work on a copy so the caller's frame is never modified.
    X = data[features].copy()
    y = data[target].values

    fill_values = 0 if fill_method == 'zero' else train_means
    return X.fillna(fill_values), y

In [13]:
# Per-column means of the training features only; the same values are later used
# to fill the validation split, so no validation data enters the imputation.
train_means = train.loc[:, features].mean(axis=0)

In [18]:
# Train and Evaluate Models
from sklearn.linear_model import LinearRegression

def train_and_evaluate(train, val, fill_method, train_means=None):
    """Fit a linear regression on ``train`` and score it on ``val``.

    Both frames are imputed through ``prepare_xy`` using the same
    ``fill_method`` and ``train_means``. Returns the validation RMSE rounded
    to two decimal places.
    """
    train_X, train_y = prepare_xy(train, fill_method, train_means)
    val_X, val_y = prepare_xy(val, fill_method, train_means)

    regressor = LinearRegression().fit(train_X, train_y)

    # RMSE is the square root of the mean squared error on the validation set.
    val_mse = mean_squared_error(val_y, regressor.predict(val_X))
    return round(sqrt(val_mse), 2)

# Validation RMSE when missing values are replaced by 0
rmse_zero = train_and_evaluate(train, val, 'zero')
# Validation RMSE when missing values are replaced by the training-set means
rmse_mean = train_and_evaluate(train, val, 'mean', train_means)

# Emit the header and both scores, one per line.
print(
    "\nRMSE (validation set):",
    f"Fill with 0   → RMSE = {rmse_zero}",
    f"Fill with mean → RMSE = {rmse_mean}",
    sep="\n",
)


RMSE (validation set):
Fill with 0   → RMSE = 0.5
Fill with mean → RMSE = 0.39


In [19]:

# Choose Better Option

# Compare the two imputation strategies on validation RMSE (lower is better).
# The scores are rounded to two decimals by train_and_evaluate, so an exact tie
# is possible; report it as a tie rather than crediting the zero-fill option.
if rmse_mean < rmse_zero:
    print("\n✅ Filling with mean gives better RMSE.")
elif rmse_zero < rmse_mean:
    print("\n✅ Filling with 0 gives better RMSE.")
else:
    print("\n✅ Both options give equally good RMSE.")


✅ Filling with mean gives better RMSE.
