In [21]:
import numpy as np
import pandas as pd

In [22]:
# Load the raw dataset from the working directory and preview its first rows.
df = pd.read_csv('car_fuel_efficiency.csv')
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [23]:
# Column dtypes and non-null counts, to spot which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   num_cylinders        9222 non-null   float64
 2   horsepower           8996 non-null   float64
 3   vehicle_weight       9704 non-null   float64
 4   acceleration         8774 non-null   float64
 5   model_year           9704 non-null   int64  
 6   origin               9704 non-null   object 
 7   fuel_type            9704 non-null   object 
 8   drivetrain           9704 non-null   object 
 9   num_doors            9202 non-null   float64
 10  fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 834.1+ KB


In [24]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg
count,9704.0,9222.0,8996.0,9704.0,8774.0,9704.0,9202.0,9704.0
mean,199.708368,3.962481,149.657292,3001.280993,15.021928,2011.484027,-0.006412,14.985243
std,49.455319,1.999323,29.879555,497.89486,2.510339,6.659808,1.048162,2.556468
min,10.0,0.0,37.0,952.681761,6.0,2000.0,-4.0,6.200971
25%,170.0,3.0,130.0,2666.248985,13.3,2006.0,-1.0,13.267459
50%,200.0,4.0,149.0,2993.226296,15.0,2012.0,0.0,15.006037
75%,230.0,5.0,170.0,3334.957039,16.7,2017.0,1.0,16.707965
max,380.0,13.0,271.0,4739.077089,24.3,2023.0,4.0,25.967222


In [25]:
# Number of missing values in each column.
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [26]:
# Median horsepower (missing values are skipped by pandas).
df.horsepower.median()

np.float64(149.0)

In [27]:
# Reproducible shuffle-and-split: 20% validation, 20% test, the remainder for training.
np.random.seed(42)

n = len(df)
n_val = n_test = int(0.2 * n)
n_train = n - n_val - n_test

# Shuffle the row positions once, then slice the shuffled frame into three parts.
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

df_train, df_val, df_test = (
    df_shuffled.iloc[start:stop].copy()
    for start, stop in ((0, n_train), (n_train, n_train + n_val), (n_train + n_val, n))
)

# The model is fitted on log(1 + mpg) rather than on the raw target.
y_train = np.log1p(df_train['fuel_efficiency_mpg'].values)
y_val = np.log1p(df_val['fuel_efficiency_mpg'].values)


In [28]:
def train_linear_regression(X, y):
    """Fit ordinary least squares with an intercept via the normal equations.

    Parameters
    ----------
    X : 2-D array of shape (n_samples, n_features) holding the features.
    y : array with n_samples target values.

    Returns
    -------
    (w0, w) : the intercept and the array of feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If X^T X (with the intercept column included) is singular.
    """
    # Prepend a column of ones so the first weight acts as the intercept.
    X = np.column_stack([np.ones(X.shape[0]), X])

    XTX = X.T.dot(X)
    # Solve (X^T X) w = X^T y directly rather than forming the explicit
    # inverse and multiplying by it.
    w = np.linalg.solve(XTX, X.T.dot(y))

    return w[0], w[1:]

In [29]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between targets `y` and predictions `y_pred`."""
    residual = y_pred - y
    return np.sqrt((residual ** 2).mean())

In [30]:
def prepare_X_fill_0(df, columns):
    """Return the selected columns as a NumPy matrix with missing values set to 0.

    The input frame is not modified: column selection and `fillna` both
    produce new objects.
    """
    return df[columns].fillna(0).to_numpy()


In [31]:
def prepare_X_fill_mean(df, columns, mean_dict):
    """Return the selected columns as a NumPy matrix, imputing per-column values.

    Missing entries in each column of `columns` are replaced by
    `mean_dict[column]`. A column without an entry in `mean_dict` raises
    KeyError. The input frame is left untouched.
    """
    # Look every column up explicitly so a missing key fails loudly.
    fill_values = {col: mean_dict[col] for col in columns}
    return df[columns].fillna(value=fill_values).to_numpy()


In [32]:
# Per-column count of missing values, then the names of the affected columns.
missing_cols = df.isna().sum()
cols_with_na = missing_cols.index[missing_cols > 0].tolist()

# Keep only the numeric ones, since the linear model needs numeric inputs.
numeric_cols_with_na = list(df[cols_with_na].select_dtypes(include='number').columns)


In [33]:

# Both experiments use the numeric columns that contain missing values.
features = numeric_cols_with_na

# Option 1: impute missing values with 0.
X_train_0, X_val_0 = (prepare_X_fill_0(part, features) for part in (df_train, df_val))

w0_0, w_0 = train_linear_regression(X_train_0, y_train)
y_pred_val_0 = w0_0 + X_val_0 @ w_0
rmse_0 = rmse(y_val, y_pred_val_0)

# Option 2: impute with column means computed on the training split only,
# and reuse those same means for the validation split.
mean_dict = df_train[features].mean().to_dict()
X_train_mean, X_val_mean = (
    prepare_X_fill_mean(part, features, mean_dict) for part in (df_train, df_val)
)

w0_mean, w_mean = train_linear_regression(X_train_mean, y_train)
y_pred_val_mean = w0_mean + X_val_mean @ w_mean
rmse_mean = rmse(y_val, y_pred_val_mean)

# Report both validation scores and name the better strategy.
print("RMSE with fill=0:", round(rmse_0, 4))
print("RMSE with fill=mean:", round(rmse_mean, 4))
print(
    " Filling with MEAN gives better RMSE."
    if rmse_mean < rmse_0
    else " Filling with 0 gives better RMSE."
)


RMSE with fill=0: 0.1711
RMSE with fill=mean: 0.1696
 Filling with MEAN gives better RMSE.


In [34]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit ridge-regularised linear regression via the normal equations.

    Parameters
    ----------
    X : 2-D array of shape (n_samples, n_features) holding the features.
    y : array with n_samples target values.
    r : regularisation strength added to every diagonal entry of X^T X.
        Because the column of ones is part of the design matrix, the
        intercept is penalised as well. With r=0 this is plain least squares.

    Returns
    -------
    (w0, w) : the intercept and the array of feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularised matrix X^T X + r*I is singular.
    """
    # Prepend a column of ones so the first weight acts as the intercept.
    X = np.column_stack([np.ones(X.shape[0]), X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    # Solve (X^T X + r I) w = X^T y directly rather than forming the
    # explicit inverse and multiplying by it.
    w = np.linalg.solve(XTX, X.T.dot(y))

    return w[0], w[1:]

# Feature matrices for the regularisation sweep (missing values imputed with 0).
features = numeric_cols_with_na
X_train, X_val = (prepare_X_fill_0(part, features) for part in (df_train, df_val))

# Candidate regularisation strengths, tried in this order.
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = {}  # r -> validation RMSE rounded to 2 decimals
best_r, best_rmse = None, float('inf')

for r in r_values:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w0 + X_val @ w
    score = rmse(y_val, y_pred)
    rounded_score = round(score, 2)
    rmse_scores[r] = rounded_score

    print(f"r={r:<5} -> RMSE: {round(score, 10)}")

    # Strict comparison on the unrounded score, so ties keep the earlier r.
    if score < best_rmse:
        best_r, best_rmse = r, score

# Final output
print(f"\n Best r is {best_r} with RMSE = {round(best_rmse, 2)}")


r=0     -> RMSE: 0.1711184122
r=0.01  -> RMSE: 0.171118112
r=0.1   -> RMSE: 0.1711155318
r=1     -> RMSE: 0.1711016637
r=5     -> RMSE: 0.1712939046
r=10    -> RMSE: 0.1720776471
r=100   -> RMSE: 0.2327883475

 Best r is 1 with RMSE = 0.17


In [35]:


def prepare_X(df, features):
    """Select `features` from `df`, replace missing values with 0, return a NumPy matrix.

    The input frame is not modified.
    """
    selected = df[features]
    return selected.fillna(0).to_numpy()

# Use every numeric column except the target as a feature.
features = (
    df.select_dtypes(include='number')
    .drop(columns=['fuel_efficiency_mpg'])
    .columns
    .tolist()
)

# Split sizes depend only on the dataset length, so compute them once.
n = len(df)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - n_val - n_test

rmse_scores = []

for seed in range(10):
    # Re-shuffle the rows with this seed and cut the same three slices.
    np.random.seed(seed)
    idx = np.arange(len(df))
    np.random.shuffle(idx)
    df_shuffled = df.iloc[idx].reset_index(drop=True)

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
    df_test = df_shuffled.iloc[n_train + n_val:].copy()

    # Targets are the raw mpg values here (no log transform).
    y_train = df_train['fuel_efficiency_mpg'].values
    y_val = df_val['fuel_efficiency_mpg'].values

    df_train = df_train.drop(columns=['fuel_efficiency_mpg'])
    df_val = df_val.drop(columns=['fuel_efficiency_mpg'])

    X_train = prepare_X(df_train, features)
    X_val = prepare_X(df_val, features)

    w0, w = train_linear_regression(X_train, y_train)
    y_pred = w0 + X_val @ w
    score = rmse(y_val, y_pred)
    rmse_scores.append(score)

    print(f"Seed {seed}: RMSE = {round(score, 4)}")

# Spread of the validation RMSE across seeds.
std = np.std(rmse_scores)
print(f"\n Standard deviation of RMSEs: {round(std, 3)}")


Seed 0: RMSE = 0.5089
Seed 1: RMSE = 0.5096
Seed 2: RMSE = 0.5113
Seed 3: RMSE = 0.5064
Seed 4: RMSE = 0.498
Seed 5: RMSE = 0.5154
Seed 6: RMSE = 0.5192
Seed 7: RMSE = 0.4958
Seed 8: RMSE = 0.5015
Seed 9: RMSE = 0.4997

 Standard deviation of RMSEs: 0.007


In [None]:


target_col = 'fuel_efficiency_mpg'

# Build the modelling frame without mutating or rebinding `df`, so earlier
# cells still see the raw data when re-run: coerce the target to numeric
# (unparseable values become NaN) and drop rows whose target is missing.
df_model = df.assign(
    **{target_col: pd.to_numeric(df[target_col], errors='coerce')}
).dropna(subset=[target_col])

# One-hot encode the categorical columns (dropping the first level of each)
# and replace the remaining missing feature values with 0.
df_encoded = pd.get_dummies(df_model, drop_first=True).fillna(0)

def split_data(df, seed=9):
    """Shuffle `df` and split it into train / validation / test frames.

    Validation and test each get int(0.2 * n) rows and training gets the
    remainder, which is the same size rule as the other splits in this
    notebook. The earlier `int(0.6 * n)` rule produced different sizes
    whenever 0.6 * n was not a whole number.

    Parameters
    ----------
    df : DataFrame to split; rows are selected by position.
    seed : seed passed to `np.random.seed`. Note that this reseeds NumPy's
        global random state.

    Returns
    -------
    (train, val, test) : three DataFrames that together contain every row
        of `df` exactly once.
    """
    np.random.seed(seed)
    n = len(df)
    indices = np.random.permutation(n)

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - n_val - n_test

    train_idx = indices[:n_train]
    val_idx = indices[n_train:n_train + n_val]
    test_idx = indices[n_train + n_val:]
    return df.iloc[train_idx], df.iloc[val_idx], df.iloc[test_idx]

# Seed-9 split of the encoded frame; train and validation are merged for the final fit.
train_df, val_df, test_df = split_data(df_encoded, seed=9)
combined_df = pd.concat([train_df, val_df])

# Float64 feature matrices and target vectors for the combined and test parts.
X_train_val = combined_df.drop(columns=[target_col]).to_numpy(dtype=np.float64)
y_train_val = combined_df[target_col].to_numpy(dtype=np.float64)

X_test = test_df.drop(columns=[target_col]).to_numpy(dtype=np.float64)
y_test = test_df[target_col].to_numpy(dtype=np.float64)

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    residuals = y_true - y_pred
    mean_squared = np.mean(residuals ** 2)
    return np.sqrt(mean_squared)

def train_linear_regression_reg(X, y, r=0.0):
    """Fit ridge regression on mean-centred data and recover the intercept.

    Features and target are centred first, so the penalty `r` applies to the
    feature weights only. The intercept is then computed from the means.

    Parameters
    ----------
    X : 2-D array of shape (n_samples, n_features).
    y : array with n_samples target values.
    r : regularisation strength added to the diagonal of the centred Gram matrix.

    Returns
    -------
    (w0, w) : the intercept and the array of feature weights.
    """
    # Unpacking also enforces that X is two-dimensional.
    _, n_features = X.shape

    # Column means of the features and the mean of the target.
    feature_means = X.mean(axis=0)
    target_mean = y.mean()

    Xc = X - feature_means
    yc = y - target_mean

    # Regularised normal equations on the centred data.
    gram = Xc.T @ Xc + r * np.eye(n_features)
    rhs = Xc.T @ yc
    w = np.linalg.solve(gram, rhs)

    # Intercept that makes the fit pass through the point of means.
    w0 = target_mean - feature_means @ w
    return w0, w

# Fit on train+validation with a small regularisation strength, then score on the test split.
r = 0.001
w0, w = train_linear_regression_reg(X_train_val, y_train_val, r=r)

y_pred = w0 + X_test @ w
test_rmse = rmse(y_test, y_pred)

print(f"Test RMSE with r={r}: {round(test_rmse, 3)}")


Test RMSE with r=0.001: 0.502
