In [1]:
import pandas as pd

In [2]:
# Read the raw dataset into a DataFrame.
# NOTE(review): the absolute "/content/..." path presumably only exists in the
# original hosted session — consider a configurable data directory.
df=pd.read_csv("/content/data_2.csv")

In [3]:
# Display the column labels of the loaded frame.
df.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [4]:
# Narrow the frame to the four numeric predictors plus the target column.
df = df.loc[:, [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]]

In [6]:
# Display the dtype of each remaining column.
df.dtypes

Unnamed: 0,0
engine_displacement,int64
horsepower,float64
vehicle_weight,float64
model_year,int64
fuel_efficiency_mpg,float64


In [7]:
# Count the missing values in every column.
df.isna().sum()

Unnamed: 0,0
engine_displacement,0
horsepower,708
vehicle_weight,0
model_year,0
fuel_efficiency_mpg,0


In [9]:
# Median of the horsepower column, computed over the whole (unsplit) frame.
median_val = df["horsepower"].median()
# Bare expression so the notebook displays the value.
median_val

149.0

In [10]:
import numpy as np
# Seed NumPy's global generator so the shuffle below is reproducible.
np.random.seed(42)

n = len(df)

# 20% for validation, 20% for test; training takes whatever remains.
n_val = n_test = int(0.2 * n)
n_train = n - n_val - n_test

# Shuffle the row positions in place and reorder the frame accordingly.
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

# Cut the shuffled frame into three consecutive, independent copies.
df_train, df_val, df_test = (
    df_shuffled.iloc[rows].copy()
    for rows in (
        slice(None, n_train),
        slice(n_train, n_train + n_val),
        slice(n_train + n_val, None),
    )
)

In [19]:
def train_linear_regression(X, y):
    """Fit an ordinary least-squares model through the normal equations.

    A column of ones is prepended to ``X`` so that the first fitted weight
    plays the role of the intercept.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix with one row per observation.
    y : array-like
        Target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the intercept and ``w`` holds the
        remaining weights, one per column of ``X``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the Gram matrix ``X^T X`` is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTy = X.T.dot(y)

    # Solve (X^T X) w = X^T y directly instead of forming the explicit inverse:
    # this is numerically more stable and does less work.
    w = np.linalg.solve(XTX, XTy)

    return w[0], w[1:]

In [20]:
# Columns used as model inputs.
base = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
]


def prepare_X(df):
    """Return the ``base`` columns of ``df`` as a NumPy matrix, with missing values set to 0."""
    return df[base].fillna(0).values
# Name of the column being predicted, and its values for the two splits.
target = 'fuel_efficiency_mpg'
y_train, y_val = df_train[target], df_val[target]

In [22]:
# Build the training feature matrix with the prepare_X currently in scope.
X_train = prepare_X(df_train)
# Fit the linear model; w_0 is the intercept, w the per-feature weights.
w_0, w = train_linear_regression(X_train, y_train)

In [23]:
# Predictions for the rows of the training matrix.
y_pred = X_train.dot(w) + w_0

In [24]:

def rmse(y, y_pred):
    """Return the root-mean-squared error between targets ``y`` and predictions ``y_pred``."""
    squared_error = (y_pred - y) ** 2
    return np.sqrt(squared_error.mean())
# NOTE(review): this scores the model on the training split (y_train / X_train);
# y_val exists in this notebook but is not used here — confirm that a
# validation RMSE was not the intended metric.
rmse(y_train, y_pred)


np.float64(0.5202614265099076)

In [25]:
# Mean horsepower taken from the training split only.
mean_hp = df_train['horsepower'].mean()

# Columns used as model inputs.
base = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
]


def prepare_X(df):
    """Return the ``base`` columns of ``df`` as a NumPy matrix, with missing values set to ``mean_hp``."""
    return df[base].fillna(mean_hp).values


# Name of the column being predicted, and its values for the two splits.
target = 'fuel_efficiency_mpg'
y_train, y_val = df_train[target], df_val[target]

In [26]:
# Rebuild the training matrix with the prepare_X currently in scope.
X_train = prepare_X(df_train)
# Refit the linear model; w_0 is the intercept, w the per-feature weights.
w_0, w = train_linear_regression(X_train, y_train)

In [27]:
# Predictions for the rows of the training matrix.
y_pred = X_train.dot(w) + w_0

In [28]:
# NOTE(review): this is again an RMSE on the training split, not on validation —
# confirm that this is the intended comparison.
rmse(y_train, y_pred)

np.float64(0.4624412137959964)

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Shuffle all rows once with a fixed seed and discard the old index.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Hold out 40% of the rows, then halve that part into validation and test,
# giving a 60/20/20 split.
df_full_train, df_temp = train_test_split(df, test_size=0.4, random_state=42)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

# Give each split a fresh 0..n-1 index.
df_full_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_full_train, df_val, df_test)
)

# Model inputs and the column being predicted.
features = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target = 'fuel_efficiency_mpg'

# --- Strategy A: missing 'horsepower' becomes 0 ---
df_train_0 = df_full_train.assign(horsepower=df_full_train['horsepower'].fillna(0))
df_val_0 = df_val.assign(horsepower=df_val['horsepower'].fillna(0))

model_0 = LinearRegression().fit(df_train_0[features], df_train_0[target])

# Validation RMSE, rounded to two decimals.
y_pred_0 = model_0.predict(df_val_0[features])
rmse_0 = round(np.sqrt(mean_squared_error(df_val_0[target], y_pred_0)), 2)

# --- Strategy B: missing 'horsepower' becomes the training-split mean ---
mean_hp = df_full_train['horsepower'].mean()
df_train_mean = df_full_train.assign(horsepower=df_full_train['horsepower'].fillna(mean_hp))
df_val_mean = df_val.assign(horsepower=df_val['horsepower'].fillna(mean_hp))

model_mean = LinearRegression().fit(df_train_mean[features], df_train_mean[target])

# Validation RMSE, rounded to two decimals.
y_pred_mean = model_mean.predict(df_val_mean[features])
rmse_mean = round(np.sqrt(mean_squared_error(df_val_mean[target], y_pred_mean)), 2)

# --- Report both scores; the verdict compares the rounded values ---
print(f"RMSE (fill with 0): {rmse_0}")
print(f"RMSE (fill with mean): {rmse_mean}")

if rmse_0 < rmse_mean:
    print("Better Option: Fill with 0")
elif rmse_0 > rmse_mean:
    print("Better Option: Fill with mean")
else:
    print("Both are equally good")

RMSE (fill with 0): 0.51
RMSE (fill with mean): 0.46
Better Option: Fill with mean


In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Source CSV and the columns kept from it.
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']

# Download, keep the selected columns, then shuffle all rows with a fixed seed.
df = pd.read_csv(url)[cols]
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Hold out 40% of the rows and halve that part into validation and test.
df_train, df_temp = train_test_split(df, test_size=0.4, random_state=42)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

# Only the train and validation splits get a fresh index here.
df_train, df_val = (part.reset_index(drop=True) for part in (df_train, df_val))

# Model inputs and the column being predicted.
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target = 'fuel_efficiency_mpg'

# Feature-matrix builder
def prepare_X(df):
    """Return the ``base`` columns of ``df`` as a NumPy matrix, with missing values set to 0."""
    return df[base].fillna(0).values

# Custom regularized training function
def train_regularized_linear_regression(X, y, r):
    """Fit a linear model through the normal equations with a ridge term.

    A column of ones is prepended to ``X`` so that the first fitted weight
    plays the role of the intercept. ``r`` is added to every diagonal entry
    of ``X^T X``, including the entry that belongs to the intercept column.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix with one row per observation.
    y : array-like
        Target values, one per row of ``X``.
    r : float
        Amount added to the diagonal of ``X^T X``.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the intercept and ``w`` holds the
        remaining weights, one per column of ``X``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularized Gram matrix is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])
    XTy = X.T.dot(y)

    # Solve the linear system directly instead of forming the explicit
    # inverse: numerically more stable and less work.
    w = np.linalg.solve(XTX, XTy)

    return w[0], w[1:]

# Root-mean-squared error built on the imported mean_squared_error
def rmse(y, y_pred):
    """Return the square root of ``mean_squared_error(y, y_pred)``."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

# Feature matrices and target arrays for the train and validation splits.
X_train, X_val = prepare_X(df_train), prepare_X(df_val)
y_train, y_val = df_train[target].values, df_val[target].values

# Regularization strengths to compare, and the rounded validation score of each.
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
scores = {}

for r in r_values:
    w0, w = train_regularized_linear_regression(X_train, y_train, r)
    y_pred = X_val.dot(w) + w0
    score = rmse(y_val, y_pred)
    scores[r] = round(score, 2)
    print(f"r = {r} => RMSE = {round(score, 2)}")

# Pick the lowest rounded RMSE; ties go to the smallest r.
best_r = min(scores.items(), key=lambda item: (item[1], item[0]))[0]
print(f"\nBest r: {best_r}")


r = 0 => RMSE = 0.51
r = 0.01 => RMSE = 0.51
r = 0.1 => RMSE = 0.51
r = 1 => RMSE = 0.52
r = 5 => RMSE = 0.52
r = 10 => RMSE = 0.52
r = 100 => RMSE = 0.52

Best r: 0


In [31]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit a linear model through the normal equations with an optional ridge term.

    A column of ones is prepended to ``X`` so that the first fitted weight
    plays the role of the intercept. ``r`` is added to every diagonal entry
    of ``X^T X``, including the entry that belongs to the intercept column.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix with one row per observation.
    y : array-like
        Target values, one per row of ``X``.
    r : float, default 0.0
        Amount added to the diagonal of ``X^T X``; 0 gives plain least squares.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the intercept and ``w`` holds the
        remaining weights, one per column of ``X``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the (regularized) Gram matrix is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])
    XTy = X.T.dot(y)

    # Solve the linear system directly instead of forming the explicit
    # inverse: numerically more stable and less work.
    w = np.linalg.solve(XTX, XTy)

    return w[0], w[1:]

# Hand-rolled root-mean-squared error
def rmse(y_true, y_pred):
    """Return the root-mean-squared difference between ``y_true`` and ``y_pred``."""
    residuals = y_true - y_pred
    mean_squared = np.mean(residuals ** 2)
    return np.sqrt(mean_squared)

# Seeded shuffle followed by a 60/20/20 cut
def manual_split(df, seed):
    """Shuffle ``df`` with the given seed and cut it into train/val/test frames.

    The global NumPy random state is re-seeded with ``seed``. Training gets
    ``int(0.6 * n)`` rows, validation ``int(0.2 * n)`` rows, and the test
    frame receives all remaining rows. Each returned frame has a fresh
    0-based index.
    """
    np.random.seed(seed)
    n = len(df)
    order = np.random.permutation(n)

    train_end = int(0.6 * n)
    val_end = train_end + int(0.2 * n)
    parts = (order[:train_end], order[train_end:val_end], order[val_end:])

    df_train, df_val, df_test = (
        df.iloc[rows].reset_index(drop=True) for rows in parts
    )
    return df_train, df_val, df_test

# Feature-matrix builder: select the base columns, zero-fill NAs, return a NumPy array
def prepare_X(df):
    """Return the ``base`` columns of ``df`` as a NumPy matrix, with missing values set to 0."""
    filled = df[base].fillna(0)
    return filled.values

# Repeat the split / fit / score cycle once per seed 0..9.
seeds = list(range(10))
rmse_scores = []

for seed in seeds:
    # Fresh split for this seed; the test part is not needed here.
    df_train, df_val, _ = manual_split(df, seed)

    X_train, y_train = prepare_X(df_train), df_train[target].values
    X_val, y_val = prepare_X(df_val), df_val[target].values

    # Fit without regularization (r=0).
    w0, w = train_linear_regression_reg(X_train, y_train, r=0)

    # Score the fit on the validation split.
    y_pred = X_val.dot(w) + w0
    score = rmse(y_val, y_pred)
    rmse_scores.append(score)

# Spread of the validation RMSE across seeds.
std = np.std(rmse_scores)
print("RMSE scores:", [round(s, 3) for s in rmse_scores])
print("Standard deviation:", round(std, 3))

RMSE scores: [np.float64(0.516), np.float64(0.509), np.float64(0.516), np.float64(0.527), np.float64(0.533), np.float64(0.518), np.float64(0.513), np.float64(0.53), np.float64(0.507), np.float64(0.521)]
Standard deviation: 0.008


In [32]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit a linear model through the normal equations with an optional ridge term.

    A column of ones is prepended to ``X`` so that the first fitted weight
    plays the role of the intercept. ``r`` is added to every diagonal entry
    of ``X^T X``, including the entry that belongs to the intercept column.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix with one row per observation.
    y : array-like
        Target values, one per row of ``X``.
    r : float, default 0.0
        Amount added to the diagonal of ``X^T X``; 0 gives plain least squares.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the intercept and ``w`` holds the
        remaining weights, one per column of ``X``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the (regularized) Gram matrix is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])
    XTy = X.T.dot(y)

    # Solve the linear system directly instead of forming the explicit
    # inverse: numerically more stable and less work.
    w = np.linalg.solve(XTX, XTy)

    return w[0], w[1:]

def rmse(y_true, y_pred):
    """Return the root-mean-squared difference between ``y_true`` and ``y_pred``."""
    residuals = y_true - y_pred
    mean_squared = np.mean(residuals ** 2)
    return np.sqrt(mean_squared)

def manual_split(df, seed):
    """Shuffle ``df`` with the given seed and cut it into train/val/test frames.

    The global NumPy random state is re-seeded with ``seed``. Training gets
    ``int(0.6 * n)`` rows, validation ``int(0.2 * n)`` rows, and the test
    frame receives all remaining rows. Each returned frame has a fresh
    0-based index.
    """
    np.random.seed(seed)
    n = len(df)
    order = np.random.permutation(n)

    train_end = int(0.6 * n)
    val_end = train_end + int(0.2 * n)
    parts = (order[:train_end], order[train_end:val_end], order[val_end:])

    df_train, df_val, df_test = (
        df.iloc[rows].reset_index(drop=True) for rows in parts
    )
    return df_train, df_val, df_test

def prepare_X(df):
    """Return the ``base`` columns of ``df`` as a NumPy matrix, with missing values set to 0."""
    filled = df[base].fillna(0)
    return filled.values

# Split once more, this time with the fixed seed 9.
seed = 9
df_train, df_val, df_test = manual_split(df, seed)

# Fold the validation rows into the training rows for the final fit.
df_train_val = pd.concat([df_train, df_val], ignore_index=True)

# Feature matrices and target arrays for the combined and test frames.
X_train_val, y_train_val = prepare_X(df_train_val), df_train_val[target].values
X_test, y_test = prepare_X(df_test), df_test[target].values

# Fit with r=0.001 and predict the test split.
w0, w = train_linear_regression_reg(X_train_val, y_train_val, r=0.001)
y_pred = X_test.dot(w) + w0

# Report the test RMSE.
score = rmse(y_test, y_pred)
print("Test RMSE:", round(score, 3))

Test RMSE: 0.529
