In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# ----------------------------
# 1. Create synthetic dataset
# ----------------------------
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')
np.random.seed(42)
n_samples = 100

# Single feature X
X = np.random.rand(n_samples, 1) * 10  

# Target y = 3*X + 7 + noise
y = 3 * X.squeeze() + 7 + np.random.randn(n_samples) * 2  

# Optional: introduce missing values
X[5] = np.nan
X[20] = np.nan

# --------------------------------
# 2. Handle missing values (mean)
# --------------------------------
mean_val = np.nanmean(X)
X = np.where(np.isnan(X), mean_val, X)

# ------------------------------
# 3. Split into train and test
# ------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ------------------------------
# 4. Train Linear Regression
# ------------------------------
model = LinearRegression()
model.fit(X_train, y_train)

# ------------------------------
# 5. Predict and evaluate
# ------------------------------
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Model Coefficient:", model.coef_)
print("Model Intercept:", model.intercept_)
print("RMSE:", rmse)


Model Coefficient: [2.92055273]
Model Intercept: 7.217708106835687
RMSE: 1.6362220902623177
