In [None]:
import pandas as pd

# 1) Load the CSV
df = pd.read_csv("data/auto-mpg.csv")

# 2) Report, for every column, whether pandas parsed it as numeric or whether
#    it was loaded as text even though most of its values look like numbers.
for col in df.columns:
    column = df[col]

    # Genuinely numeric dtypes (ints, floats, bools) need no further checks.
    if pd.api.types.is_numeric_dtype(column):
        print(f"Column '{col}' appears to be numeric.")
        continue

    # Only text-like columns are candidates for "numbers stored as text".
    # Checking the dtype family (instead of `dtype == "object"`) also covers
    # pandas' dedicated string dtypes, which would otherwise be misreported
    # as numeric by the branch above's counterpart.
    is_text = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
    )
    if not is_text:
        continue

    # Try converting to numbers (anything that can't convert becomes NaN)
    as_num = pd.to_numeric(column, errors="coerce")

    # Count how many non-empty values exist, and how many become numbers
    non_missing = column.notna().sum()
    numeric_count = as_num.notna().sum()

    # If most (>= 90%) of the non-missing values convert to numbers, flag it
    if non_missing > 0 and numeric_count / non_missing >= 0.9:
        print(f"Column '{col}' might be numeric but loaded as text.")
        print("  examples:", column.dropna().head(5).tolist())
        print()


In [None]:
# Show the loaded DataFrame as this cell's output for a quick visual check.
df

In [None]:
# This cell previously relied on `train_test_split` and `seed` that are only
# imported/defined in later cells, so it failed with NameError on a fresh
# kernel (Restart & Run All). Bring both into scope here.
from sklearn.model_selection import train_test_split

seed = 42  # same seed value the later modelling cells use

# Split into training (70%) and test (30%) sets
train_df, test_df = train_test_split(
    df,
    test_size=0.30,
    random_state=seed
)

# Print dimensions of each set
print("Training set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

In [None]:
# ---- TRAINING SET ----
# Report per-column missing counts, impute numeric gaps with the column
# median, then report the counts again to confirm the effect.
missing_before = train_df.isna().sum()
print("Training set missing values (before):")
print(missing_before)

# Medians are computed over numeric columns only; non-numeric columns are
# left untouched by the fill.
numeric_medians = train_df.median(numeric_only=True)
train_df = train_df.fillna(numeric_medians)

missing_after = train_df.isna().sum()
print("\nTraining set missing values (after):")
print(missing_after)

In [None]:
# ---- TEST SET ----
print("\nTest set missing values (before):")
print(test_df.isna().sum())

# Fill missing values with the TRAINING-set column medians.
# Preprocessing statistics must be learned from the training data only and
# then applied to the test data (the same rule the scaler cell follows);
# using the test set's own medians would leak test information into the
# pipeline. Median-imputing train_df does not change its medians, so this is
# valid whether or not train_df has already been filled.
test_df = test_df.fillna(train_df.median(numeric_only=True))

print("\nTest set missing values (after):")
print(test_df.isna().sum())


In [None]:
# Show the training DataFrame as this cell's output to inspect the imputed values.
train_df

In [None]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# -------------------------------
# Normalize continuous features
# -------------------------------

# Columns to standardize (example selection)
continuous_cols = ["horsepower", "weight"]

# Learn the scaling parameters from the training data only ...
scaler = StandardScaler()
scaler.fit(train_df[continuous_cols])

# ... then apply that same fitted transform to both splits.
for split_frame in (train_df, test_df):
    split_frame[continuous_cols] = scaler.transform(split_frame[continuous_cols])

train_df

In [None]:
# --------------------------------
# Add polynomial features (squared)
# --------------------------------

# Degree-2 expansion without the constant (bias) column.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Only horsepower is expanded, to keep the example simple.
# Fit the expansion on the training split ...
poly.fit(train_df[["horsepower"]])
poly_feature_names = poly.get_feature_names_out(["horsepower"])

# ... and apply the same fitted transform to each split, wrapping the result
# in a DataFrame that keeps the split's original row index so the new column
# aligns row-for-row when assigned back.
horsepower_train_poly = pd.DataFrame(
    poly.transform(train_df[["horsepower"]]),
    columns=poly_feature_names,
    index=train_df.index,
)
horsepower_test_poly = pd.DataFrame(
    poly.transform(test_df[["horsepower"]]),
    columns=poly_feature_names,
    index=test_df.index,
)

# Attach the squared term to the original frames
train_df["horsepower_squared"] = horsepower_train_poly["horsepower^2"]
test_df["horsepower_squared"] = horsepower_test_poly["horsepower^2"]

# Show the resulting column lists
print("Training columns:", train_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())

# Let's start over and build a full model using a K-fold approach

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

seed = 42

# Load the data and drop every row that has a missing value (simple cleanup)
df = pd.read_csv("data/auto-mpg.csv").dropna()

# Target is mpg; every other column is used as a feature
y = df["mpg"]
X = df.drop(columns=["mpg"])

# 5-fold cross-validation with shuffled, seeded fold assignment
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# Per-fold validation metrics
r2_scores = []
rmse_scores = []

for train_index, val_index in kf.split(X):
    # Positional row selection for this fold
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    # Fit a plain linear regression on the fold's training rows
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score it on the held-out rows
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)

    rmse_scores.append(rmse)
    r2_scores.append(r2)

# Average metrics across the five folds
print("Average R-squared:", np.mean(r2_scores))
print("Average RMSE:", np.mean(rmse_scores))


# And again, let's start fresh, adding some automated model selection

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

seed = 42

# -----------------------
# Load and prepare data
# -----------------------
df = pd.read_csv("data/auto-mpg.csv")
df = df.dropna()  # simple cleanup: discard rows with any missing value

# Target is mpg; every other column is a feature
X = df.drop(columns=["mpg"])
y = df["mpg"]

# Split into training (70%) and test (30%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=seed
)

# -----------------------
# Pick best polynomial degree using 5-fold CV on TRAINING only
# -----------------------
# Use the shared `seed` (rather than a second hard-coded 42) so that changing
# the seed in one place changes both the split and the fold assignment.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

def cv_scores_for_degree(degree):
    """Cross-validate a linear regression on polynomial features of X_train.

    Expands the notebook-level ``X_train`` with ``PolynomialFeatures`` of the
    given degree (no bias column), then, for every fold produced by the
    notebook-level ``kf``, fits a ``LinearRegression`` on the fold's training
    rows and scores it on the held-out rows against ``y_train``.

    Parameters
    ----------
    degree : int
        Polynomial degree passed to ``PolynomialFeatures``.

    Returns
    -------
    tuple
        ``(mean R-squared, mean RMSE)`` averaged over the folds.
    """
    expander = PolynomialFeatures(degree=degree, include_bias=False)
    # Plain array so fold indices can select rows positionally.
    expanded = np.asarray(expander.fit_transform(X_train))

    fold_r2 = []
    fold_rmse = []

    for fit_rows, holdout_rows in kf.split(expanded):
        fold_model = LinearRegression()
        fold_model.fit(expanded[fit_rows], y_train.iloc[fit_rows])

        holdout_truth = y_train.iloc[holdout_rows]
        holdout_preds = fold_model.predict(expanded[holdout_rows])

        fold_r2.append(r2_score(holdout_truth, holdout_preds))
        fold_rmse.append(np.sqrt(mean_squared_error(holdout_truth, holdout_preds)))

    return np.mean(fold_r2), np.mean(fold_rmse)

# Cross-validated (avg R², avg RMSE) for each candidate degree
results = {}
for degree in [1, 2]:
    results[degree] = cv_scores_for_degree(degree)
    avg_r2, avg_rmse = results[degree]
    print(f"Degree {degree} CV -> Avg R²: {avg_r2:.4f}, Avg RMSE: {avg_rmse:.4f}")


def _degree_rank(d):
    """Sort key: higher R² wins; on a tie, lower RMSE wins."""
    cv_r2, cv_rmse = results[d]
    return (cv_r2, -cv_rmse)


# Choose best degree (highest R²; if tie, lowest RMSE)
best_degree = max(results, key=_degree_rank)
print("\nBest polynomial degree:", best_degree)

# -----------------------
# Train FINAL model on full training set using best degree
# -----------------------
poly = PolynomialFeatures(degree=best_degree, include_bias=False)

# Fit the expansion on the training features, then reuse the fitted
# transformer on the test features (transform only, never refit on test).
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

final_model = LinearRegression()
final_model.fit(X_train_poly, y_train)

# -----------------------
# Evaluate on test set
# -----------------------
test_preds = final_model.predict(X_test_poly)

test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))
test_r2 = r2_score(y_test, test_preds)

print("\nTest set results:")
print("  R-squared:", test_r2)
print("  RMSE     :", test_rmse)

