In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import pickle
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import NearestNeighbors

# Load the Boston dataset from the URL
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)  # Fixed SyntaxWarning
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# Original DataFrame (with B and LSTAT) - Training on the full data
df = pd.DataFrame(data, columns=[
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"
])
df['target'] = target

# --- Feature Engineering ---
poly = PolynomialFeatures(degree=2, include_bias=False)
X = df.drop('target', axis=1)
X_poly = poly.fit_transform(X)
X = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(input_features=X.columns))

# --- Feature Scaling ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

# --- Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, df['target'], test_size=0.2, random_state=42)

# --- Model Training (Ridge Regularization) ---
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}  # Range of alpha values to try
grid_search = GridSearchCV(Ridge(), param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

model = grid_search.best_estimator_
print("Best alpha:", grid_search.best_params_['alpha'])

# --- Prediction and Evaluation ---
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# --- Nearest Neighbors for Imputation ---
# Features used for finding nearest neighbors - EXCLUDE B and LSTAT
features_for_nn = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO"]

#Data Frame now must only include KNN features to fix transform error

# Create a PolynomialFeatures object with 11 features for KNN scaling
poly_knn = PolynomialFeatures(degree=2, include_bias=False)

df_knn = df[features_for_nn]

X_poly_knn = poly_knn.fit_transform(df_knn)

#X_scaled_nn = scaler.transform(X_poly_knn) #This is not right since scaler knows everything
scaler_knn = StandardScaler()
X_scaled_nn = scaler_knn.fit_transform(X_poly_knn) # scale data here
#Create and fit KNN Model
knn = NearestNeighbors(n_neighbors=5) # You can adjust the number of neighbors
knn.fit(X_scaled_nn)

# --- Save the Model, Scaler, and KNN ---
filename = 'boston_model.pkl'
pickle.dump((model, scaler, poly, knn, features_for_nn,poly_knn,scaler_knn), open(filename, 'wb')) # Save KNN and Feature
print(f"Model saved as {filename}")

Best alpha: 1
Mean Squared Error: 11.205155608496593
R-squared: 0.8472033607757589
Model saved as boston_model.pkl
