In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.preprocessing import OneHotEncoder

# ---------------------------------------------------------------------------
# Synthetic housing data: random features plus a linear price with noise.
# ---------------------------------------------------------------------------
# Seed NumPy's global RNG so every run draws the same sample.
np.random.seed(42)
num_samples = 100

# The seeded RNG is consumed sequentially, so the draws must stay in this
# order (area, bedrooms, location) to reproduce the same dataset.
data = {}
data['area'] = np.random.randint(1000, 4000, num_samples)
data['bedrooms'] = np.random.randint(2, 7, num_samples)
data['location'] = np.random.choice(['Downtown', 'Suburb', 'Rural'], num_samples)
df = pd.DataFrame(data)

# Coefficients of the linear formula used to generate the target.
base_price, area_coeff, bedroom_coeff = 50000, 150, 75000
premium_by_location = {'Downtown': 100000, 'Suburb': 30000, 'Rural': -20000}
location_premium = df['location'].map(premium_by_location)
# Integer noise from [-30000, 30000); randint's upper bound is exclusive.
noise = np.random.randint(-30000, 30000, num_samples)

area_term = df['area'] * area_coeff
bedroom_term = df['bedrooms'] * bedroom_coeff
df['price'] = base_price + area_term + bedroom_term + location_premium + noise

# One-hot encode `location`; drop_first omits one category's indicator column.
df_processed = pd.get_dummies(data=df, columns=['location'], drop_first=True)

# Features are every column except the target.
X = df_processed.drop(columns='price')
y = df_processed['price']

# Ordinary least-squares regression, evaluated with k-fold cross-validation.
model = LinearRegression()

# Shuffle before splitting and pin the seed so the folds are reproducible.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Score both metrics in a single cross-validation pass: each fold's model is
# fitted once and both scorers are evaluated on that same fitted model,
# instead of refitting every fold once per metric.
cv_results = cross_validate(
    model,
    X,
    y,
    cv=kf,
    scoring=('r2', 'neg_root_mean_squared_error'),
)
r2_scores = cv_results['test_r2']
# The scorer reports the *negated* RMSE (so that higher is better); flip the
# sign back to get ordinary positive RMSE values.
rmse_scores = -cv_results['test_neg_root_mean_squared_error']

# Pre-compute the summary statistics so the report below is pure formatting.
r2_per_fold = np.round(r2_scores, 4)
rmse_per_fold = np.round(rmse_scores, 2)
mean_r2 = np.mean(r2_scores)
mean_rmse = np.mean(rmse_scores)

# Each entry is one output line; a leading "\n" yields a blank separator line.
report_lines = [
    f"--- K-Fold Cross-Validation Results (k={k}) ---",
    "\nModel: Linear Regression",
    f"Features: {list(X.columns)}",
    "\n--- R-squared (R2) ---",
    f"Scores for each fold: {r2_per_fold}",
    f"Average R2 Score: {mean_r2:.4f}",
    "\n--- Root Mean Squared Error (RMSE) ---",
    f"Scores for each fold: {rmse_per_fold}",
    f"Average RMSE: ${mean_rmse:,.2f}",
]
print("\n".join(report_lines))