In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline


In [2]:
# CELL 2: Read the already split and preprocessed train/test CSVs and report their shapes
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('train_data_no_leakage.csv', 'test_data_no_leakage.csv')
)
print("Train shape:", train_data.shape)
print("Test shape:", test_data.shape)


Train shape: (14700, 30)
Test shape: (3675, 30)


In [3]:
# CELL 3: Separate the 'price' target from the features and cast boolean features to int
y_train = train_data['price']
y_test = test_data['price']

# drop() hands back a new frame, so train_data / test_data are not modified here.
X_train = train_data.drop(columns=['price'])
X_test = test_data.drop(columns=['price'])

# Boolean columns are detected independently in each split and cast to int.
X_train = X_train.astype(
    {col: int for col in X_train.select_dtypes(include='bool').columns}
)
X_test = X_test.astype(
    {col: int for col in X_test.select_dtypes(include='bool').columns}
)


In [4]:
# CELL 4: 5-fold cross-validation of the scaler + linear regression pipeline,
# run on the training data only
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("model", LinearRegression()),
]
pipe = Pipeline(steps=pipeline_steps)

# The scorer reports RMSE with its sign flipped, so negate to get positive RMSE per fold.
neg_rmse_per_fold = cross_val_score(
    pipe,
    X_train,
    y_train,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
cv_rmse = -neg_rmse_per_fold
print("Cross-Validation (training set) RMSE mean ± std:", cv_rmse.mean(), cv_rmse.std())


Cross-Validation (training set) RMSE mean ± std: 334.5186396265075 1.029448172293405


In [5]:
# CELL 5: Fit the final model on the whole training set and score it on the test set
# The scaler is fitted on the training features only; the test features are
# transformed with that same fitted scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = sqrt(MSE)
r2 = r2_score(y_test, y_pred)

# NOTE(review): the output recorded for this cell shows R² ≈ -0.003, i.e. on par
# with always predicting the mean price — confirm the features carry signal.
print("\n📊 Model Evaluation on Test Set:")
for metric_label, metric_value in (
    ("Mean Absolute Error:", mae),
    ("Root Mean Squared Error:", rmse),
    ("R² Score:", r2),
):
    print(metric_label, metric_value)



📊 Model Evaluation on Test Set:
Mean Absolute Error: 287.83135158634593
Root Mean Squared Error: 332.81432069439376
R² Score: -0.003074326541932404
