In [25]:
import warnings

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

from src.config import RAW_DATA_DIR

# NOTE(review): with no category/module arguments this installs an "ignore"
# filter that matches every warning raised from this point on, so deprecation
# and numerical warnings from the libraries imported above are hidden too.
# Consider narrowing it to the specific category that is actually noisy.
warnings.filterwarnings("ignore")

In [29]:
# Load data
# Read the raw CSV from the project's raw-data directory, then report the
# frame's dimensions and a short sample of its column names.
df = pd.read_csv(RAW_DATA_DIR / "raw.csv")

leading_columns = df.columns[:10].tolist()  # first 10 names only, keeps output short
print(f"Shape: {df.shape}")
print(f"Columns: {leading_columns}...")
df.head()

Shape: (1460, 81)
Columns: ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities']...


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [30]:
# Quick look at target variable
# Summary statistics of the SalePrice column; the bare last expression is
# what the cell displays.
target_summary = df["SalePrice"].describe()
target_summary

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [31]:
# Check missing values (quick overview)
# Count nulls per column, keep only the columns that have at least one,
# and order them from most to least affected.
missing = (
    df.isna()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
    .sort_values(ascending=False)
)

print(f"Columns with missing values: {len(missing)}")
print(missing.head(10))

Columns with missing values: 19
PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
dtype: int64


In [32]:
# SIMPLEST POSSIBLE APPROACH - use only the numerical columns as features.
# The target (SalePrice) and the row identifier (Id) are excluded from the
# feature list. No column is dropped for missingness in this cell: every
# other numerical column is kept as-is.

# Numeric columns that must not be used as predictors
non_feature_cols = {"SalePrice", "Id"}

# Keep only numerical columns, preserving their original order in df.
# A single membership filter treats the target and the ID the same way
# (neither removal fails if the column happens to be absent).
numerical_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col not in non_feature_cols
]

print(f"Using {len(numerical_cols)} numerical features")

# Create feature matrix and target.
# A frame without a SalePrice column fails here with a KeyError.
X = df[numerical_cols].copy()
y = df["SalePrice"].copy()

Using 36 numerical features


In [33]:
# Simple imputation - fill missing with median
# Medians are computed per column over every row currently in X
# (no train/test distinction is made at this point).
column_medians = X.median()
X = X.fillna(column_medians)

# Total count of nulls across the whole frame after filling
remaining_nulls = X.isnull().sum().sum()

print(f"Final X shape: {X.shape}")
print(f"Any missing values left? {remaining_nulls}")

Final X shape: (1460, 36)
Any missing values left? 0


In [34]:
# Train-test split
# Split the raw (un-imputed) feature columns rather than the globally imputed
# X, so that the fill values are learned from the training rows only. Filling
# with medians computed over all rows would let test-set statistics leak into
# the training features. The same random_state and row count keep the
# partition membership identical to splitting X.
X_train, X_test, y_train, y_test = train_test_split(
    df[numerical_cols], y, test_size=0.2, random_state=42
)

# Impute both partitions with the per-column medians of the *training* rows
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

Train size: 1168
Test size: 292


In [35]:
# Train baseline model - Simple Linear Regression
# Default-configured estimator; fit() hands back the fitted estimator,
# so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Predictions on both partitions, consumed by the evaluation cells
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("Model trained!")

Model trained!


In [36]:
# Evaluate
def evaluate(y_true, y_pred, dataset_name):
    """Print a short error report for one dataset and return the metrics.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        dataset_name: Label used in the report heading (e.g. "TRAIN").

    Returns:
        A ``(mae, rmse, r2)`` tuple, in that order.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    report = [
        f"\n{dataset_name} Results:",
        f"  MAE:  ${mae:,.2f}",
        f"  RMSE: ${rmse:,.2f}",
        f"  R²:   {r2:.4f}",
    ]
    print("\n".join(report))

    return mae, rmse, r2


# Score the fitted model on each partition; every call prints its own report
# and yields a (mae, rmse, r2) tuple.
train_results = evaluate(y_true=y_train, y_pred=y_train_pred, dataset_name="TRAIN")
test_results = evaluate(y_true=y_test, y_pred=y_test_pred, dataset_name="TEST")


TRAIN Results:
  MAE:  $21,066.67
  RMSE: $33,920.14
  R²:   0.8071

TEST Results:
  MAE:  $22,975.86
  RMSE: $36,836.91
  R²:   0.8231


In [37]:
# Visualize predictions
# Two side-by-side panels for the test partition:
#   left  - predicted vs. actual price with a y = x reference line
#   right - residuals vs. predicted price with a zero reference line


def _marker_trace(x, y, name):
    """Build a semi-transparent blue marker trace."""
    return go.Scatter(
        x=x,
        y=y,
        mode="markers",
        marker={"color": "blue", "opacity": 0.5},
        name=name,
    )


def _reference_line(x, y, name):
    """Build a dashed red line trace through the given points."""
    return go.Scatter(
        x=x,
        y=y,
        mode="lines",
        line={"color": "red", "dash": "dash", "width": 2},
        name=name,
    )


# Residual = actual - predicted
residuals = y_test - y_test_pred

# Endpoints of the perfect-prediction line span the actual price range
min_val = y_test.min()
max_val = y_test.max()

fig = make_subplots(rows=1, cols=2, subplot_titles=("Predictions vs Actual", "Residual Plot"))

# Left panel (traces are added in the same order as they appear in the legend)
fig.add_trace(_marker_trace(y_test, y_test_pred, "Predictions"), row=1, col=1)
fig.add_trace(
    _reference_line([min_val, max_val], [min_val, max_val], "Perfect Prediction"),
    row=1,
    col=1,
)

# Right panel
fig.add_trace(_marker_trace(y_test_pred, residuals, "Residuals"), row=1, col=2)
fig.add_trace(
    _reference_line([y_test_pred.min(), y_test_pred.max()], [0, 0], "Zero Line"),
    row=1,
    col=2,
)

# Axis titles, panel by panel
fig.update_xaxes(title_text="Actual Price", row=1, col=1)
fig.update_yaxes(title_text="Predicted Price", row=1, col=1)
fig.update_xaxes(title_text="Predicted Price", row=1, col=2)
fig.update_yaxes(title_text="Residuals", row=1, col=2)

# Overall figure size, legend and title
fig.update_layout(
    height=500,
    width=1200,
    showlegend=True,
    title_text="Baseline Model Results",
)

fig.show()

In [38]:
# Check for overfitting
# R² is the third element (index 2) of each results tuple; compare the
# train and test values and flag a large positive gap.
train_r2 = train_results[2]
test_r2 = test_results[2]
r2_gap = train_r2 - test_r2

print("\nOverfitting Check:")
print(f"Train R²: {train_r2:.4f}")
print(f"Test R²:  {test_r2:.4f}")
print(f"Difference: {r2_gap:.4f}")

# A train-minus-test gap above 0.1 is reported as possible overfitting
verdict = "⚠️  Possible overfitting detected!" if r2_gap > 0.1 else "✅ No significant overfitting"
print(verdict)


Overfitting Check:
Train R²: 0.8071
Test R²:  0.8231
Difference: -0.0160
✅ No significant overfitting


In [39]:
# Collect results for comparison
# The summary is gathered into a dict and printed; nothing is written to disk here.
# Index 0 of each results tuple is MAE, index 2 is R².
results = dict(
    model="LinearRegression",
    features="numerical_only",
    n_features=len(numerical_cols),
    train_mae=train_results[0],
    test_mae=test_results[0],
    train_r2=train_results[2],
    test_r2=test_results[2],
)

print("\n📊 BASELINE RESULTS:")
for key, value in results.items():
    line = f"  {key}: {value}"
    print(line)


📊 BASELINE RESULTS:
  model: LinearRegression
  features: numerical_only
  n_features: 36
  train_mae: 21066.668643748908
  test_mae: 22975.856509153044
  train_r2: 0.8070976808213148
  test_r2: 0.8230899421141546
