In [6]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [7]:
# Relative folder that must contain train.csv and test.csv.
DATA_DIR = Path("data")

# Read both splits with the same reader call; order of the names fixes which
# frame lands in `train` and which in `test`.
train, test = (
    pd.read_csv(DATA_DIR / csv_name) for csv_name in ("train.csv", "test.csv")
)

# Preview the first rows of the training frame as the cell output.
train.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
# Engineered feature, added in place to both frames:
#   Bathrooms = FullBath + BsmtFullBath + 0.5 * (HalfBath + BsmtHalfBath)
# Missing counts are treated as zero before adding.
for df in (train, test):
    whole_baths = df["FullBath"].fillna(0) + df["BsmtFullBath"].fillna(0)
    half_baths = df["HalfBath"].fillna(0) + df["BsmtHalfBath"].fillna(0)
    df["Bathrooms"] = whole_baths + 0.5 * half_baths

# Columns fed to the model; any missing value in them is replaced by 0.
FEATURES = ["GrLivArea", "BedroomAbvGr", "Bathrooms"]
X = train.loc[:, FEATURES].fillna(0)

# Regression target.
y = train["SalePrice"]

X.head()


Unnamed: 0,GrLivArea,BedroomAbvGr,Bathrooms
0,1710,3,3.5
1,1262,3,2.5
2,1786,3,3.5
3,1717,3,2.0
4,2198,4,3.5


In [9]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)


In [10]:
# Fit an ordinary least-squares model on the training portion only.
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_val)

# RMSE is taken as the square root of the MSE, so this does not depend on the
# 'squared' parameter of mean_squared_error being available.
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

print(f"Validation RMSE: {rmse:,.2f}")
print(f"R² Score: {r2:.4f}")


Validation RMSE: 51,222.58
R² Score: 0.6579


In [11]:
# Build the test design matrix with the same columns and the same
# missing-value fill used for X.
X_test = test[FEATURES].fillna(0)

# Refit the existing model object on every labelled row (train + validation),
# then predict the test set with it.
test_pred = model.fit(X, y).predict(X_test)


In [12]:
# Submission frame: the test Ids followed by the predicted prices.
sub = pd.DataFrame({"Id": test["Id"]})
sub["SalePrice"] = test_pred

# Write without the index so the file holds only the two columns above.
sub.to_csv("simple_linear_regression.csv", index=False)
sub.head()


Unnamed: 0,Id,SalePrice
0,1461,103507.7717
1,1462,137805.645467
2,1463,197737.620829
3,1464,195242.572339
4,1465,171823.109989
