Imports

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values, aligned element-wise with ``y_test``.
        precision: Number of decimal places kept in the returned score.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimal places.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)

# Seed passed to train_test_split so the train/test partition is reproducible across runs.
RANDOM_STATE = 4

Load Dataset

In [17]:
# pandas is already imported as `pd` in the imports cell at the top of the
# notebook, so it is not re-imported here.

# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/train.csv")

# Preview the first rows (last expression of the cell, so it is rendered as a table).
df.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000



Inspect Dataset

In [9]:
# Count nulls per column once, then report the 20 columns with the most missing values.
null_counts = df.isnull().sum()
most_missing = null_counts.sort_values(ascending=False).head(20)

print("Shape:", df.shape)
print("\nMissing values:")
print(most_missing)

Shape: (1460, 81)

Missing values:
Id              0
MSSubClass      0
MSZoning        0
LotFrontage     0
LotArea         0
Street          0
Alley           0
LotShape        0
LandContour     0
Utilities       0
LotConfig       0
LandSlope       0
Neighborhood    0
Condition1      0
Condition2      0
BldgType        0
HouseStyle      0
OverallQual     0
OverallCond     0
YearBuilt       0
dtype: int64


In [1]:
# Split Dataset into Train and Test

In [20]:
# Feature matrix (two numeric and two categorical columns) and the target to predict
X = df[['GrLivArea', 'OverallQual', 'Neighborhood', 'HouseStyle']]
y = df['SalePrice']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Report the size of each partition
for label, frame in (("Train shape:", train_X), ("Test shape:", test_X)):
    print(label, frame.shape)

Train shape: (1168, 4)
Test shape: (292, 4)


In [27]:
# Preprocessing

In [23]:
# Columns fed to the model, grouped by how they are preprocessed.
continuous_features = ['GrLivArea', 'OverallQual']
categorical_features = ['Neighborhood', 'HouseStyle']

# Fail fast with a readable message if an expected column is absent from the data.
for col in continuous_features + categorical_features:
    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in dataset. Check the column names!")

# Select features
X = df[continuous_features + categorical_features]
y = df['SalePrice']

# Split into train and test
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# train_test_split returns row selections of X. Take explicit copies before
# assigning into them so the column writes below are unambiguous and do not
# raise SettingWithCopyWarning on pandas versions without copy-on-write.
train_X = train_X.copy()
test_X = test_X.copy()

# Impute missing numeric values. The statistic is computed on the training
# split only and reused for the test split, so no test information leaks in.
for col in continuous_features:
    train_median = train_X[col].median()
    train_X[col] = train_X[col].fillna(train_median)
    test_X[col] = test_X[col].fillna(train_median)

# Impute missing categorical values with the most frequent training level.
for col in categorical_features:
    train_mode = train_X[col].mode()[0]
    train_X[col] = train_X[col].fillna(train_mode)
    test_X[col] = test_X[col].fillna(train_mode)

# Standardise numeric columns: fit on train, apply the same transform to test.
scaler = StandardScaler()
train_X[continuous_features] = scaler.fit_transform(train_X[continuous_features])
test_X[continuous_features] = scaler.transform(test_X[continuous_features])

# One-hot encode. The baseline level (drop_first) is decided by the training
# split alone. The test split is encoded WITHOUT drop_first and then aligned
# to the training columns: if it were encoded with its own drop_first and did
# not contain the training baseline level, a different level would be dropped,
# reindex would zero-fill that column, and those rows would silently be
# encoded as the baseline category.
train_X = pd.get_dummies(train_X, columns=categorical_features, drop_first=True)
test_X = pd.get_dummies(test_X, columns=categorical_features)

# Keep exactly the training columns, in training order. Levels never seen in
# training are dropped; levels missing from the test split become all-zero columns.
test_X = test_X.reindex(columns=train_X.columns, fill_value=0)

print("Preprocessing complete!")
print("Train shape:", train_X.shape)
print("Test shape:", test_X.shape)

Preprocessing complete!
Train shape: (1168, 33)
Test shape: (292, 33)


In [25]:
# Model Training and Evaluation

In [26]:
# RandomForestRegressor and compute_rmsle come from the imports cell at the
# top of the notebook; they are not re-imported or redefined here, so there is
# a single definition of the metric in the notebook.

# Initialize model
# NOTE(review): this seed (42) differs from RANDOM_STATE used for the split.
# It is kept as-is so the recorded RMSLE output stays reproducible; confirm
# whether the two seeds are meant to be independent.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train model on the preprocessed training split
model.fit(train_X, train_y)

# Predict on the held-out split
y_pred = model.predict(test_X)

# Evaluate: RMSLE between true and predicted sale prices, rounded to 2 decimals
rmsle_value = compute_rmsle(test_y.to_numpy(), y_pred)
print("RMSLE:", rmsle_value)

RMSLE: 0.17
