In [4]:
import pandas as pd
import numpy as np

train = pd.read_csv("C:\\Users\\pratik\\Downloads\\house-prices-advanced-regression-techniques\\train.csv")
test = pd.read_csv("C:\\Users\\pratik\\Downloads\\house-prices-advanced-regression-techniques\\test.csv")

In [2]:
print("Train shape:", train.shape)
print("Test shape:", test.shape)

train.head()

Train shape: (1460, 81)
Test shape: (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
#Check missing values
train.isnull().sum().sort_values(ascending=False).head(25)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageQual        81
GarageFinish      81
GarageType        81
GarageYrBlt       81
GarageCond        81
BsmtFinType2      38
BsmtExposure      38
BsmtCond          37
BsmtQual          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
Condition2         0
BldgType           0
Neighborhood       0
LandSlope          0
LotConfig          0
Condition1         0
dtype: int64

In [7]:
#Separate numeric and categorical columns
num_cols = list(train.select_dtypes(include=[np.number]).columns)
cat_cols = list(train.select_dtypes(include=['object']).columns)

# Remove SalePrice from num_cols (because test doesn't have it)
if "SalePrice" in num_cols:
    num_cols.remove("SalePrice")

#Fill missing values
train[num_cols] = train[num_cols].fillna(train[num_cols].median())
test[num_cols] = test[num_cols].fillna(train[num_cols].median())

train[cat_cols] = train[cat_cols].fillna("None")
test[cat_cols] = test[cat_cols].fillna("None")

print("Missing values after filling (train):")
train.isnull().sum().sort_values(ascending=False).head()

Missing values after filling (train):


Id             0
MSSubClass     0
MSZoning       0
LotFrontage    0
LotArea        0
dtype: int64

In [8]:
#Feature Encoding and Model Baseline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Separate target variable
y = train['SalePrice']
X = train.drop('SalePrice', axis=1)

# One-Hot Encode categorical features
X = pd.get_dummies(X, drop_first=True)
test_encoded = pd.get_dummies(test, drop_first=True)

# Align columns between train and test (make them same)
X, test_encoded = X.align(test_encoded, join='left', axis=1)
test_encoded = test_encoded.fillna(0)

print("Train shape after encoding:", X.shape)
print("Test shape after encoding:", test_encoded.shape)

# Train-validation split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline Model: Random Forest
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predictions & Evaluation
preds = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, preds))

print("Baseline RMSE:", rmse)

Train shape after encoding: (1460, 261)
Test shape after encoding: (1459, 261)
Baseline RMSE: 29275.221040147808


In [9]:
# Copy the encoded data again (from previous step)
X_full = X.copy()
y_full = y.copy()

# Apply log transformation to target variable
y_full_log = np.log1p(y_full)

# Identify skewed numeric features
numeric_feats = X_full.select_dtypes(include=[np.number]).columns
skew_vals = X_full[numeric_feats].skew().sort_values(ascending=False)

# Select features with high skewness
skewed_features = skew_vals[skew_vals > 0.75].index

print("Number of skewed features:", len(skewed_features))

# Apply log1p transformation to reduce skew
X_full[skewed_features] = np.log1p(X_full[skewed_features])

print("Skew transformation completed!")


Number of skewed features: 21
Skew transformation completed!


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Split again (same random_state for fair comparison)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_full, y_full_log, test_size=0.2, random_state=42
)

# Train model
model2 = RandomForestRegressor(random_state=42)
model2.fit(X_train, y_train)

# Predict & evaluate
preds = model2.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, preds))

print("Improved RMSE (log target):", rmse)

Improved RMSE (log target): 0.14759158736185857


In [11]:
import pandas as pd
import numpy as np

# Work on encoded dataset used in Step 4
data = X_full.copy()

# Create new feature — House Age and Remodel Age
data['HouseAge'] = 2025 - train['YearBuilt']   # simple assumption (you can use YrSold column later)
data['RemodAge'] = 2025 - train['YearRemodAdd']

# Total Square Foot living area
data['TotalSF'] = (
    train['TotalBsmtSF'] + 
    train['1stFlrSF'] + 
    train['2ndFlrSF']
)

# Total Bathrooms
data['TotalBath'] = (
    train['FullBath'] +
    (0.5 * train['HalfBath']) +
    train['BsmtFullBath'] +
    (0.5 * train['BsmtHalfBath'])
)

# Whether house has been remodeled
data['Remodeled'] = (train['YearRemodAdd'] != train['YearBuilt']).astype(int)

print("New features added! Total columns:", data.shape[1])

New features added! Total columns: 266


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Use log-transformed target variable
X_train, X_valid, y_train, y_valid = train_test_split(
    data, y_full_log, test_size=0.2, random_state=42
)

model3 = GradientBoostingRegressor(random_state=42)
model3.fit(X_train, y_train)

preds = model3.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, preds))

print("RMSE after Feature Engineering + GradientBoosting:", rmse)


RMSE after Feature Engineering + GradientBoosting: 0.13722905215986322
