**STEP 1: IMPORT LIBRARIES**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings("ignore")

**STEP 2: LOAD AND COMBINE DATA**

In [2]:
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')


In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
train['TrainFlag'] = 1
test['TrainFlag'] = 0
test['SalePrice'] = np.nan

data = pd.concat([train, test], axis=0, ignore_index=True)
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,TrainFlag
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500.0,1
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500.0,1
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500.0,1
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000.0,1
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000.0,1


**STEP 3: FEATURE ENGINEERING**

In [5]:
data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']
data['TotalBathrooms'] = (data['FullBath'] + 0.5 * data['HalfBath'] + data['BsmtFullBath'] + 0.5 * data['BsmtHalfBath'])
data['HasPool'] = data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
data['IsRemodeled'] = (data['YearBuilt'] != data['YearRemodAdd']).astype(int)
data['HouseAge'] = data['YrSold'] - data['YearBuilt']
data['RemodAge'] = data['YrSold'] - data['YearRemodAdd']
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,SaleType,SaleCondition,SalePrice,TrainFlag,TotalSF,TotalBathrooms,HasPool,IsRemodeled,HouseAge,RemodAge
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,WD,Normal,208500.0,1,2566.0,3.5,0,0,5,5
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,WD,Normal,181500.0,1,2524.0,2.5,0,0,31,31
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,WD,Normal,223500.0,1,2706.0,3.5,0,1,7,6
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,WD,Abnorml,140000.0,1,2473.0,2.0,0,1,91,36
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,WD,Normal,250000.0,1,3343.0,3.5,0,0,8,8


**STEP 4: HANDLE MISSING VALUES**

In [6]:
data.isnull().sum()

Id                  0
MSSubClass          0
MSZoning            4
LotFrontage       486
LotArea             0
                 ... 
TotalBathrooms      2
HasPool             0
IsRemodeled         0
HouseAge            0
RemodAge            0
Length: 88, dtype: int64

In [7]:
for col in data.select_dtypes(include='object').columns:
    data[col] = data[col].fillna('None')

for col in data.select_dtypes(include=['int64', 'float64']).columns:
    data[col] = data[col].fillna(data[col].median())

**STEP 5: LABEL ENCODING**

In [8]:
for col in data.select_dtypes(include='object').columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

**STEP 6: SPLIT TRAIN AND TEST BACK**

In [9]:
train_cleaned = data[data['TrainFlag'] == 1].drop('TrainFlag', axis=1)
test_cleaned = data[data['TrainFlag'] == 0].drop(['TrainFlag', 'SalePrice'], axis=1)

X = train_cleaned.drop(['Id', 'SalePrice'], axis=1)
y = np.log1p(train_cleaned['SalePrice'])  # log1p
X_test_final = test_cleaned.drop('Id', axis=1)

**Scaling (only feature matrix, not target)**

In [10]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_final)

**STEP 7: SPLIT TRAIN-VALIDATION FOR LOCAL TEST**

In [11]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

**STEP 8: TRAIN DEFAULT REGRESSION MODELS**

In [12]:
def train_and_evaluate(model, name):
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, preds))
    r2 = r2_score(y_valid, preds)
    mae = mean_absolute_error(y_valid, preds)
    print(f"{name} RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
    return (name, model, rmse, mae, r2)

results = []
results.append(train_and_evaluate(LinearRegression(), "Linear Regression"))
results.append(train_and_evaluate(Lasso(alpha=0.001), "Lasso Regression"))
results.append(train_and_evaluate(SVR(C=1.0, kernel='rbf'), "Support Vector Regression"))
results.append(train_and_evaluate(RandomForestRegressor(n_estimators=100, random_state=42), "Random Forest"))
results.append(train_and_evaluate(XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42), "XGBoost"))

results_df = pd.DataFrame(results, columns=["Model", "Instance", "RMSE", "MAE", "R2"])
print("\nRegression Models Comparison:\n", results_df.sort_values(by="RMSE")[["Model", "RMSE", "MAE", "R2"]])

Linear Regression RMSE: 0.1628, MAE: 0.1012, R²: 0.8580
Lasso Regression RMSE: 0.1532, MAE: 0.1050, R²: 0.8742
Support Vector Regression RMSE: 0.2091, MAE: 0.1450, R²: 0.7657
Random Forest RMSE: 0.1442, MAE: 0.0963, R²: 0.8886
XGBoost RMSE: 0.1382, MAE: 0.0895, R²: 0.8976

Regression Models Comparison:
                        Model      RMSE       MAE        R2
4                    XGBoost  0.138225  0.089538  0.897615
3              Random Forest  0.144213  0.096272  0.888552
1           Lasso Regression  0.153192  0.104962  0.874242
0          Linear Regression  0.162798  0.101239  0.857975
2  Support Vector Regression  0.209119  0.144972  0.765657


In [13]:
# Sort untuned results by RMSE (lower is better)
best_untuned_row = results_df.sort_values(by="RMSE").iloc[0]
print("\nBest Untuned Model:")
print(f"Model: {best_untuned_row['Model']}")
print(f"RMSE: {best_untuned_row['RMSE']:.4f}")
print(f"MAE:  {best_untuned_row['MAE']:.4f}")
print(f"R²:   {best_untuned_row['R2']:.4f}")


Best Untuned Model:
Model: XGBoost
RMSE: 0.1382
MAE:  0.0895
R²:   0.8976


**STEP 9: BEST MODEL PREDICT**

In [14]:
results_df = pd.DataFrame(results, columns=["Model", "Instance", "RMSE", "MAE", "R2"])
best_untuned = results_df.sort_values("RMSE").iloc[0]["Instance"]

best_untuned.fit(X, y)
untuned_preds = np.expm1(best_untuned.predict(X_test_final))


In [15]:
submission_untuned = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': untuned_preds.astype(float)
})
submission_untuned.to_csv('submission_untuned.csv', index=False)
print("submission_untuned.csv saved.")

submission_untuned.csv saved.


In [16]:
sub = pd.read_csv('/kaggle/working/submission_untuned.csv')
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,126430.578125
1,1462,166793.375
2,1463,180129.796875
3,1464,195927.46875
4,1465,178825.703125


**STEP 10: TUNING FUNCTION**

In [17]:
def tune_model(name, model, param_grid):
    print(f"\n🔧 Tuning {name}...")
    grid = GridSearchCV(model, param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    preds = best_model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, preds))
    mae = mean_absolute_error(y_valid, preds)
    r2 = r2_score(y_valid, preds)
    print(f"{name} Best Params: {grid.best_params_}")
    print(f"{name} RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
    return (name, best_model, rmse, mae, r2)

**STEP 11: PARAMETER GRIDS**

In [18]:
param_grids = {
    'Linear Regression': {},  # no params
    'Lasso Regression': {'alpha': [0.0001, 0.001, 0.01, 0.1, 1]},
    'Support Vector Regression': {'C': [0.1, 1, 10], 'gamma': ['scale', 0.01, 0.1], 'kernel': ['rbf']},
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'XGBoost': {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.05, 0.1],
        'subsample': [0.8, 1.0]
    }
}


**STEP 12: TUNE ALL MODELS**

In [19]:
models_to_tune = {
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(random_state=42),
    'Support Vector Regression': SVR(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(objective='reg:squarederror', random_state=42)
}

tuned_results = []
for name, model in models_to_tune.items():
    tuned_results.append(tune_model(name, model, param_grids[name]))


🔧 Tuning Linear Regression...
Linear Regression Best Params: {}
Linear Regression RMSE: 0.1628, MAE: 0.1012, R²: 0.8580

🔧 Tuning Lasso Regression...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso Regression Best Params: {'alpha': 0.01}
Lasso Regression RMSE: 0.1527, MAE: 0.1039, R²: 0.8751

🔧 Tuning Support Vector Regression...
Support Vector Regression Best Params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Support Vector Regression RMSE: 0.1961, MAE: 0.1268, R²: 0.7939

🔧 Tuning Random Forest...
Random Forest Best Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest RMSE: 0.1442, MAE: 0.0963, R²: 0.8886

🔧 Tuning XGBoost...
XGBoost Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
XGBoost RMSE: 0.1317, MAE: 0.0866, R²: 0.9071


In [20]:
tuned_df = pd.DataFrame(tuned_results, columns=["Model", "BestModel", "RMSE", "MAE", "R2_Score"])
print("\nTuned Model Comparison:\n", tuned_df.sort_values("RMSE")[["Model", "RMSE", "MAE", "R2_Score"]])


Tuned Model Comparison:
                        Model      RMSE       MAE  R2_Score
4                    XGBoost  0.131700  0.086578  0.907053
3              Random Forest  0.144213  0.096272  0.888552
1           Lasso Regression  0.152693  0.103933  0.875059
0          Linear Regression  0.162798  0.101239  0.857975
2  Support Vector Regression  0.196129  0.126774  0.793868


In [21]:
# Sort tuned results by RMSE (lower is better)
best_tuned_row = tuned_df.sort_values(by="RMSE").iloc[0]
print("\nBest Tuned Model:")
print(f"Model: {best_tuned_row['Model']}")
print(f"RMSE: {best_tuned_row['RMSE']:.4f}")
print(f"MAE:  {best_tuned_row['MAE']:.4f}")
print(f"R²:   {best_tuned_row['R2_Score']:.4f}")


Best Tuned Model:
Model: XGBoost
RMSE: 0.1317
MAE:  0.0866
R²:   0.9071


In [22]:
tuned_df = pd.DataFrame(tuned_results, columns=["Model", "BestModel", "RMSE", "MAE", "R2_Score"])
best_tuned = tuned_df.sort_values("RMSE").iloc[0]["BestModel"]

best_tuned.fit(X, y)
tuned_preds = np.expm1(best_tuned.predict(X_test_final))

In [23]:
submission_tuned = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': tuned_preds.astype(float)
})
submission_tuned.to_csv('submission_tuned.csv', index=False)
print("submission_tuned.csv saved.")

submission_tuned.csv saved.


In [24]:
sub1 = pd.read_csv('/kaggle/working/submission_tuned.csv')
sub1.head()

Unnamed: 0,Id,SalePrice
0,1461,132642.40625
1,1462,163252.640625
2,1463,179033.203125
3,1464,193615.109375
4,1465,188701.5625
