## Import Libraries

In [1]:
# %load_ext autoreload
%reload_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from IPython import display

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

## Data Load

In [2]:
df_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [5]:
df_train.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [6]:
df_test.isna().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [7]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
df_train.shape

(1460, 81)

In [9]:
df_test.shape

(1459, 80)

## Preprocessing

In [10]:
# xóa các cột có nhiều Missing Values

df_train = df_train.drop(columns=['Alley','MasVnrType', 'FireplaceQu','PoolQC','Fence','MiscFeature'])
df_test = df_test.drop(columns=['Alley','MasVnrType', 'FireplaceQu','PoolQC','Fence','MiscFeature'])
print("---")

---


In [11]:
# Điền median các cột numeric 
num_cols=df_train.drop(columns=['SalePrice']).select_dtypes(include=['int64', 'float64','int32']).columns

median_values = df_train[num_cols].median()

df_train[num_cols] = df_train[num_cols].fillna(median_values)
df_test[num_cols] = df_test[num_cols].fillna(median_values)
print("---")

---


In [12]:
for cols in df_train.drop(columns=['SalePrice']).columns:
    mode = df_train[cols].mode()[0]
    df_train[cols].fillna(mode, inplace=True)
    df_test[cols].fillna(mode, inplace=True)
print("---")

---


In [13]:
pd.set_option('display.max_rows', None)

In [14]:
df_train.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual      0
TotRmsAbvGrd

In [15]:
df_test.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual      0
TotRmsAbvGrd

## ENCODING

 ### onehot các cột object 

In [16]:

cat_columns=df_train.select_dtypes(include=['object']).columns
num_columns=df_train.select_dtypes(include=['int64','float64']).columns
num_columns_test=df_train.select_dtypes(include=['int64','float64']).drop(columns=['SalePrice']).columns

In [17]:
ohe=OneHotEncoder(drop='first', handle_unknown='ignore', sparse=False)
encoded_train = ohe.fit_transform(df_train[cat_columns])
encoded_test = ohe.transform(df_test[cat_columns])

In [18]:
encoded_df_train = pd.DataFrame(encoded_train,columns=ohe.get_feature_names_out(cat_columns), index=df_train.index)
encoded_df_test = pd.DataFrame(encoded_test, columns=ohe.get_feature_names_out(cat_columns), index=df_test.index)

In [19]:
num_df_train = df_train[num_columns]
num_df_test = df_test[num_columns_test]

In [20]:
df_train = pd.concat([num_df_train,encoded_df_train], axis=1)
df_test = pd.concat([num_df_test,encoded_df_test], axis=1)

In [21]:
df_train.shape

(1460, 231)

In [22]:
df_test.shape

(1459, 230)

## SCALING

In [23]:
scaler = StandardScaler()


# num_columns_test không chứa cột SalePrice, được định nghĩa ở onehot 
df_train[num_columns_test] = scaler.fit_transform(df_train[num_columns_test])
df_test[num_columns_test]  = scaler.transform(df_test[num_columns_test])

print("✓ Hoàn thành Scaling")

✓ Hoàn thành Scaling


In [24]:
df_train = df_train.drop(columns=['Id'])
df_test = df_test.drop(columns=['Id'])

In [25]:
df_train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.073375,-0.220875,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,0.46032,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,-0.084636,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.309859,-0.44794,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57075,-0.499274,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.073375,0.641972,0.375148,1.374795,-0.5172,0.951632,0.733308,1.366489,0.463568,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [26]:
df_test.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-0.872563,0.46032,0.110763,-0.795151,0.381743,-0.340077,-1.15638,-0.57075,0.053428,0.604293,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,0.505733,0.37585,-0.071836,0.381743,-0.43944,-1.30174,0.027027,1.051363,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,0.187842,0.332053,-0.795151,-0.5172,0.852269,0.6364,-0.57075,0.761852,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.073375,0.369494,-0.054002,-0.071836,0.381743,0.88539,0.6364,-0.460051,0.347326,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.492282,-1.219961,-0.552407,1.374795,-0.5172,0.686666,0.345679,-0.57075,-0.39619,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## Train

In [27]:
print("\n=== TRAINING FULL MODEL ===")


X_train_full = df_train.drop('SalePrice', axis=1)
y_train_full = df_train['SalePrice']

model = Lasso(alpha=0.0005, max_iter=10000)
model.fit(X_train_full, y_train_full)

y_pred = model.predict(X_train_full)

rmsle = np.sqrt(mean_squared_error(np.log1p(y_train_full), np.log1p(y_pred)))

r2 = r2_score(y_train_full, y_pred)

print(f"RMSLE: {rmsle:.4f}")
print(f"R-squared: {r2:.4f}")
print("✓ Đã huấn luyện mô hình trên toàn bộ dữ liệu train")



=== TRAINING FULL MODEL ===
RMSLE: 0.1120
R-squared: 0.9301
✓ Đã huấn luyện mô hình trên toàn bộ dữ liệu train


## Submission

In [28]:
test_data_orig = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

test_preds = model.predict(df_test)

submission = pd.DataFrame({
    "Id": test_data_orig["Id"],
    "SalePrice": test_preds
})

submission.to_csv("submission.csv", index=False)

print("Submission file 'submission.csv' created.")

Submission file 'submission.csv' created.
