In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

In [40]:
# Load the training table; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [41]:
# Preview the first five rows to sanity-check how the columns were parsed.
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [42]:
# Total number of cells in the frame (rows x columns) -- note this is not the row count.
len(df.index) * len(df.columns)

118260

In [43]:
# Target vector: the sale price column.
y = df['SalePrice']
# Feature matrix: everything except the target and the "Id" column
# (presumably just a row identifier -- confirm against the data description).
X = df.drop(columns=["SalePrice", "Id"])

In [44]:
# Per-column count of missing values across the full frame (features, Id and target).
df.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [45]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep="first").sum()

0

In [46]:
# Split the feature columns by dtype so each group gets its own preprocessing.
# "number" selects numeric columns generically rather than only int64/float64,
# and the categorical group is defined as the exact complement. This guarantees
# every column of X lands in one of the two lists; a column in neither would be
# silently discarded by ColumnTransformer, whose default is remainder="drop".
numeric_features = X.select_dtypes(include="number").columns
categorial_features = X.select_dtypes(exclude="number").columns

In [47]:
# Preprocessing using pipelines.
# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [48]:
# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' makes categories unseen during fit
# encode as all-zero rows instead of raising at predict time.
categorial_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [49]:
# Route each column group through its matching sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorial_transformer, categorial_features),
    ]
)

In [50]:
# End-to-end estimator: preprocessing followed by a Ridge regressor with default
# settings. Keeping both in one Pipeline means the preprocessing is re-fit on
# whatever data the pipeline is fit on.
model = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", Ridge())]
)

In [51]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [52]:
# Fit on the training split and predict the hold-out split.
# Pipeline.fit returns the fitted pipeline itself, so predict can be chained onto it.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [53]:
# Score the baseline (untuned) model on the hold-out split.
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report in the order RMSE, R², MSE.
for label, value in (("RMSE:", rmse), ("R²:", r2), ("MSE:", mse)):
    print(label, value)


RMSE: 29842.045170639027
R²: 0.8838970291599837
MSE: 890547659.9664601


In [54]:
# Hyperparameter tuning using GridSearchCV: 5-fold cross-validation over the
# Ridge penalty strength. The "regressor__" prefix addresses the pipeline step.
# NOTE(review): the recorded run selected alpha=10.0, the largest candidate --
# consider widening the grid upward to check the optimum is not beyond it.
param_grid = {"regressor__alpha": [0.1, 1.0, 10.0]}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
# The scorer is the negated RMSE (higher is better), so flip the sign to report RMSE.
print("Best CV score:", -grid_search.best_score_)

Best params: {'regressor__alpha': 10.0}
Best CV score: 32577.86583955041


In [55]:
# Score the tuned model on the hold-out split. GridSearchCV.predict delegates to
# the best estimator, which (with the default refit=True) was re-fit on X_train.
y_pred_best = grid_search.predict(X_test)
# Use the dedicated RMSE metric, consistent with how the baseline model is
# scored, rather than hand-rolling sqrt(MSE).
rmse_best = root_mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print("Tuned Validation RMSE:", rmse_best)
print("Tuned Validation R²:", r2_best)


Tuned Validation RMSE: 30646.412085994125
Tuned Validation R²: 0.8775537638980034


In [56]:
# Load the unlabeled test table and preview its first five rows.
test_df = pd.read_csv(filepath_or_buffer='test.csv')
test_df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [57]:
# Features for the submission rows. They get their own name so the hold-out
# X_test produced by train_test_split is not overwritten: rebinding X_test here
# would make any re-run of the evaluation cells predict on these rows and
# compare them against the unrelated y_test labels.
X_submission = test_df.drop(columns=["Id"])
test_preds = grid_search.predict(X_submission)

In [58]:
# Quick look at the predicted sale prices for the submission rows.
print(test_preds)

[104060.22498776 151247.14601053 169713.83485334 ... 158532.52855631
  97932.1701329  229555.13136618]


In [59]:
# Assemble the two-column submission table: row identifier plus predicted price.
submission = pd.DataFrame(
    data={"Id": test_df["Id"], "SalePrice": test_preds}
)
# index=False keeps the DataFrame index out of the file, leaving only the two columns.
submission.to_csv(path_or_buf="submission.csv", index=False)
