In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

train_ids = train['Id']
test_ids = test['Id']

y = train['SalePrice']
train.drop(['Id','SalePrice'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

data = pd.concat([train, test], axis=0).reset_index(drop=True)

In [3]:
num_cols = data.select_dtypes(include=[np.number]).columns
data[num_cols] = data[num_cols].fillna(data[num_cols].median())

cat_cols = data.select_dtypes(include=['object']).columns
data[cat_cols] = data[cat_cols].fillna("None")

In [4]:
from sklearn.preprocessing import LabelEncoder, RobustScaler

In [5]:
for col in cat_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

In [6]:
X = data.iloc[:len(y), :]
X_test = data.iloc[len(y):, :]

y_log = np.log1p(y)

In [7]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline

In [8]:
ridge_model = make_pipeline(RobustScaler(), Ridge(alpha=15))

kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse = np.sqrt(-cross_val_score(ridge_model, X, y_log,
                                 scoring="neg_mean_squared_error",
                                 cv=kf))
print("Ridge RMSE:", rmse.mean())

Ridge RMSE: 0.15806706313022528


In [9]:
ridge_model.fit(X, y_log)
preds = ridge_model.predict(X_test)

final_preds = np.expm1(preds)

In [10]:
submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": final_preds
})

submission.to_csv("submission.csv", index=False)
print("submission.csv created!")

submission.csv created!
