In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [3]:
import numpy as np
import pandas as pd

# ======================
# 1) Load data
# ======================
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test  = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# ======================
# 2) Feature engineering: TotalSF
# ======================
X = train.drop("SalePrice", axis=1).copy()
X["TotalSF"] = (
    X["TotalBsmtSF"].fillna(0)
    + X["1stFlrSF"]
    + X["2ndFlrSF"]
)

X_test = test.copy()
X_test["TotalSF"] = (
    X_test["TotalBsmtSF"].fillna(0)
    + X_test["1stFlrSF"]
    + X_test["2ndFlrSF"]
)

# log target
y = np.log1p(train["SalePrice"])

# ======================
# 3) Column split
# ======================
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

# ======================
# 4) Preprocessing + ElasticNet
# ======================
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

model = ElasticNet(
    alpha=0.001,      
    l1_ratio=0.5,     
    random_state=42
)

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# ======================
# 5) Cross-validation
# ======================
scores = cross_val_score(
    clf, X, y,
    cv=5,
    scoring="neg_root_mean_squared_error"
)

print("CV RMSE (log):", -scores.mean())

# ======================
# 6) Train full + predict
# ======================
clf.fit(X, y)
test_preds_log = clf.predict(X_test)
test_preds = np.expm1(test_preds_log)

# ======================
# 7) Submission + sanity check
# ======================
submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": test_preds
})

print(submission["SalePrice"].describe())

submission.to_csv("/kaggle/working/submission.csv", index=False)
print("Saved submission.csv")


CV RMSE (log): 0.13601070767141976
count      1459.000000
mean     175703.738938
std       76252.494177
min       46275.796611
25%      124948.684336
50%      155782.841110
75%      206720.646531
max      860693.207232
Name: SalePrice, dtype: float64
Saved submission.csv
