In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# **Load the datasets**

In [2]:
# Read the training and test CSV files from the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Keep the test-set 'Id' column in its own variable before that column is
# dropped from `test`; it is reused as the 'Id' column of the submission file.
test_ids = test['Id']

In [4]:
# Remove the 'Id' column from both frames so it is not carried into the features.
train = train.drop(columns='Id')
test = test.drop(columns='Id')

In [5]:
# Split the training frame into the feature matrix X (everything except
# 'SalePrice') and the target vector y ('SalePrice').
X = train.drop(columns='SalePrice')
y = train['SalePrice']

In [7]:
# Define categorical and numerical columns.
# Every object-dtype column is categorical. MSSubClass is added explicitly so
# that it is treated as categorical as well; it is appended only when it is not
# already in the list, so re-running this cell after MSSubClass has been cast to
# str does not list (and later one-hot encode) the column twice.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
if 'MSSubClass' not in categorical_cols:
    categorical_cols.append('MSSubClass')

# Everything that is not categorical is handled as numerical.
numerical_cols = [col for col in X.columns if col not in categorical_cols]

In [8]:
# Cast MSSubClass to str in both the training features and the test frame so
# that it is handled as a categorical column.
for frame in (X, test):
    frame['MSSubClass'] = frame['MSSubClass'].astype(str)

# **Feature engineering**

In [9]:
def engineer_features(df):
    """Return a copy of ``df`` with five derived columns appended.

    The input frame is left untouched, so calling this function repeatedly on
    the same object (e.g. when a notebook cell is re-run) is safe.

    Added columns:
        TotalSF   -- TotalBsmtSF + 1stFlrSF + 2ndFlrSF
        HouseAge  -- YrSold - YearBuilt
        RemodAge  -- YrSold - YearRemodAdd
        TotalBath -- FullBath + BsmtFullBath + 0.5 * (HalfBath + BsmtHalfBath)
        HasPool   -- 1 where PoolArea > 0, otherwise 0

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every source column named above.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed by the derived ones.

    Raises
    ------
    KeyError
        If any required source column is missing.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    out = df.copy()

    # Total square footage
    out['TotalSF'] = out['TotalBsmtSF'] + out['1stFlrSF'] + out['2ndFlrSF']
    # House age
    out['HouseAge'] = out['YrSold'] - out['YearBuilt']
    # Remodel age
    out['RemodAge'] = out['YrSold'] - out['YearRemodAdd']
    # Total bathrooms (half baths count as 0.5)
    out['TotalBath'] = (
        out['FullBath']
        + 0.5 * out['HalfBath']
        + out['BsmtFullBath']
        + 0.5 * out['BsmtHalfBath']
    )
    # Has pool: vectorised comparison; a missing PoolArea compares False -> 0,
    # which matches the result of the previous row-wise lambda.
    out['HasPool'] = (out['PoolArea'] > 0).astype(int)
    return out

# **Apply feature engineering**

In [10]:
# Add the engineered columns to both the training features and the test set.
X = engineer_features(X)
test = engineer_features(test)

# Register the new columns as numerical features. The list is rebuilt (not
# extended in place) and only names that are not already present are added,
# so re-running this cell cannot introduce duplicate entries.
engineered_cols = ['TotalSF', 'HouseAge', 'RemodAge', 'TotalBath', 'HasPool']
numerical_cols = numerical_cols + [col for col in engineered_cols if col not in numerical_cols]

# **Preprocessing pipelines**

In [11]:
# Numerical columns: fill missing values with the column median, then
# standardise with StandardScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the literal string 'None',
# then one-hot encode into a dense array. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])



# **Combine transformers**

In [12]:
# Route each column group to its own transformer: numerical_cols go through
# the 'num' pipeline and categorical_cols through the 'cat' pipeline.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# **Define the model**

In [13]:
# XGBoost regressor with fixed hyperparameters and a fixed random seed.
model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=6, random_state=42)

# Chain preprocessing and the regressor so they are fitted and applied together.
pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

# Hold out 20% of the training rows; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the 80% training portion only.
# NOTE(review): X_val / y_val are not used in any of the visible cells.
pipeline.fit(X_train, y_train)


# **Evaluate using cross-validation**

In [14]:
# 5-fold cross-validation over the full training data. cross_val_score fits
# clones of the pipeline, so the `pipeline` object itself is left exactly as it
# was before this call.
scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
# The scorer returns negated MSE; flip the sign and take the root for RMSE.
rmse_scores = np.sqrt(-scores)
print(f"Cross-validation RMSE scores: {rmse_scores}")
print(f"Average RMSE: {rmse_scores.mean():.2f} (+/- {rmse_scores.std() * 2:.2f})")

# Refit on ALL training rows before predicting. Without this step the
# submission would come from the earlier fit on the 80% training split,
# leaving 20% of the labelled data unused by the final model.
pipeline.fit(X, y)

# Predict on test set
test_predictions = pipeline.predict(test)

# Create submission file: one row per test Id with its predicted SalePrice.
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions})
submission.to_csv('final.csv', index=False)
print("Submission file created: final.csv")

Cross-validation RMSE scores: [25962.39002865 32263.53086691 29611.98027826 21398.78725536
 28739.46192955]
Average RMSE: 27595.23 (+/- 7388.19)
Submission file created: final.csv
