In [7]:
import os
# Enumerate every file available under the Kaggle input directory and print
# its full path, one per line.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


# **Import Libraries**

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# **Data Preparation**

In [9]:
# Read the competition's training and test CSV files into DataFrames and
# report their dimensions as a quick sanity check.
train_path = "/kaggle/input/home-data-for-ml-course/train.csv"
test_path = "/kaggle/input/home-data-for-ml-course/test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}\n")

# Adds new derived features to the housing data
def add_features(df):
    """Return a copy of ``df`` with engineered housing features appended.

    The input frame is not modified. The following columns are added, in
    this order: ``TotalSF``, ``TotalBaths``, ``HouseAge``,
    ``YearsSinceRemodel``, ``TotalPorchSF``, ``HasPool``, ``HasGarage``,
    ``HasBsmt``, ``HasFireplace``, ``OverallGrade``, ``ExpensiveArea`` and
    ``TotalCond``.

    Missing values in the area / bathroom / count columns that feed the
    sums and the binary flags are treated as 0, so a single missing
    component (e.g. a house with no recorded basement area) no longer turns
    the whole aggregated feature into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Housing data containing the raw columns referenced below
        (``TotalBsmtSF``, ``1stFlrSF``, ``2ndFlrSF``, ``FullBath``, ...).

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the derived ones.

    Raises
    ------
    KeyError
        If any of the required raw columns is absent from ``df``.
    """
    df = df.copy()

    def filled(column):
        # A missing measurement is interpreted as "feature absent" (0) so it
        # does not propagate NaN through sums or trigger invalid-comparison
        # warnings in the ``> 0`` flags.
        return df[column].fillna(0)

    # Total square footage
    df['TotalSF'] = filled('TotalBsmtSF') + filled('1stFlrSF') + filled('2ndFlrSF')

    # Total bathrooms (half baths count as 0.5)
    df['TotalBaths'] = (
        filled('FullBath')
        + 0.5 * filled('HalfBath')
        + filled('BsmtFullBath')
        + 0.5 * filled('BsmtHalfBath')
    )

    # Age of house at time of sale
    df['HouseAge'] = df['YrSold'] - df['YearBuilt']

    # Years since remodel at time of sale
    df['YearsSinceRemodel'] = df['YrSold'] - df['YearRemodAdd']

    # Total porch area
    df['TotalPorchSF'] = (
        filled('OpenPorchSF')
        + filled('EnclosedPorch')
        + filled('3SsnPorch')
        + filled('ScreenPorch')
    )

    # Has pool (binary feature)
    df['HasPool'] = (filled('PoolArea') > 0).astype(int)

    # Has garage (binary feature)
    df['HasGarage'] = (filled('GarageArea') > 0).astype(int)

    # Has basement (binary feature)
    df['HasBsmt'] = (filled('TotalBsmtSF') > 0).astype(int)

    # Has fireplace (binary feature)
    df['HasFireplace'] = (filled('Fireplaces') > 0).astype(int)

    # Overall quality * overall condition interaction
    df['OverallGrade'] = df['OverallQual'] * df['OverallCond']

    # 3 neighborhoods with highest mean sale price
    df['ExpensiveArea'] = df['Neighborhood'].isin(["NoRidge", "StoneBr", "NridgHt"]).astype(int)

    # Total number of "conditions" (features close to the house that may cause disturbances)
    df['TotalCond'] = (df['Condition1'] != "Norm").astype(int) + (df['Condition2'] != "Norm").astype(int)

    return df

# Run the same feature engineering over the labelled and unlabelled data
train_df_enhanced, test_df_enhanced = (
    add_features(frame) for frame in (train_df, test_df)
)

# Separate predictors from the target, then hold out 20% of the labelled rows
# for validation (X_test / y_test are this hold-out split).
features = train_df_enhanced.drop(columns='SalePrice')
target = train_df_enhanced['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

Training data shape: (1460, 81)
Test data shape: (1459, 80)

X_train shape: (1168, 92)
X_test shape: (292, 92)


  return op(a, b)
  return op(a, b)


# **Preprocessing Pipeline & Model Pipeline Creation**

In [10]:
# Identify feature types by dtype *family* instead of matching exact dtype
# names: comparing against "object" / 'int64' / 'float64' silently drops any
# column stored as e.g. int32, float32, nullable Int64 or a pandas string
# dtype, which would remove it from the model without any error.
# 'Id' is a row identifier, not a predictor, so it is excluded up front.
candidate_features = X_train.drop(columns=['Id'], errors='ignore')

# Every non-numeric column is treated as categorical (imputed + one-hot encoded).
categorical_cols = candidate_features.select_dtypes(exclude='number').columns.tolist()
numerical_cols = candidate_features.select_dtypes(include='number').columns.tolist()

cols = categorical_cols + numerical_cols

print(f"Selected {len(categorical_cols)} categorical features: {categorical_cols[:5]}...")
print(f"Selected {len(numerical_cols)} numerical features: {numerical_cols[:5]}...")
print(f"Total features: {len(cols)}")

# Create processed datasets: the same column selection is applied to the
# training split, the validation split and the competition test set.
train = X_train[cols].copy()
valid = X_test[cols].copy()
test = test_df_enhanced[cols].copy()

# Numerical data pipeline: fill gaps with the column median, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical data pipeline: fill gaps with the most frequent category, then
# one-hot encode; handle_unknown='ignore' keeps categories that were not seen
# during fitting from raising at prediction time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combined preprocessing: route each column group to its transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])
print("Preprocessing pipeline created\n")

# Model pipeline: preprocessing followed by the XGBoost regressor, so the
# preprocessing is fitted together with the model on whatever data is passed.
model = XGBRegressor(random_state=42)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])
print("XGBoost model pipeline ready\n")

Selected 43 categorical features: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour']...
Selected 48 numerical features: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond']...
Total features: 91
Preprocessing pipeline created

XGBoost model pipeline ready



# **Hyperparameter Tuning**

In [11]:
# Search space for the XGBoost step of the pipeline (hence the 'model__' prefix).
# Note scipy's conventions: uniform(loc, scale) samples from [loc, loc + scale],
# and randint(low, high) samples integers from low up to high - 1.
param_dist = dict(
    model__n_estimators=randint(low=100, high=1000),
    model__learning_rate=uniform(loc=0.01, scale=0.19),
    model__max_depth=randint(low=3, high=9),
    model__reg_alpha=uniform(loc=0, scale=10),
    model__reg_lambda=uniform(loc=1, scale=100),
    model__subsample=uniform(loc=0.7, scale=0.3),
    model__colsample_bytree=uniform(loc=0.7, scale=0.3),
)

# Randomized search: 100 sampled candidates, each scored by 5-fold
# cross-validated (negated) mean absolute error, using all available cores.
# error_score='raise' makes a failing fit abort the search instead of being
# recorded as a bad score.
search_options = {
    'param_distributions': param_dist,
    'n_iter': 100,
    'cv': 5,
    'scoring': 'neg_mean_absolute_error',
    'n_jobs': -1,
    'verbose': 1,
    'random_state': 42,
    'error_score': 'raise',
}
random_search = RandomizedSearchCV(pipeline, **search_options)
print("Randomized search configured with 100 iterations\n")

Randomized search configured with 100 iterations



# **Model Training & Validation**

In [12]:
# Run the randomized search on the training split and keep the best pipeline
random_search.fit(train, y_train)
best_model = random_search.best_estimator_
print("\nTraining complete!")
print(f"Best parameters: {random_search.best_params_}\n")

# Score the tuned pipeline on the held-out validation split
val_predictions = best_model.predict(valid)
val_mae = mean_absolute_error(y_true=y_test, y_pred=val_predictions)
print("Validation MAE:", val_mae)

# Predict on the competition test set and write the predictions in the
# Id / SalePrice format expected for a competition submission
test_predictions = best_model.predict(test)
output = test_df[['Id']].assign(SalePrice=test_predictions)
output.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created successfully!")
print(f"Predicted price range: ${output['SalePrice'].min():,.0f} to ${output['SalePrice'].max():,.0f}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits

Training complete!
Best parameters: {'model__colsample_bytree': 0.962295281456325, 'model__learning_rate': 0.10789485042420006, 'model__max_depth': 4, 'model__n_estimators': 160, 'model__reg_alpha': 1.9228901880867078, 'model__reg_lambda': 5.086861626647886, 'model__subsample': 0.7506805189216493}

Validation MAE: 14941.83344927226
Submission file 'submission.csv' created successfully!
Predicted price range: $57,044 to $550,348
