In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [5]:
# Read data
train_df = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
test_df = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")

# Feature engineering function
def add_features(df):
    df = df.copy()
    
    # Total square footage
    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    
    # Total bathrooms
    df['TotalBaths'] = (df['FullBath'] + 0.5 * df['HalfBath'] + 
                       df['BsmtFullBath'] + 0.5 * df['BsmtHalfBath'])
    
    # Age of house
    df['HouseAge'] = df['YrSold'] - df['YearBuilt']
    
    # Years since remodel
    df['YearsSinceRemodel'] = df['YrSold'] - df['YearRemodAdd']
    
    # Total porch area
    df['TotalPorchSF'] = (df['OpenPorchSF'] + df['EnclosedPorch'] + 
                         df['3SsnPorch'] + df['ScreenPorch'])
    
    # Has pool (binary feature)
    df['HasPool'] = (df['PoolArea'] > 0).astype(int)
    
    # Has garage (binary feature)
    df['HasGarage'] = (df['GarageArea'] > 0).astype(int)
    
    # Has basement (binary feature)
    df['HasBsmt'] = (df['TotalBsmtSF'] > 0).astype(int)
    
    # Has fireplace (binary feature)
    df['HasFireplace'] = (df['Fireplaces'] > 0).astype(int)
    
    # Overall quality * overall condition interaction
    df['OverallGrade'] = df['OverallQual'] * df['OverallCond']
    
    return df

# Apply feature engineering to both datasets
train_df_enhanced = add_features(train_df)
test_df_enhanced = add_features(test_df)

# Split data FIRST to avoid leakage
X_train, X_test, y_train, y_test = train_test_split(
    train_df_enhanced.drop('SalePrice', axis=1),
    train_df_enhanced['SalePrice'],
    test_size=0.2,
    random_state=42
)

# Feature selection
categorical_cols = [cname for cname in X_train.columns
                    if X_train[cname].dtype == "object" 
                    and cname != 'Id']  # Include ALL categorical features

numerical_cols = [cname for cname in X_train.columns
                  if X_train[cname].dtype in ['int64', 'float64']
                  and cname != 'Id']  # Include ALL numerical features

cols = categorical_cols + numerical_cols

print(f"Selected {len(categorical_cols)} categorical features: {categorical_cols[:5]}...")
print(f"Selected {len(numerical_cols)} numerical features: {numerical_cols[:5]}...")
print(f"Total features: {len(cols)}")

# Processed data
train = X_train[cols].copy()
valid = X_test[cols].copy()
test = test_df_enhanced[cols].copy() 

numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Model pipeline
model = XGBRegressor(random_state=42)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Hyperparameter tuning
param_dist = {
    'model__n_estimators': randint(100, 1000),
    'model__learning_rate': uniform(0.01, 0.19),
    'model__max_depth': randint(3, 9),
    'model__reg_alpha': uniform(0, 10),
    'model__reg_lambda': uniform(1, 100),
    'model__subsample': uniform(0.7, 0.3),
    'model__colsample_bytree': uniform(0.7, 0.3),
}

random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1,
    random_state=42,
    error_score='raise'
)

# Fit and validate
random_search.fit(train, y_train)
best_model = random_search.best_estimator_
val_predictions = best_model.predict(valid)
val_mae = mean_absolute_error(y_test, val_predictions)
print("Validation MAE:", val_mae)

# Final submission
test_predictions = best_model.predict(test)
output = pd.DataFrame({'Id': test_df["Id"], 'SalePrice': test_predictions})
output.to_csv('submission.csv', index=False)

Selected 43 categorical features: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour']...
Selected 46 numerical features: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond']...
Total features: 89
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  return op(a, b)
  return op(a, b)


Validation MAE: 15797.611488655823


In [4]:
# Read data
train_df = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
test_df = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")

# Clean and split
train_df.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = train_df['SalePrice']
train_df.drop(['SalePrice'], axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(train_df, y, train_size=0.8, test_size=0.2, random_state=42)

# Select features
categorical_cols = [cname for cname in X_train.columns if
                    X_train[cname].nunique() < 10 and 
                    X_train[cname].dtype == "object"]

numerical_cols = [cname for cname in X_train.columns if 
                  X_train[cname].dtype in ['int64', 'float64']]

cols = categorical_cols + numerical_cols

train = X_train[cols].copy()
valid = X_test[cols].copy()
test = test_df.copy()  # ✅ don't drop columns here

# Pipelines
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Model pipeline
model = XGBRegressor(random_state=42)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Hyperparameter tuning
param_dist = {
    'model__n_estimators': randint(100, 1000),
    'model__learning_rate': uniform(0.01, 0.19),
    'model__max_depth': randint(3, 9),
    'model__reg_alpha': uniform(0, 10),
    'model__reg_lambda': uniform(1, 100),
    'model__subsample': uniform(0.7, 0.3),
    'model__colsample_bytree': uniform(0.7, 0.3),
}

random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1,
    random_state=42,
    error_score='raise'
)

# Fit model
random_search.fit(train, y_train)

# Evaluate
best_model = random_search.best_estimator_
val_predictions = best_model.predict(valid)
val_mae = mean_absolute_error(y_test, val_predictions)
print("Validation MAE:", val_mae)

# Predict on Kaggle test set
test_predictions = best_model.predict(test)

# Create submission
output = pd.DataFrame({'Id': test_df["Id"], 'SalePrice': test_predictions})
output.to_csv('submission5.csv', index=False)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Validation MAE: 16943.706897474316
