In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from scipy.stats import randint, uniform

# Step 1: Import train.csv and test.csv
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Store the original test ID for reference later
test_ids = test.index

# Step 2: Concatenate the train and test datasets
# Create a new column 'Source' to distinguish between train and test data
train['Source'] = 1
test['Source'] = 0

# Concatenate train and test data
combined = pd.concat([train, test], axis=0, ignore_index=True)

# Step 3: Check for Null Values and analyze it in combined data
null_values = combined.isnull().sum()
print("Null Values in each column:\n", null_values[null_values > 0])
print("\nTotal Null Values: ", null_values.sum())

# Separate numeric and categorical columns
numeric_cols = combined.select_dtypes(include=['number']).columns
categorical_cols = combined.select_dtypes(include=['object']).columns

# Fill missing values for numeric columns with the median
combined[numeric_cols] = combined[numeric_cols].fillna(combined[numeric_cols].median())

# Fill missing values for categorical columns with the mode (most frequent value)
for col in categorical_cols:
    mode_val = combined[col].mode()[0]  # Get the most frequent value
    combined[col].fillna(mode_val, inplace=True)

# Drop features with more than 500 null values
null_values = combined.isnull().sum()
features_to_drop = null_values[null_values > 500].index
print(f"\nDropping features with more than 500 null values: {features_to_drop.tolist()}")
combined.drop(columns=features_to_drop, inplace=True)

# Update categorical columns after dropping features
categorical_cols = combined.select_dtypes(include=['object']).columns

# Step 4: One-hot encoding for each string-based feature
# Apply one-hot encoding to categorical variables
combined = pd.get_dummies(combined, columns=categorical_cols, drop_first=True)

print("Data after one-hot encoding:\n", combined.head())

# Step 5: Split combined data back into train and test sets
train_data = combined[combined['Source'] == 1].drop('Source', axis=1)
test_data = combined[combined['Source'] == 0].drop('Source', axis=1)

# Ensure the columns match exactly between train and test data
X = train_data.drop(columns=['SalePrice'])
y = train_data['SalePrice']

# Step 6: Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Define base models with Random Search for hyperparameter tuning
# Define the hyperparameter grid for each model
xgb_param_grid = {
    'n_estimators': randint(500, 1500),
    'learning_rate': uniform(0.01, 0.1),
    'max_depth': randint(3, 15)
}

lgb_param_grid = {
    'n_estimators': randint(500, 1500),
    'learning_rate': uniform(0.01, 0.1),
    'num_leaves': randint(20, 40)
}

cat_param_grid = {
    'iterations': randint(500, 1500),
    'learning_rate': uniform(0.01, 0.1),
    'depth': randint(5, 15)
}

# Initialize the models
xgb = XGBRegressor(random_state=42)
lgbm = lgb.LGBMRegressor(random_state=42)
catboost = CatBoostRegressor(random_seed=42, verbose=0)

# Initialize RandomizedSearchCV for each model
xgb_random_search = RandomizedSearchCV(estimator=xgb, param_distributions=xgb_param_grid, 
                                       n_iter=10, cv=5, verbose=1, random_state=42, n_jobs=-1)
lgbm_random_search = RandomizedSearchCV(estimator=lgbm, param_distributions=lgb_param_grid, 
                                        n_iter=10, cv=5, verbose=1, random_state=42, n_jobs=-1)
catboost_random_search = RandomizedSearchCV(estimator=catboost, param_distributions=cat_param_grid, 
                                            n_iter=10, cv=5, verbose=1, random_state=42, n_jobs=-1)

# Fit Random Search for each model
xgb_random_search.fit(X_train, y_train)
lgbm_random_search.fit(X_train, y_train)
catboost_random_search.fit(X_train, y_train)

# Retrieve the best models
best_xgb = xgb_random_search.best_estimator_
best_lgbm = lgbm_random_search.best_estimator_
best_catboost = catboost_random_search.best_estimator_

# Step 8: Define the stacking model with the best hyperparameters
estimators = [
    ('xgb', best_xgb),
    ('lgbm', best_lgbm),
    ('catboost', best_catboost)
]

# Initialize the stacking model with Ridge as the final estimator
stacking_model = StackingRegressor(estimators=estimators, final_estimator=Ridge())

# Train the stacking model on the training data
stacking_model.fit(X_train, y_train)

# Step 9: Predict on the validation set
y_val_pred = stacking_model.predict(X_val)

# Calculate the logarithm of the predictions and the actual values
y_val_log_pred = np.log1p(y_val_pred)
y_val_log_actual = np.log1p(y_val)

# Step 10: Calculate Logarithmic RMSE and R² score
log_rmse = np.sqrt(mean_squared_error(y_val_log_actual, y_val_log_pred))
r2 = r2_score(y_val_log_actual, y_val_log_pred)

print(f"\nValidation Logarithmic RMSE: {log_rmse}")
print(f"Validation R² Score: {r2}")

# Step 11: Predict on the test set using the entire training data
stacking_model.fit(X, y)  # Re-train on the entire dataset before predicting on the test set
y_test_pred = stacking_model.predict(test_data[X_train.columns])

# Handle any extremely large or small predictions
y_test_pred = np.clip(y_test_pred, 0, np.percentile(y_test_pred, 99))

# Step 12: Save the predictions to a CSV file
output = pd.DataFrame({'Id': test_ids, 'SalePrice': y_test_pred})
output.to_csv('test_predictions.csv', index=False)

print("\nPredictions saved to test_predictions.csv")

Null Values in each column:
 MSZoning           4
LotFrontage      486
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType      1766
MasVnrArea        23
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinSF1         1
BsmtFinType2      80
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Electrical         1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageCars         1
GarageArea         1
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
SalePrice       1459
dtype: int64

Total Null Values:  17166

Dropping features with more than 500 null values: []
Data after one-hot encoding:
    Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0   1          60         65.0  

found 0 physical cores < 1
  File "c:\Users\utkar\.conda\envs\py310\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001483 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3395
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 155
[LightGBM] [Info] Start training from score 181441.541952
Fitting 5 folds for each of 10 candidates, totalling 50 fits


4 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\utkar\.conda\envs\py310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\utkar\.conda\envs\py310\lib\site-packages\catboost\core.py", line 5807, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
  File "c:\Users\utkar\.conda\envs\py310\lib\site-packages\catboost\core.py", line 2396, in _fit
    self._train(
  File "c:\Users\utkar\.conda\envs\py310\lib\site-packages\catboost\c

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001721 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3395
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 155
[LightGBM] [Info] Start training from score 181441.541952
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3175
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 147
[LightGBM] [Info] Start training from score 181121.274090
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3166
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 147
[LightGBM] [Info] S