In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer

# Step 1: Import train.csv and test.csv
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Step 2: Store the original test ID for reference later
test_ids = test['Id']

# Step 3: Concatenate the train and test datasets
# Create a new column 'Source' to distinguish between train and test data
train['Source'] = 1
test['Source'] = 0
combined = pd.concat([train, test], axis=0, ignore_index=True)

# Step 4: Drop features with more than 500 null values
null_values = combined.isnull().sum()
features_to_drop = null_values[null_values > 500].index
print(f"\nDropping features with more than 500 null values: {features_to_drop.tolist()}")
combined.drop(columns=features_to_drop, inplace=True)

# Step 5: Handle missing values
numeric_cols = combined.select_dtypes(include=['number']).columns
categorical_cols = combined.select_dtypes(include=['object']).columns

# Optionally, create missing indicators
for col in categorical_cols:
    combined[col + '_missing'] = combined[col].isnull().astype(int)
for col in numeric_cols:
    combined[col + '_missing'] = combined[col].isnull().astype(int)

# Fill missing values
combined[numeric_cols] = combined[numeric_cols].fillna(combined[numeric_cols].median())
combined[categorical_cols] = combined[categorical_cols].fillna(combined[categorical_cols].mode().iloc[0])

# Step 6: Feature Engineering (optional)
# Example: Total Square Footage and House Age
combined['TotalSF'] = combined['TotalBsmtSF'] + combined['GrLivArea']
combined['HouseAge'] = combined['YrSold'] - combined['YearBuilt']

# Log transform skewed features
skewed_features = ['GrLivArea', 'TotalBsmtSF', '1stFlrSF', 'TotalSF']
for feature in skewed_features:
    if feature in combined.columns:
        combined[feature] = np.log1p(combined[feature])

# Step 7: Encode categorical variables
# Use Target Encoding for categorical variables
encoder = TargetEncoder(cols=categorical_cols)
combined[categorical_cols] = encoder.fit_transform(combined[categorical_cols], combined['SalePrice'])

# Step 8: One-hot encoding (if applicable, else Target Encoding might suffice)
# If you still want to use one-hot for some categorical features with few categories
# combined = pd.get_dummies(combined, columns=categorical_cols, drop_first=True)

# Alternatively, keep target-encoded values as numerical.

# Step 9: Split combined data back into train and test sets
train_data = combined[combined['Source'] == 1].drop(['Source', 'Id'], axis=1)
test_data = combined[combined['Source'] == 0].drop(['Source', 'Id', 'SalePrice'], axis=1)

# Separate features and target
X = train_data.drop('SalePrice', axis=1)
y = np.log1p(train_data['SalePrice'])

# Step 10: Feature Selection (optional)
# Optionally perform feature selection here

# Step 11: Split the training data into train and validation sets using K-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
    
    # Define base models
    estimators = [
        ('xgb', XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=10, random_state=42)),
        ('lgbm', lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.05, num_leaves=31, random_state=42)),
        ('catboost', CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=10, random_seed=42, verbose=0))
    ]
    
    # Define stacking regressor
    stacking_model = StackingRegressor(estimators=estimators, final_estimator=Ridge())
    
    # Fit the model
    stacking_model.fit(X_train_fold, y_train_fold)
    
    # Predict on validation
    y_val_pred = stacking_model.predict(X_val_fold)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_val_fold, y_val_pred))
    cv_scores.append(rmse)
    print(f'Fold RMSE: {rmse}')

print(f'Average CV RMSE: {np.mean(cv_scores)}')

# Step 12: Train on entire training data
stacking_model.fit(X, y)

# Step 13: Predict on test set
y_test_pred = stacking_model.predict(test_data)

# Inverse log transform
y_test_pred = np.expm1(y_test_pred)

# Clip predictions if necessary
y_test_pred = np.clip(y_test_pred, 0, np.percentile(y_test_pred, 99))

# Step 14: Save the predictions
output = pd.DataFrame({'Id': test_ids, 'SalePrice': y_test_pred})
output.to_csv('test_predictions.csv', index=False)
print("\nPredictions saved to test_predictions.csv")



Dropping features with more than 500 null values: ['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'SalePrice']


  combined[col + '_missing'] = combined[col].isnull().astype(int)
  combined[col + '_missing'] = combined[col].isnull().astype(int)
  combined[col + '_missing'] = combined[col].isnull().astype(int)
  combined[col + '_missing'] = combined[col].isnull().astype(int)
  combined['TotalSF'] = combined['TotalBsmtSF'] + combined['GrLivArea']
  combined['HouseAge'] = combined['YrSold'] - combined['YearBuilt']


KeyError: 'SalePrice'