In [27]:
import pandas as pd
import numpy as np

# Load the training and test datasets
train_data_path = 'train.csv'
test_data_path = 'test.csv'

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [28]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.base import BaseEstimator, TransformerMixin

# Custom Transformer for Feature Engineering
class FeatureEngineering(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X['TotalSF'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF']
        X['Age'] = X['YrSold'] - X['YearBuilt']
        X['YearsSinceRemodel'] = X['YrSold'] - X['YearRemodAdd']
        X['AreaRatio'] = X['GrLivArea'] / X['LotArea']
        X['LotFrontage'] = X.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
        X['MSZoning'] = X['MSZoning'].fillna(X['MSZoning'].mode()[0])

        X['Remodeled'] = (X['YearBuilt'] != X['YearRemodAdd']).astype(int)
        X['BsmtFinRatio'] = X['BsmtFinSF1'] / X['TotalBsmtSF'].replace(0, 1)
        X['GarageLotRatio'] = X['GarageArea'] / X['LotArea']
        X['HasMasVnr'] = (X['MasVnrArea'] > 0).astype(int)
        X['HasGarage'] = (X['GarageArea'] > 0).astype(int)
        X['HasBsmt'] = (X['TotalBsmtSF'] > 0).astype(int)
        X['LogLotArea'] = np.log(X['LotArea'] + 1)
        
                # Interaction features
        X['QualSF'] = X['OverallQual'] * X['TotalSF']
        
        return X
    
# The ordinal encoding for ordinal columns
ordinal_encoding = {
    'ExterQual': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'ExterCond': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtQual': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtCond': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'HeatingQC': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'KitchenQual': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'FireplaceQu': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageQual': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageCond': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'PoolQC': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
    # Add other ordinal columns with their respective order if needed
}
# Define columns
numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.drop('SalePrice')
categorical_cols = train_data.select_dtypes(include=['object']).columns
ordinal_cols = ['ExterQual', 
                'ExterCond', 
                'BsmtQual', 
                'BsmtCond', 
                'HeatingQC', 
                'KitchenQual', 
                'FireplaceQu', 
                'GarageQual', 
                'GarageCond', 
                'PoolQC'
               ]

# Pipeline for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer()), # Using IterativeImputer
    ('scaler', RobustScaler())
])

# Pipeline for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Pipeline for ordinal features
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[ordinal_encoding[col] for col in ordinal_cols]))
])

# Combining transformers into a ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
    ('ord', ordinal_transformer, ordinal_cols)
])

# Adjust the feature selection step
full_pipeline = Pipeline(steps=[
    ('feature_eng', FeatureEngineering()),
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(f_regression, k='all'))
])

# Separate the target variable in training data
y_train = train_data['SalePrice']
X_train = train_data.drop('SalePrice', axis=1)

# Apply the full pipeline to the training and test data
X_train_processed = full_pipeline.fit_transform(X_train, y_train)
X_test_processed = full_pipeline.transform(test_data)



In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import OneHotEncoder

# Custom RMSE log scorer function
def rmse_log(y_true, y_pred):
    y_true = np.where(y_true <= 0, np.finfo(float).eps, y_true)
    y_pred = np.where(y_pred <= 0, np.finfo(float).eps, y_pred)
    return np.sqrt(mean_squared_error(np.log(y_true), np.log(y_pred)))

# Split the data into features and target variable
X = X_train_processed
y = y_train

# Split the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Initialize and train the models
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(
                                    n_estimators=3000, 
                                    learning_rate=0.05, 
                                    max_depth=4,
                                    max_features='sqrt',
                                    min_samples_leaf=15,
                                    min_samples_split=10,
                                    loss='huber',
                                    random_state=42)
cb_model = CatBoostRegressor(
                                    iterations=3000,
                                    learning_rate=0.05,
                                    depth=4,
                                    loss_function='RMSE',
                                    random_state=42,
                                    l2_leaf_reg=3,
                                    border_count=128,
                                    bagging_temperature=1,
                                    early_stopping_rounds=50,
                                    boosting_type='Ordered',
                                    verbose=False
)


rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
cb_model.fit(X_train, y_train)

# Predict and evaluate
rf_pred = rf_model.predict(X_test)
gb_pred = gb_model.predict(X_test)
cb_pred = cb_model.predict(X_test)

rf_rmse_log_score = rmse_log(y_test, rf_pred)
gb_rmse_log_score = rmse_log(y_test, gb_pred)
cb_rmse_log_score = rmse_log(y_test, cb_pred)

# Print the RMSE log scores
print("Random Forest RMSE Log Score:", rf_rmse_log_score)
print("Gradient Boosting RMSE Log Score:", gb_rmse_log_score)
print("CatBoost RMSE Log Score:", cb_rmse_log_score)


Random Forest RMSE Log Score: 0.15172760646172523
Gradient Boosting RMSE Log Score: 0.1369481443849903
CatBoost RMSE Log Score: 0.12593825277924778


In [25]:
# Assuming test_data contains an 'Id' column for identification
test_ids = test_data['Id']

# Predict house prices using the trained models
rf_test_pred = rf_model.predict(X_test_processed)
gb_test_pred = gb_model.predict(X_test_processed)
cb_test_pred = cb_model.predict(X_test_processed)

# Average predictions from both models (or you can choose one)
average_pred = (cb_test_pred)
# Create a DataFrame for submission
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': average_pred
})

# Save the DataFrame to a CSV file
submission.to_csv('house_price_predictions.csv', index=False)

# Provide the path for downloading the file
csv_file_path = 'house_price_predictions.csv'
csv_file_path


'house_price_predictions.csv'