In [17]:
#!/usr/bin/env python
# coding: utf-8

# Import libraries
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_log_error
import xgboost as xgb
import lightgbm as lgb

warnings.filterwarnings('ignore')

# In[1]: Load datasets
training_data = pd.read_csv("my_train.csv", sep=',')
validation_data = pd.read_csv("my_dev.csv", sep=',')

# In[2]: Prepare train and validation features
# Separate target variable and features in training data
target_train = training_data['SalePrice']
features_train = training_data.drop(['Id', 'SalePrice'], axis=1)

# Separate target variable and features in validation data
target_validation = validation_data['SalePrice']
features_validation = validation_data.drop(['Id', 'SalePrice'], axis=1)

# In[3]: Handle missing values
# Fill specific columns with 0 in training data
train_fill_columns = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
features_train[train_fill_columns] = features_train[train_fill_columns].fillna(0)

# Fill specific columns with 0 in validation data
validation_fill_columns = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
features_validation[validation_fill_columns] = features_validation[validation_fill_columns].fillna(0)

# In[4]: Identify numerical and categorical columns
numerical_features = list(features_train.select_dtypes(include=[np.number]).columns)
categorical_features = list(features_train.select_dtypes(exclude=[np.number]).columns)

# Ensure 'MSSubClass' is treated as categorical
column_to_convert = 'MSSubClass'
if column_to_convert in numerical_features:
    numerical_features.remove(column_to_convert)
    categorical_features.append(column_to_convert)

# Convert categorical columns to string type
features_train[categorical_features] = features_train[categorical_features].astype(str)
features_validation[categorical_features] = features_validation[categorical_features].astype(str)

# In[5]: Define preprocessor
numerical_processor = 'passthrough'
categorical_processor = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

preprocessor = ColumnTransformer([
    ('num', numerical_processor, numerical_features),
    ('cat', categorical_processor, categorical_features)
])

# Fit and transform data
preprocessor.fit(features_train)
processed_features_train = preprocessor.transform(features_train)
processed_features_validation = preprocessor.transform(features_validation)

# In[6]: Prepare data for training and validation
X_train = processed_features_train
y_train = target_train

X_validation = processed_features_validation
y_validation = target_validation

# Apply log transformation to target
log_target_train = np.log1p(y_train)

# In[7]: Polynomial Features for Important Columns
important_features = [
    'EnclosedPorch', 'GarageYrBlt', 'KitchenAbvGr', 'BsmtHalfBath', 'MSSubClass',
    'WoodDeckSF', 'GarageArea', 'BsmtFullBath', '1stFlrSF', 'BsmtFinSF1', 
    'YearRemodAdd', 'YearBuilt', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond'
]
important_numerical_features = [col for col in numerical_features if col in important_features]

poly_features = PolynomialFeatures(degree=2, include_bias=False)
train_poly = poly_features.fit_transform(features_train[important_numerical_features])
validation_poly = poly_features.transform(features_validation[important_numerical_features])

X_train_poly = np.concatenate((X_train, train_poly), axis=1)
X_validation_poly = np.concatenate((X_validation, validation_poly), axis=1)

# In[8]: Train and Evaluate Multiple Models
# Define models
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42),
    "LightGBM": lgb.LGBMRegressor(n_estimators=200, learning_rate=0.1, random_state=42),
    "K-Nearest Neighbors": KNeighborsRegressor(n_neighbors=5)
}

# Train and predict with each model
results = {}
for name, model in models.items():
    if name in ["Ridge Regression"]:  # Uses polynomial features
        model.fit(X_train_poly, log_target_train)
        predictions_log = model.predict(X_validation_poly)
    else:  # Uses original features
        model.fit(X_train, y_train)
        predictions_log = np.log1p(model.predict(X_validation))
    
    predictions = np.exp(predictions_log) - 1
    rmsle = np.sqrt(mean_squared_log_error(y_validation, predictions))
    results[name] = rmsle

# Print RMSLE for each model
print("\nModel Performance (RMSLE):")
for model_name, score in results.items():
    print(f"{model_name}: {score:.5f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3317
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 210
[LightGBM] [Info] Start training from score 182159.048706

Model Performance (RMSLE):
Random Forest: 0.14001
Gradient Boosting: 0.12721
XGBoost: 0.13002
LightGBM: 0.12590
K-Nearest Neighbors: 0.20266


In [None]:
#LIGHTGBM

In [33]:
#!/usr/bin/env python
# coding: utf-8

# Import necessary libraries
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
import lightgbm as lgb

warnings.filterwarnings('ignore')

# Load datasets
train_data = pd.read_csv("my_train.csv")
test_set = pd.read_csv("test.csv")

# Prepare train and test features
labels = train_data['SalePrice']
train_features = train_data.drop(['Id', 'SalePrice'], axis=1)
test_features = test_set.drop(['Id'], axis=1)

# Handle missing values
columns_to_replace_train_features = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
train_features[columns_to_replace_train_features] = train_features[columns_to_replace_train_features].fillna(0)

columns_to_replace_test = [
    'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 
    'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 
    'GarageArea', 'GarageYrBlt'
]
test_features[columns_to_replace_test] = test_features[columns_to_replace_test].fillna(0)

# Identify numerical and categorical columns
numeric_columns = list(train_features.select_dtypes(include=[np.number]).columns)
categorical_columns = list(train_features.select_dtypes(exclude=[np.number]).columns)

# Ensure 'MSSubClass' is treated as categorical
column_to_move = 'MSSubClass'
if column_to_move in numeric_columns:
    numeric_columns.remove(column_to_move)
    categorical_columns.append(column_to_move)

# Convert categorical columns to string type
train_features[categorical_columns] = train_features[categorical_columns].astype(str)
test_features[categorical_columns] = test_features[categorical_columns].astype(str)

# Define preprocessor
num_processor = 'passthrough'
cat_processor = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

preprocessor = ColumnTransformer([
    ('num', num_processor, numeric_columns),
    ('cat', cat_processor, categorical_columns)
])

# Fit and transform data
preprocessor.fit(train_features)
train_processed_data = preprocessor.transform(train_features)
test_processed_data = preprocessor.transform(test_features)

# Add polynomial features for selected important features
important_features = [
    'GarageArea', 'KitchenAbvGr', 'BsmtFullBath', '1stFlrSF', 
    'BsmtFinSF1', 'YearRemodAdd', 'YearBuilt', 'LotFrontage', 
    'LotArea', 'OverallQual', 'OverallCond'
]
important_numeric_columns = [col for col in numeric_columns if col in important_features]

poly = PolynomialFeatures(degree=2, include_bias=False)
train_poly_features = poly.fit_transform(train_features[important_numeric_columns])
test_poly_features = poly.transform(test_features[important_numeric_columns])

X_train_poly = np.concatenate((train_processed_data, train_poly_features), axis=1)
X_test_poly = np.concatenate((test_processed_data, test_poly_features), axis=1)

# Log-transform the target variable
log_of_label = np.log(labels)

lgb_model = lgb.LGBMRegressor(
    max_depth=4,              # Limit tree depth
    max_bin=255,              # Reduce the number of bins
    n_estimators=200,         # Increase estimators
    learning_rate=0.1,        # Moderate learning rate
    feature_fraction=0.8,     # Use 80% of features for each split
    bagging_fraction=0.8,     # Use 80% of data for each iteration
    bagging_freq=5,           # Perform bagging every 5 iterations
    random_state=42
)

lgb_model.fit(X_train_poly, log_of_label)

# Predict on test set
log_of_test_label = lgb_model.predict(X_test_poly)
final_price_predictions = np.exp(log_of_test_label)

# Create submission file
submission_df = pd.DataFrame({'Id': test_set['Id'], 'SalePrice': final_price_predictions})
submission_df.to_csv('C:/Users/badhe/Downloads/hw3-data/LightGBM_Predictions.csv', index=False)

print("Predictions saved to 'LightGBM_Predictions.csv'")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16748
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 287
[LightGBM] [Info] Start training from score 12.029784
Predictions saved to 'LightGBM_Predictions.csv'
