In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Read the training and test tables from CSV files in the working directory.
# Both frames are kept at module level because every later step mutates or
# reads them by name.
df_train, df_test = map(pd.read_csv, ("train.csv", "test.csv"))

# Preprocessing
# Fill missing values in every column that the two frames share. Columns found
# only in the training frame (such as the target) are left untouched.
for col in df_train.columns:
    if col not in df_test.columns:
        continue

    # Choose the filler from the training column's dtype and apply the same
    # filler to the test column so the two stay consistent.
    # `is_numeric_dtype` is used instead of comparing the dtype to "object":
    # pandas' dedicated string dtype (the default for text columns in recent
    # pandas releases) is not equal to "object", so that comparison would hand
    # the numeric filler 0 to text columns.
    if pd.api.types.is_numeric_dtype(df_train[col]):
        fill_value = 0
    else:
        fill_value = "None"

    df_train[col] = df_train[col].fillna(fill_value)
    df_test[col] = df_test[col].fillna(fill_value)


# Encode categorical variables
# Every non-numeric training column is replaced by integer codes from a
# LabelEncoder. The fitted encoder for each column is kept in `label_encoders`
# (keyed by column name) so the mapping can be inspected or inverted later.
label_encoders = {}
for col in df_train.columns:
    # Numeric columns need no encoding. `is_numeric_dtype` also keeps working
    # when text columns use pandas' string dtype rather than "object".
    if pd.api.types.is_numeric_dtype(df_train[col]):
        continue

    # A text column may exist only in the training frame; indexing
    # df_test[col] unconditionally would raise KeyError in that case.
    frames = [df_train]
    if col in df_test.columns:
        frames.append(df_test)

    # Missing values become the literal category "None" (computed once per
    # frame and reused for both fitting and transforming).
    filled = [frame[col].fillna("None") for frame in frames]

    le = LabelEncoder()
    # Fit on the values of all available frames so that categories appearing
    # only in the test data still receive a code.
    le.fit(pd.concat(filled, axis=0))

    for frame, values in zip(frames, filled):
        frame[col] = le.transform(values)
    label_encoders[col] = le

# Fail fast when the training frame carries no target column.
if "SalePrice" not in df_train:
    raise ValueError("The 'SalePrice' column is missing from the training dataset.")

# Feature and target separation
# The target is modelled on a log1p scale; predictions therefore have to be
# mapped back with expm1 before they are reported as prices.
target = df_train["SalePrice"]
y = np.log1p(target)

# "Id" is a row identifier rather than a feature, so it is removed from both
# feature matrices; the target is additionally removed from the training one.
X = df_train.drop(["Id", "SalePrice"], axis=1)
X_test = df_test.drop("Id", axis=1)

# Hold out 20% of the training rows for validation; the fixed seed makes the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random Forest baseline (only the seed is set; all other hyperparameters are
# left at the library defaults).
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score on the held-out rows. `y` is the log1p-transformed target, so this
# RMSE is measured on the log scale rather than in price units.
val_preds_rf = rf.predict(X_val)
mse_rf = mean_squared_error(y_val, val_preds_rf)
rmse_rf = np.sqrt(mse_rf)
print(f"Random Forest RMSE: {rmse_rf:.4f}")

# Gradient Boosting baseline, trained on the same split as the Random Forest
# so the two validation scores are directly comparable. Only the seed is set;
# all other hyperparameters are left at the library defaults.
gb = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

# Validation RMSE, computed on the same (log-scale) target used for training.
val_preds_gb = gb.predict(X_val)
mse_gb = mean_squared_error(y_val, val_preds_gb)
rmse_gb = np.sqrt(mse_gb)
print(f"Gradient Boosting RMSE: {rmse_gb:.4f}")

# Final model prediction
# The gradient-boosting model (fitted on the training split only) scores the
# test features. Its output is on the log1p scale of the target, so expm1
# converts it back to price units.
final_preds = gb.predict(X_test)
final_preds_exp = np.expm1(final_preds)

# Create submission file: one row per test record, pairing the original "Id"
# with its predicted price. The index is not written to the CSV.
submission = df_test[["Id"]].assign(SalePrice=final_preds_exp)
submission.to_csv("submission.csv", index=False)

print("Submission file created: submission.csv")


Random Forest RMSE: 0.1448
Gradient Boosting RMSE: 0.1412
Submission file created: submission.csv
