# In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load datasets
# Paths to the training data, the test data and the sample submission template.
train_path = "/mnt/data/train.csv"
test_path = "/mnt/data/test.csv"
submission_path = "/mnt/data/sample_submission.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Handle missing values (fill with median)
# numeric_only=True: since pandas 2.0 DataFrame.median() raises TypeError when
# a non-numeric column is present, so restrict the statistic to numeric columns.
train_medians = train_df.median(numeric_only=True)

# Impute both frames with the *training* medians so that the test set is
# preprocessed with the same statistics the models are trained on (the scaler
# below is likewise fitted on training data only). fillna() with a Series
# matches on column name, so medians for columns absent from test_df (such as
# the target) are simply ignored.
train_df = train_df.fillna(train_medians)
test_df = test_df.fillna(train_medians)

# Separate features and target
# NOTE(review): every column other than the target is used as a feature; if
# train.csv carries a row-identifier column it is included too — confirm that
# is intended.
y = train_df['FloodProbability']
X = train_df.drop(columns=['FloodProbability'])

# Train-test split (80% train, 20% validation)
# Split BEFORE scaling so the scaler's mean/variance are estimated from the
# training rows only; fitting on all rows leaks validation data into the
# preprocessing and makes the validation scores optimistic.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Full training matrix under the same transform; kept so that any other code
# referring to X_scaled continues to work.
X_scaled = scaler.transform(X)

# Select the test columns by the training feature names so the column set and
# order passed to the scaler always match what it was fitted on.
test_scaled = scaler.transform(test_df[X.columns])

### Model 1: XGBoost
# Hyper-parameters are collected in one mapping and unpacked into the
# estimator; the model is fitted on the training split and scored on the
# validation split with MSE and R².
xgb_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

xgb_preds = xgb_model.predict(X_val)
xgb_mse, xgb_r2 = (
    mean_squared_error(y_val, xgb_preds),
    r2_score(y_val, xgb_preds),
)

### Model 2: LightGBM
# Fit a LightGBM regressor on the training split and score it on the
# validation split with MSE and R².
#
# subsample_freq=1: LightGBM only performs row subsampling (bagging) when the
# bagging frequency is non-zero, and the scikit-learn wrapper defaults
# subsample_freq to 0. Without it, subsample=0.8 is silently ignored and every
# tree sees all rows.
lgb_model = lgb.LGBMRegressor(n_estimators=500, learning_rate=0.05, max_depth=6,
                              subsample=0.8, subsample_freq=1,
                              colsample_bytree=0.8, random_state=42)
lgb_model.fit(X_train, y_train)
lgb_preds = lgb_model.predict(X_val)
lgb_mse = mean_squared_error(y_val, lgb_preds)
lgb_r2 = r2_score(y_val, lgb_preds)

### Model 3: Random Forest
# Fit a random forest on the training split and score it on the validation
# split with MSE and R².
#
# n_jobs=-1: scikit-learn builds the trees on a single core by default, which
# makes a 500-tree forest by far the slowest step here, while XGBoost and
# LightGBM already use all cores. Parallel fitting does not change the
# resulting model for a fixed random_state.
rf_model = RandomForestRegressor(n_estimators=500, max_depth=10,
                                 random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_val)
rf_mse = mean_squared_error(y_val, rf_preds)
rf_r2 = r2_score(y_val, rf_preds)

# Report the validation metrics of the three fitted models.
print(f"🔹 XGBoost MSE: {xgb_mse:.5f}, R² Score: {xgb_r2:.5f}")
print(f"🔹 LightGBM MSE: {lgb_mse:.5f}, R² Score: {lgb_r2:.5f}")
print(f"🔹 Random Forest MSE: {rf_mse:.5f}, R² Score: {rf_r2:.5f}")

# Pick the model with the highest validation R².
# XGBoost wins only when it is strictly better than both others; otherwise
# LightGBM wins when strictly better than Random Forest; Random Forest is the
# fallback (so ties resolve towards the later model).
best_name = (
    "XGBoost" if (xgb_r2 > lgb_r2 and xgb_r2 > rf_r2)
    else "LightGBM" if lgb_r2 > rf_r2
    else "Random Forest"
)
models_by_name = {
    "XGBoost": xgb_model,
    "LightGBM": lgb_model,
    "Random Forest": rf_model,
}
best_model = models_by_name[best_name]

print(f"✅ Best Model: {best_name}")

# Score the scaled test set with the selected model, then clamp the raw
# outputs into [0, 1] because the regressors are not bounded.
raw_test_preds = best_model.predict(test_scaled)
final_preds = np.clip(raw_test_preds, 0, 1)

# Load the sample submission, write the predictions into its
# 'FloodProbability' column and save the result without the index.
submission = pd.read_csv(submission_path).assign(FloodProbability=final_preds)
submission.to_csv("submission.csv", index=False)
print("📁 Submission file saved as submission.csv!")


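# Recorded cell output:
#   FileNotFoundError: [Errno 2] No such file or directory: 'house_prices.csv'
# NOTE(review): 'house_prices.csv' is not referenced anywhere in the code above,
# so this output presumably comes from a different cell or an earlier run — confirm.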