In [11]:
!pip install xgboost lightgbm --quiet

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb

In [3]:
# Use gdown to download from Google Drive
!gdown --id 128XLT44uu9bVSBPd6JiiYyv_M4LJP-3x -O train.csv
!gdown --id 1JMUa5MIwyx3QEXk3KKt_u0AeMIDUjJUx -O test.csv

# Load into pandas
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Check first rows
print(train.head())
print(test.head())


Downloading...
From: https://drive.google.com/uc?id=128XLT44uu9bVSBPd6JiiYyv_M4LJP-3x
To: /content/train.csv
100% 2.32M/2.32M [00:00<00:00, 215MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JMUa5MIwyx3QEXk3KKt_u0AeMIDUjJUx
To: /content/test.csv
100% 452k/452k [00:00<00:00, 125MB/s]
     house_id        sale_date  num_bedrooms  num_bathrooms  living_area  \
0  8902000050  20141027T000000             3           1.75       1720.0   
1  4325700085  20150325T000000             3           1.00       1310.0   
2  7732410420  20140617T000000             3           2.50       2590.0   
3  4039701280  20150408T000000             3           2.25       2440.0   
4  5379805120  20150424T000000             2           1.00        740.0   

   lot_area  num_floors  is_waterfront  view_rating condition_index  ...  \
0    7200.0         1.0              0          0.0               3  ...   
1    8514.0         1.0              0          0.0               4  ...   
2    7720.0         

In [4]:
# Keep the test identifiers so predictions can be matched back to houses later
test_ids = test['house_id']

# Regression target
y = train['target_price']

# Model inputs: everything except the identifier, the raw sale-date string,
# and (for the training frame) the target itself
X = train.drop(columns=['house_id', 'sale_date', 'target_price'])
X_test = test.drop(columns=['house_id', 'sale_date'])


In [6]:
# Separate numeric and categorical columns (decided from the training features)
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Fill missing numeric values with medians learned from the TRAINING data only,
# and apply those same medians to the test set so both splits are imputed
# consistently (the test set must not supply its own statistics).
train_medians = X[numeric_cols].median()
X[numeric_cols] = X[numeric_cols].fillna(train_medians)
X_test[numeric_cols] = X_test[numeric_cols].fillna(train_medians)

# Missing categories become an explicit 'Unknown' level
X[categorical_cols] = X[categorical_cols].fillna('Unknown')
X_test[categorical_cols] = X_test[categorical_cols].fillna('Unknown')

# One-hot encode categorical columns
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)

# Align test set columns to train set: categories absent from the test set get
# an all-zero column, categories unseen in training are dropped
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# Standardize every feature column (numeric features and one-hot dummies alike).
# NOTE(review): the scaler is fit on all training rows, i.e. before the
# train/validation split, so validation rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)


In [7]:
# Hold out 20% of the scaled training rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [9]:
# Candidate regressors to compare, keyed by the name used in reports
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=200, random_state=42, learning_rate=0.1),
    "LightGBM": lgb.LGBMRegressor(n_estimators=200, random_state=42),
}

# Maps model name -> [RMSE, MAE, R2] measured on the validation split
results = {}

for model_name, estimator in models.items():
    # Fit on the training split, then score on the held-out validation split
    estimator.fit(X_train, y_train)
    val_preds = estimator.predict(X_val)

    scores = [
        np.sqrt(mean_squared_error(y_val, val_preds)),  # RMSE
        mean_absolute_error(y_val, val_preds),
        r2_score(y_val, val_preds),
    ]
    results[model_name] = scores
    rmse, mae, r2 = scores

    print(f"\n🔹 {model_name} Results")
    print(f"RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.4f}")




🔹 LinearRegression Results
RMSE: 1661393.78, MAE: 150649.04, R2: 0.0255

🔹 RandomForest Results
RMSE: 1692157.69, MAE: 111351.55, R2: -0.0109

🔹 XGBoost Results
RMSE: 1792768.10, MAE: 113124.71, R2: -0.1347
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010281 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2519
[LightGBM] [Info] Number of data points in the train set: 14523, number of used features: 119
[LightGBM] [Info] Start training from score 559588.063738

🔹 LightGBM Results
RMSE: 1757473.79, MAE: 174855.91, R2: -0.0905




In [10]:
# One row per model, one column per validation metric
metrics_df = pd.DataFrame.from_dict(results, orient='index', columns=['RMSE', 'MAE', 'R2'])
print("\n===== Model Performance Summary =====")
print(metrics_df)



===== Model Performance Summary =====
                          RMSE            MAE        R2
LinearRegression  1.661394e+06  150649.040832  0.025504
RandomForest      1.692158e+06  111351.545696 -0.010919
XGBoost           1.792768e+06  113124.712234 -0.134705
LightGBM          1.757474e+06  174855.913997 -0.090467


In [18]:
# Final model: a random forest refit on ALL labelled rows (train + validation).
# RandomForestRegressor is already imported in the notebook's import cell, so it
# is not re-imported here.
# NOTE(review): this choice is hard-coded. In the validation summary RandomForest
# had the lowest MAE, while LinearRegression had the lowest RMSE / highest R2 —
# confirm which metric should drive the selection.
best_model = RandomForestRegressor(n_estimators=200, random_state=42)
best_model.fit(X_scaled, y)
print("Best Model:", best_model)

# Predict on the (identically preprocessed) test set
test_preds = best_model.predict(X_test_scaled)

# Sanity-check a handful of predictions
print("First 10 predictions:", test_preds[:10])


Best Model: RandomForestRegressor(n_estimators=200, random_state=42)
First 10 predictions: [ 384599.21159407  884305.99470484 1133614.21039871 1942607.87556605
  686903.69618977  250626.45701173  802396.40176562  640838.96757956
  410319.13463041  545774.45776265]


In [14]:
# Pair each test house id with its predicted price and write the submission file
submission = test_ids.to_frame(name="house_id").assign(predicted_price=test_preds)
submission.to_csv("EM20_QudraLisa_Task2_HousePrice.csv", index=False)
print(submission.head())

     house_id  predicted_price
0  2591820310     3.845992e+05
1  7974200820     8.843060e+05
2  7701450110     1.133614e+06
3  9522300010     1.942608e+06
4  9510861140     6.869037e+05


In [15]:
# Offer the submission CSV as a browser download when running in Google Colab.
# The google.colab package only exists inside Colab, so guard the import and
# fall back to reporting the file name in any other environment.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Download the submission CSV
    files.download("EM20_QudraLisa_Task2_HousePrice.csv")
else:
    print("Not running in Colab; submission file: EM20_QudraLisa_Task2_HousePrice.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>