In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

np.random.seed(2137)

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
potential_features = ['market_volatility',
 'dim_m2',
 'loc_code_693f303c',
 'n_rooms',
 'estimated_maintenance_cost',
 'year_built',
 'n_poi',
 'obj_type_2a6d5c01',
 'has_lift',
 '2010_2020',
 'loc_code_378f340c',
 'dist_centre',
 'has_park',
 'has_sec', 
 'infrastructure_quality',
 'floor_max',
 'src_year_2024',
 '2000_2010',
 'after_2020',
 'has_balcony',
 'dist_uni_log',
 'dist_sch_log',
 'loc_code_e0cff11b',
 'obj_type_0d6c4dfc',
 'dist_clinic_log',
 'has_store',
 'loc_code_8d5a4f0c',
 'loc_code_533f6886',
 'floor_no',
 'loc_code_81b10147',
 'dist_pharma_log',
 '1990_2000',
 'loc_code_765f79ed',
 '1970_1980',
 'last_floor',
 ]

In [23]:
X = df_train[potential_features]
y = df_train["price_z"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=420)

print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)

Train shape: (100130, 35)
Validation shape: (25033, 35)


In [24]:
X_test = df_test[potential_features]
y_test = df_test["price_z"]

In [25]:
# Scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)         # train
X_test = scaler.transform(X_test)   # test

In [33]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import pandas as pd

# === Define K-Fold CV ===
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# === OLS (baseline - trained once on full training set) ===
ols = LinearRegression()
ols.fit(X_train, y_train)
rmse_ols = mean_squared_error(y_val, ols.predict(X_val), squared=False)

# === Ridge Regression with K-Fold CV ===
ridge = Ridge()
ridge_params = {"alpha": [0.01, 0.1, 1.0, 10.0, 100.0]}
ridge_cv = RandomizedSearchCV(ridge, ridge_params, n_iter=3, cv=cv,
                              scoring='neg_root_mean_squared_error', random_state=420)
ridge_cv.fit(X_train, y_train)
rmse_ridge = mean_squared_error(y_val, ridge_cv.predict(X_val), squared=False)

# === Lasso Regression with K-Fold CV ===
lasso = Lasso(max_iter=5000)
lasso_params = {"alpha": [0.0001, 0.001, 0.01, 0.1, 1.0]}
lasso_cv = RandomizedSearchCV(lasso, lasso_params, n_iter=3, cv=cv,
                              scoring='neg_root_mean_squared_error', random_state=420)
lasso_cv.fit(X_train, y_train)
rmse_lasso = mean_squared_error(y_val, lasso_cv.predict(X_val), squared=False)

# === ElasticNet Regression with K-Fold CV ===
enet = ElasticNet(max_iter=5000)
enet_params = {"alpha": [0.01, 0.1, 1.0], "l1_ratio": [0.1, 0.6, 1.0]}
enet_cv = RandomizedSearchCV(enet, enet_params, n_iter=3, cv=cv,
                             scoring='neg_root_mean_squared_error', random_state=420)
enet_cv.fit(X_train, y_train)
rmse_enet = mean_squared_error(y_val, enet_cv.predict(X_val), squared=False)

# === Compare models by RMSE ===
results = pd.DataFrame({
    "Model": ["OLS", "Ridge", "Lasso", "ElasticNet"],
    "RMSE": [rmse_ols, rmse_ridge, rmse_lasso, rmse_enet]
}).sort_values("RMSE")

print(results)



        Model          RMSE
0         OLS  94957.726797
3  ElasticNet  94957.741504
2       Lasso  94957.877456
1       Ridge  94958.210880




In [32]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

enet = ElasticNet(max_iter=10000)
enet_params = {"alpha": [0.01, 0.1, 1.0], "l1_ratio": [0.1,0.6, 1.0]}
enet_cv = RandomizedSearchCV(enet, enet_params, n_iter=5, cv=cv,
                             scoring='neg_root_mean_squared_error', random_state=420)
enet_cv.fit(X_train, y_train)
rmse_enet = mean_squared_error(y_val, enet_cv.predict(X_val), squared=False)

print(rmse_enet)

94957.87745592996




In [29]:
# === Retrain best ElasticNet model on full training data ===
best_enet = ElasticNet(
    alpha=enet_cv.best_params_["alpha"],
    l1_ratio=enet_cv.best_params_["l1_ratio"],
    max_iter=5000
)
best_enet.fit(X, y)  # use full df_train (X and y)

# === Predict on test set ===
y_test_pred = best_enet.predict(X_test)
rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)

print("Final RMSE on test set:", round(rmse_test, 5))

Final RMSE on test set: 96928.47854




In [31]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

# === Step 1: Hyperparameter tuning using CV ===
ridge = Ridge(max_iter=5000)
ridge_params = {"alpha": [0.01, 0.1, 1.0, 10.0, 100.0]}
ridge_cv = RandomizedSearchCV(
    ridge,
    ridge_params,
    n_iter=3,
    cv=cv,
    scoring='neg_root_mean_squared_error',
    random_state=420
)
ridge_cv.fit(X_train, y_train)

# === Step 2: Retrain best model on full training data ===
best_ridge = Ridge(
    alpha=ridge_cv.best_params_["alpha"],
    max_iter=5000
)
best_ridge.fit(X, y)  # X and y are full training data

# === Step 3: Predict on test set ===
y_test_pred = best_ridge.predict(X_test)
rmse_test_ridge = mean_squared_error(y_test, y_test_pred, squared=False)

print("Final RMSE on test set (Ridge):", round(rmse_test_ridge, 5))

Final RMSE on test set (Ridge): 96928.68312




In [30]:
# === Predict and un-log ===
y_test_pred_log = best_ridge.predict(X_test)          # predicted log(price)
y_test_true_log = y_test                             # true log(price)

y_test_pred_price = np.exp(y_test_pred_log)
y_test_true_price = np.exp(y_test_true_log)

# === Calculate RMSE on real price scale ===
rmse_price = np.sqrt(mean_squared_error(y_test_true_price, y_test_pred_price))

print("Final RMSE on real prices:", round(rmse_price, 2))

  y_test_pred_price = np.exp(y_test_pred_log)
  result = getattr(ufunc, method)(*inputs, **kwargs)


ValueError: Input contains infinity or a value too large for dtype('float64').

# Making final predictions

In [44]:
final_test_df = pd.read_csv("final_test.csv")

X_final_test = final_test_df[potential_features]
log_preds = best_enet.predict(X_final_test)
price_preds = np.exp(log_preds)



In [45]:
submission = pd.DataFrame({
    "id": final_test_df["Unnamed: 0"],
    "price_pred": price_preds
})

# === Save to CSV ===
#submission.to_csv("final_predictions.csv", index=False)