In [3]:
import joblib

data = joblib.load('ml_data.pkl')

X_train = data['X_train']
X_test  = data['X_test']
y_train = data['y_train']
y_test  = data['y_test']
X_submit = data['X_submit']


In [4]:
# KNN Regression with GridSearchCV (FULL CODE)

import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor())
])

param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11, 15],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

# GridSearchCV
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',  # RMSE
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

best_knn = grid.best_estimator_

print("Best Parameters:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)

y_pred = best_knn.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Test RMSE:", rmse)
print("Test R2 Score:", r2)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'knn__metric': 'manhattan', 'knn__n_neighbors': 11, 'knn__weights': 'distance'}
Best CV RMSE: 0.2145274798848337


[WinError 2] The system cannot find the file specified
  File "C:\Users\ASUS\anaconda3\envs\test1\Lib\site-packages\joblib\externals\loky\backend\context.py", line 247, in _count_physical_cores
    cpu_count_physical = _count_physical_cores_win32()
  File "C:\Users\ASUS\anaconda3\envs\test1\Lib\site-packages\joblib\externals\loky\backend\context.py", line 299, in _count_physical_cores_win32
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\ASUS\anaconda3\envs\test1\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\anaconda3\envs\test1\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^

Test RMSE: 0.211193976765885
Test R2 Score: 0.8383689098327826


In [5]:
# Decision Tree Regression with GridSearchCV (FULL)

import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

dt = DecisionTreeRegressor(random_state=42)

param_grid = {
    'max_depth': [3, 5, 7, 10, 15, None],
    'min_samples_split': [2, 10, 30, 50],
    'min_samples_leaf': [1, 5, 10, 20],
    'max_features': [None, 'sqrt', 'log2']
}

# GridSearchCV
grid = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',  # RMSE
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

best_dt = grid.best_estimator_

print("Best Parameters:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)

y_pred = best_dt.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Test RMSE:", rmse)
print("Test R2 Score:", r2)

train_pred = best_dt.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))

print("Train RMSE:", train_rmse)
print("RMSE Gap (Train - Test):", train_rmse - rmse)


Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Best Parameters: {'max_depth': 15, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 30}
Best CV RMSE: 0.2168443154342746
Test RMSE: 0.2162636534336514
Test R2 Score: 0.8305159173483874
Train RMSE: 0.15742318314556503
RMSE Gap (Train - Test): -0.058840470288086366


In [6]:
# Random Forest Regression with GridSearchCV (FULL)

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

rf = RandomForestRegressor(
    random_state=42,
    n_jobs=-1
)

param_grid = {
    'n_estimators': [200, 400],
    'max_depth': [None,5, 10, 20, 30],
    'min_samples_split': [2, 10, 30],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ['sqrt', 'log2']
}

# GridSearchCV
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',  # RMSE
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

best_rf = grid.best_estimator_

print("Best Parameters:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)

y_pred = best_rf.predict(X_test)

# Evaluate on test set
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Test RMSE:", rmse)
print("Test R2 Score:", r2)

train_pred = best_rf.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))

print("Train RMSE:", train_rmse)
print("RMSE Gap (Train - Test):", train_rmse - rmse)


Fitting 5 folds for each of 180 candidates, totalling 900 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
Best CV RMSE: 0.18287569905894435
Test RMSE: 0.18030324473294768
Test R2 Score: 0.8821935874932131
Train RMSE: 0.06933388417663111
RMSE Gap (Train - Test): -0.11096936055631657


In [8]:
# Linear Regression - Full Code

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lr_model = LinearRegression()

lr_model.fit(X_train, y_train)

y_pred_log = lr_model.predict(X_test)

#  Evaluate on log scale
rmse_log = np.sqrt(mean_squared_error(y_test, y_pred_log))
r2 = r2_score(y_test, y_pred_log)

print("Linear Regression RMSE (log-price):", rmse_log)
print("Linear Regression R2 Score:", r2)

#  Convert predictions back to actual price
y_test_pred = np.expm1(y_pred_log)
y_test_true = np.expm1(y_test)

rmse_price = np.sqrt(mean_squared_error(y_test_true, y_test_pred))

print("Linear Regression RMSE (actual price):", rmse_price)


Linear Regression RMSE (log-price): 0.2485068803400851
Linear Regression R2 Score: 0.7762110281488033
Linear Regression RMSE (actual price): 176161.35042429125


In [9]:
# XGBoost Regression with GridSearchCV (FULL)

import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

xgb = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1
)

param_grid = {
    'n_estimators': [300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# GridSearchCV
grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',  # RMSE on log(price)
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

best_xgb = grid.best_estimator_

print("Best Parameters:", grid.best_params_)
print("Best CV RMSE (log-price):", -grid.best_score_)

 # Predict on test data (log scale)
y_pred_log = best_xgb.predict(X_test)

# Evaluate on log scale
rmse_log = np.sqrt(mean_squared_error(y_test, y_pred_log))
r2 = r2_score(y_test, y_pred_log)

print("Test RMSE (log-price):", rmse_log)
print("Test R2 Score:", r2)

# Convert back to actual price
y_test_pred = np.expm1(y_pred_log)
y_test_true = np.expm1(y_test)

rmse_price = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
print("Test RMSE (actual price):", rmse_price)

# Overfitting check
train_pred_log = best_xgb.predict(X_train)
train_rmse_log = np.sqrt(mean_squared_error(y_train, train_pred_log))

print("Train RMSE (log-price):", train_rmse_log)
print("RMSE Gap (Train - Test):", train_rmse_log - rmse_log)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 500, 'subsample': 0.8}
Best CV RMSE (log-price): 0.1656083904070192
Test RMSE (log-price): 0.1640085632274159
Test R2 Score: 0.9025246295710806
Test RMSE (actual price): 115944.18830360653
Train RMSE (log-price): 0.12601334878469334
RMSE Gap (Train - Test): -0.03799521444272255


In [10]:
# LightGBM Regression with GridSearchCV (FULL)

import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

lgbm = lgb.LGBMRegressor(
    objective='regression',
    random_state=42,
    n_jobs=-1
)

param_grid = {
    'n_estimators': [300, 500, 700],
    'learning_rate': [0.03, 0.05, 0.1],
    'max_depth': [-1, 5, 7],
    'num_leaves': [31, 50, 70],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
 # GridSearchCV
grid = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',  # RMSE on log(price)
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

best_lgbm = grid.best_estimator_

print("Best Parameters:", grid.best_params_)
print("Best CV RMSE (log-price):", -grid.best_score_)

# 6. Predict on test data (log scale)
y_pred_log = best_lgbm.predict(X_test)

# 7. Evaluate on log scale
rmse_log = np.sqrt(mean_squared_error(y_test, y_pred_log))
r2 = r2_score(y_test, y_pred_log)

print("Test RMSE (log-price):", rmse_log)
print("Test R2 Score:", r2)

# 8. Convert back to actual price
y_test_pred = np.expm1(y_pred_log)
y_test_true = np.expm1(y_test)

rmse_price = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
print("Test RMSE (actual price):", rmse_price)

# 9. Overfitting check
train_pred_log = best_lgbm.predict(X_train)
train_rmse_log = np.sqrt(mean_squared_error(y_train, train_pred_log))

print("Train RMSE (log-price):", train_rmse_log)
print("RMSE Gap (Train - Test):", train_rmse_log - rmse_log)


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001184 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1805
[LightGBM] [Info] Number of data points in the train set: 12967, number of used features: 16
[LightGBM] [Info] Start training from score 13.043959
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.03, 'max_depth': 7, 'n_estimators': 700, 'num_leaves': 31, 'subsample': 0.8}
Best CV RMSE (log-price): 0.16354761582130012
Test RMSE (log-price): 0.16274975480461298
Test R2 Score: 0.9040151850258936
Test RMSE (actual price): 116674.26292044243
Train RMSE (log-price): 0.1264557348683431
RMSE Gap (Train - Test): -0.03629401993626988




In [11]:
X_train.info()

AttributeError: 'numpy.ndarray' object has no attribute 'info'