In [80]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

In [None]:
# Load the training data from the normalised Excel workbook.
#
# Row bounds use spreadsheet (1-based) numbering with the header on row 1:
#   start_row - first data row to read (2 == the first row below the header)
#   end_row   - one past the last data row to read (exclusive)
# NOTE(review): this numbering is inferred from start_row = 2 reading from the
# top of the sheet - confirm it matches how the bounds are meant to be used.
start_row = 2
end_row = 1201  # exclusive
if not 2 <= start_row < end_row:
    raise ValueError(
        f"Invalid row range: start_row={start_row}, end_row={end_row} "
        "(need 2 <= start_row < end_row)"
    )

# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
data = pd.read_excel(
    r'D:\Transformed_data_norm.xlsx',
    # Keep the header (line 0) and skip the data rows that lie above start_row.
    # For start_row = 2 this range is empty, so reading starts at the first data row.
    skiprows=range(1, start_row - 1),
    nrows=end_row - start_row,  # number of data rows to read
)

# Every column except the last is used as a feature; the last column is the target.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# The whole selection is used for training; these names refer to the same objects as X and y.
x_train = X
y_train = y

In [82]:
# Number of data rows covered by the half-open range [start_row, end_row).
Nrows = end_row - start_row
Nrows

1199

In [83]:
# Hyper-parameter search for the XGBoost regressor.
# Each key lists the candidate values the randomized search may draw from.
xgb_params = {
    "n_estimators": [100, 200, 300, 500, 1000, 1500, 2000, 2500],
    "max_depth": [3, 5, 7, 10],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "reg_alpha": [0, 0.1, 0.5, 1],     # L1 regularization strength
    "reg_lambda": [0.5, 1.0, 1.5, 2],  # L2 regularization strength
}

xgb = XGBRegressor(random_state=42)

# Evaluate 50 sampled parameter combinations with 5-fold cross-validated R².
# Raise n_iter to explore more of the search space; n_jobs=-1 uses all processors.
xgb_search = RandomizedSearchCV(
    xgb,
    xgb_params,
    n_iter=50,
    cv=5,
    scoring="r2",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# fit() returns the search object itself, so the best estimator can be read off the call.
best_xgb = xgb_search.fit(x_train, y_train).best_estimator_


Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [84]:
# Hyper-parameter search for the Random Forest regressor.
# Each key lists the candidate values the randomized search may draw from.
rf_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

rf = RandomForestRegressor(random_state=42)

# Evaluate 50 sampled parameter combinations with 5-fold cross-validated R².
rf_search = RandomizedSearchCV(
    rf,
    rf_params,
    n_iter=50,
    cv=5,
    scoring="r2",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# fit() returns the search object itself, so the best estimator can be read off the call.
best_rf = rf_search.fit(x_train, y_train).best_estimator_

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [85]:
# Hyper-parameter search for the k-nearest-neighbours regressor.
# Each key lists the candidate values the randomized search may draw from.
knn_params = {
    'n_neighbors': list(range(3, 20)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # Minkowski power: 1 = Manhattan, 2 = Euclidean
}

knn = KNeighborsRegressor()

# Evaluate 50 sampled parameter combinations with 5-fold cross-validated R².
# The grid holds 17 * 2 * 2 = 68 combinations, so which 50 are tried depends on
# the sampler's seed. random_state is fixed (as in the other searches in this
# notebook) so the same candidates are drawn on every run.
knn_search = RandomizedSearchCV(
    estimator=knn,
    param_distributions=knn_params,
    n_iter=50,
    scoring='r2',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)
knn_search.fit(x_train, y_train)
best_knn = knn_search.best_estimator_

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [86]:
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import cross_val_score

# Scorer that reports R² on each cross-validation fold.
r2_scorer = make_scorer(r2_score)

# Display name -> (fitted search object, best estimator selected by that search).
models = {
    "XGBoost": (xgb_search, best_xgb),
    "Random Forest": (rf_search, best_rf),
    "KNN": (knn_search, best_knn),
}

# NOTE(review): these scores are computed on the same data that was used to pick
# the hyper-parameters, so they are presumably optimistic - confirm on held-out data.
for name, (search, model) in models.items():
    # Re-score the tuned estimator with 5-fold cross-validation on the training data.
    scores = cross_val_score(
        model,
        x_train,
        y_train,
        scoring=r2_scorer,
        cv=5,
    )
    mean_r2 = scores.mean()

    print(f"\n{name} Results:")
    print("Best Parameters:", search.best_params_)
    print(f"Mean Cross-Validated R²: {mean_r2:.4f}")



XGBoost Results:
Best Parameters: {'subsample': 0.6, 'reg_lambda': 2, 'reg_alpha': 1, 'n_estimators': 2000, 'max_depth': 5, 'learning_rate': 0.05, 'colsample_bytree': 1.0}
Mean Cross-Validated R²: 0.6693

Random Forest Results:
Best Parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 20}
Mean Cross-Validated R²: 0.6234

KNN Results:
Best Parameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 6}
Mean Cross-Validated R²: 0.5366
