In [1]:
import os

import pandas as pd

# Location of the merged CSV. The original machine-specific path is kept as
# the default so existing runs behave exactly as before; set the
# MERGED_DATA_CSV environment variable to point at the file on another machine.
data_path = os.environ.get(
    "MERGED_DATA_CSV",
    "C:\\Users\\OmarFCB\\Dropbox\\PC\\Downloads\\merged_with_ffill.csv",
)
data = pd.read_csv(data_path)

# Quick look at the first rows to confirm the file was parsed as expected.
print(data.head())


                  time  proton_vx_gse  proton_vy_gse  proton_vz_gse  \
0  2016-07-26 00:00:00     -374.08124      19.475000      16.175000   
1  2016-07-26 00:01:00     -373.60000      22.053333      16.660000   
2  2016-07-26 00:02:00     -373.56876      21.312500      16.556250   
3  2016-07-26 00:03:00     -374.10000      16.353334      16.333334   
4  2016-07-26 00:04:00     -373.99374      20.231250      17.500000   

   proton_vx_gsm  proton_vy_gsm  proton_vz_gsm  proton_speed  proton_density  \
0     -374.08124      20.493750      14.843750     374.95000        7.333750   
1     -373.60000      23.113335      15.153334     374.64000        7.315333   
2     -373.56876      22.375000      15.081250     374.53750        7.228750   
3     -374.10000      17.413334      15.206667     374.83334        7.299333   
4     -373.99374      21.362500      16.100000     374.96250        7.182500   

   proton_temperature    K  hour  dayofweek  quarter  month  year  dayofyear  \
0          1

In [2]:
import math

import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Features are every column except the target 'K' and the raw 'time' stamp.
X = data.drop(['K', 'time'], axis=1)  # 'time' column is dropped for simplicity
y = data['K']

# NOTE(review): train_test_split shuffles rows by default. The rows look like
# a minute-by-minute time series (see the 'time' column), so a random split
# may place near-identical neighbouring rows in both sets and make the test
# RMSE optimistic. Consider a chronological split; confirm before relying on
# the reported score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap both splits in XGBoost's DMatrix container for the native training API.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Squared-error regression trained with the GPU histogram tree method.
param = {
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist',
    'eval_metric': 'rmse'
}
num_round = 100

# dtest is supplied as the evaluation set named 'test'.
bst = xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')])

y_pred_xgb = bst.predict(dtest)

# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` keyword is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) yields the same value on every version.
rmse_xgb = math.sqrt(mean_squared_error(y_test, y_pred_xgb))
print(f"XGBoost GPU RMSE: {rmse_xgb}")


In [4]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyperparameter; the randomized search draws
# combinations from these lists/arrays.
param_dist = dict(
    learning_rate=[0.01, 0.1, 0.5],
    max_depth=np.arange(3, 10),
    min_child_weight=np.arange(1, 10),
    gamma=[tenths / 10.0 for tenths in range(5)],
    subsample=[tenths / 10.0 for tenths in range(6, 10)],
    colsample_bytree=[tenths / 10.0 for tenths in range(6, 10)],
    reg_lambda=[1e-5, 1e-2, 0.1, 1, 10],
    reg_alpha=[1e-5, 1e-2, 0.1, 1, 10],
)

# Base estimator: GPU histogram tree method on device 0 with GPU prediction.
model = xgb.XGBRegressor(
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
)

# 50 sampled candidates x 3 folds, scored by negative MSE, run in a single
# job, with a fixed seed so the sampled candidates are repeatable.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)

print(random_search.best_params_)


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=5, reg_alpha=0.1, reg_lambda=10, subsample=0.8; total time=   5.3s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=5, reg_alpha=0.1, reg_lambda=10, subsample=0.8; total time=   6.7s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=5, reg_alpha=0.1, reg_lambda=10, subsample=0.8; total time=   7.9s
[CV] END colsample_bytree=0.7, gamma=0.2, learning_rate=0.5, max_depth=5, min_child_weight=2, reg_alpha=1, reg_lambda=0.01, subsample=0.9; total time=   7.7s
[CV] END colsample_bytree=0.7, gamma=0.2, learning_rate=0.5, max_depth=5, min_child_weight=2, reg_alpha=1, reg_lambda=0.01, subsample=0.9; total time=   7.7s
[CV] END colsample_bytree=0.7, gamma=0.2, learning_rate=0.5, max_depth=5, min_child_weight=2, reg_alpha=1, reg_lambda=0.01, subsample=0.9; total tim

[CV] END colsample_bytree=0.6, gamma=0.3, learning_rate=0.1, max_depth=5, min_child_weight=1, reg_alpha=0.01, reg_lambda=1e-05, subsample=0.6; total time=   7.8s
[CV] END colsample_bytree=0.6, gamma=0.3, learning_rate=0.1, max_depth=5, min_child_weight=1, reg_alpha=0.01, reg_lambda=1e-05, subsample=0.6; total time=   7.8s
[CV] END colsample_bytree=0.8, gamma=0.3, learning_rate=0.5, max_depth=8, min_child_weight=2, reg_alpha=1e-05, reg_lambda=1, subsample=0.7; total time=  14.2s
[CV] END colsample_bytree=0.8, gamma=0.3, learning_rate=0.5, max_depth=8, min_child_weight=2, reg_alpha=1e-05, reg_lambda=1, subsample=0.7; total time=  14.4s
[CV] END colsample_bytree=0.8, gamma=0.3, learning_rate=0.5, max_depth=8, min_child_weight=2, reg_alpha=1e-05, reg_lambda=1, subsample=0.7; total time=  14.4s
[CV] END colsample_bytree=0.9, gamma=0.2, learning_rate=0.01, max_depth=3, min_child_weight=6, reg_alpha=1, reg_lambda=10, subsample=0.9; total time=   6.1s
[CV] END colsample_bytree=0.9, gamma=0.2, 

[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.5, max_depth=8, min_child_weight=1, reg_alpha=0.01, reg_lambda=1e-05, subsample=0.9; total time=  14.0s
[CV] END colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=6, min_child_weight=1, reg_alpha=0.1, reg_lambda=0.01, subsample=0.9; total time=   9.7s
[CV] END colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=6, min_child_weight=1, reg_alpha=0.1, reg_lambda=0.01, subsample=0.9; total time=   9.7s
[CV] END colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=6, min_child_weight=1, reg_alpha=0.1, reg_lambda=0.01, subsample=0.9; total time=   9.7s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.1, max_depth=7, min_child_weight=6, reg_alpha=0.1, reg_lambda=1, subsample=0.9; total time=  11.0s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.1, max_depth=7, min_child_weight=6, reg_alpha=0.1, reg_lambda=1, subsample=0.9; total time=  11.0s
[CV] END colsample_bytree=0.7, gamma=0.1,

In [5]:
# Display the hyperparameter combination selected by the randomized search.
random_search.best_params_

{'subsample': 0.6,
 'reg_lambda': 1,
 'reg_alpha': 10,
 'min_child_weight': 9,
 'max_depth': 9,
 'learning_rate': 0.5,
 'gamma': 0.0,
 'colsample_bytree': 0.8}

In [6]:
# Build a fresh regressor from the search's winning hyperparameters, using the
# same GPU settings as the search estimator, and train it on the training split.
best_params = random_search.best_params_
best_xgb_model = xgb.XGBRegressor(
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
    **best_params
)
# fit() is the last expression, so the fitted estimator's repr is the cell output.
best_xgb_model.fit(X_train, y_train)


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.8,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0.0, gpu_id=0, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.5, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=9, max_leaves=0, min_child_weight=9,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='gpu_predictor', random_state=0,
             reg_alpha=10, reg_lambda=1, ...)

In [7]:
# Predict the target for the held-out test features with the tuned model.
y_pred_best = best_xgb_model.predict(X_test)


In [8]:
import math

from sklearn.metrics import mean_squared_error

# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` keyword is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) yields the same value on every version.
rmse_best = math.sqrt(mean_squared_error(y_test, y_pred_best))
print(f"RMSE with Best Hyperparameters: {rmse_best}")


RMSE with Best Hyperparameters: 0.4117276355840797


In [9]:
# Prints the literal 1.
# NOTE(review): looks like a leftover sanity-check/debug cell — confirm it can be removed.
print(1)


1


In [14]:
# Recreate the tuned regressor from `best_params` with the GPU settings and
# train it again on the training split.
best_xgb_model = xgb.XGBRegressor(
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
    **best_params
)
# fit() is the last expression, so the fitted estimator's repr is the cell output.
best_xgb_model.fit(X_train, y_train)


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.8,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0.0, gpu_id=0, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.5, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=9, max_leaves=0, min_child_weight=9,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='gpu_predictor', random_state=0,
             reg_alpha=10, reg_lambda=1, ...)

In [16]:
import math

from sklearn.metrics import mean_squared_error

# Predict on the held-out test features with the tuned model.
y_pred_best = best_xgb_model.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` keyword is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) yields the same value on every version.
rmse_best = math.sqrt(mean_squared_error(y_test, y_pred_best))
print(f"RMSE with Best Hyperparameters: {rmse_best}")


RMSE with Best Hyperparameters: 0.4117276355840797


In [17]:
# Persist the trained model to 'model.xgb' (relative to the working directory)
# using the estimator's own save_model method.
# NOTE(review): XGBoost's documented model-file extensions are '.json' and
# '.ubj'; confirm which on-disk format the installed version writes for '.xgb'.
best_xgb_model.save_model('model.xgb')




In [18]:
# Round-trip check of the saved file: create an empty regressor and load the
# model stored in 'model.xgb' into it.
import xgboost as xgb
loaded_model = xgb.XGBRegressor()
loaded_model.load_model('model.xgb')


In [21]:
# Additionally serialize the fitted estimator object to 'xg.pkl' with pickle.
# NOTE(review): pickles are generally tied to the library versions that wrote
# them, and unpickling untrusted files can execute arbitrary code — only load
# this file from a trusted source.
import pickle
with open('xg.pkl', 'wb') as f:
    pickle.dump(best_xgb_model, f)


In [1]:
# Display the `data` DataFrame as the cell output.
# NOTE(review): the recorded output is a NameError and the execution count is
# back at 1, so this cell appears to have been run on a fresh kernel before
# the data-loading cell. Re-run the loading cell first, and prefer a bounded
# preview over displaying the whole frame.
data

NameError: name 'data' is not defined