In [1]:
import optuna

In [2]:
import xgboost as xgb
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error

In [3]:
# We are using admission prediction dataset

df = pd.read_csv("Admission_Prediction.csv")

In [4]:
df

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,,104.0,3.0,3.0,3.5,8.00,1,0.72
3,4,322.0,110.0,3.0,3.5,2.5,8.67,1,0.80
4,5,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332.0,108.0,5.0,4.5,4.0,9.02,1,0.87
496,497,337.0,117.0,5.0,5.0,5.0,9.87,1,0.96
497,498,330.0,120.0,5.0,4.5,5.0,9.56,1,0.93
498,499,312.0,103.0,4.0,4.0,5.0,8.43,0,0.73


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Serial No.         500 non-null    int64  
 1   GRE Score          485 non-null    float64
 2   TOEFL Score        490 non-null    float64
 3   University Rating  485 non-null    float64
 4   SOP                500 non-null    float64
 5   LOR                500 non-null    float64
 6   CGPA               500 non-null    float64
 7   Research           500 non-null    int64  
 8   Chance of Admit    500 non-null    float64
dtypes: float64(7), int64(2)
memory usage: 35.3 KB


In [6]:
df.describe()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,500.0,485.0,490.0,485.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,316.558763,107.187755,3.121649,3.374,3.484,8.57644,0.56,0.72174
std,144.481833,11.274704,6.112899,1.14616,0.991004,0.92545,0.604813,0.496884,0.14114
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,125.75,308.0,103.0,2.0,2.5,3.0,8.1275,0.0,0.63
50%,250.5,317.0,107.0,3.0,3.5,3.5,8.56,1.0,0.72
75%,375.25,325.0,112.0,4.0,4.0,4.0,9.04,1.0,0.82
max,500.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [7]:
df.isnull().sum()

Serial No.            0
GRE Score            15
TOEFL Score          10
University Rating    15
SOP                   0
LOR                   0
CGPA                  0
Research              0
Chance of Admit       0
dtype: int64

In [8]:
# We should handle the null values, as of now we using median

df["GRE Score"] = df["GRE Score"].fillna(df["GRE Score"].median())
df["TOEFL Score"] = df["TOEFL Score"].fillna(df["TOEFL Score"].median())
df["University Rating"] = df["University Rating"].fillna(df["University Rating"].median())

In [9]:
df.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [10]:
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,317.0,104.0,3.0,3.0,3.5,8.0,1,0.72
3,4,322.0,110.0,3.0,3.5,2.5,8.67,1,0.8
4,5,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65


In [11]:
# removing unnecessary column and target column

X = df.drop(["Serial No.", "Chance of Admit"], axis=1)
y = df[["Chance of Admit"]]

In [12]:
X

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337.0,118.0,4.0,4.5,4.5,9.65,1
1,324.0,107.0,4.0,4.0,4.5,8.87,1
2,317.0,104.0,3.0,3.0,3.5,8.00,1
3,322.0,110.0,3.0,3.5,2.5,8.67,1
4,314.0,103.0,2.0,2.0,3.0,8.21,0
...,...,...,...,...,...,...,...
495,332.0,108.0,5.0,4.5,4.0,9.02,1
496,337.0,117.0,5.0,5.0,5.0,9.87,1
497,330.0,120.0,5.0,4.5,5.0,9.56,1
498,312.0,103.0,4.0,4.0,5.0,8.43,0


In [13]:
y

Unnamed: 0,Chance of Admit
0,0.92
1,0.76
2,0.72
3,0.80
4,0.65
...,...
495,0.87
496,0.96
497,0.93
498,0.73


In [14]:
# Here we can skip the scaling method, because we are using decision trees, if we want to use other algorithms, then we should
# use scaling.

# For trees, scaling is not required

Do train, test split before standardization, that's the good practice, (this method is good, data leakage will not happen)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=25)

In [16]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(375, 7)
(125, 7)
(375, 1)
(125, 1)


In [17]:
std_sca = StandardScaler()

In [18]:
X_train = std_sca.fit_transform(X_train)

In [19]:
X_train

array([[ 2.90198444e-01, -5.41014617e-01, -7.62029204e-02, ...,
        -1.02555725e+00, -1.32267264e-03,  9.10641693e-01],
       [-1.42855793e+00, -3.59179829e-02, -7.62029204e-02, ...,
         5.17087690e-02, -3.81591057e-01,  9.10641693e-01],
       [ 4.71120167e-01, -3.59179829e-02, -7.62029204e-02, ...,
         5.17087690e-02, -1.83190161e-01,  9.10641693e-01],
       ...,
       [-7.95331895e-01, -3.72649072e-01,  7.89739357e-01, ...,
        -1.02555725e+00, -1.02639397e+00,  9.10641693e-01],
       [ 6.52041890e-01,  6.37544196e-01, -7.62029204e-02, ...,
        -1.56419026e+00,  3.78945712e-01,  9.10641693e-01],
       [-7.04871034e-01, -3.72649072e-01,  1.65568163e+00, ...,
         5.17087690e-02, -1.78560807e-02, -1.09812675e+00]])

In [20]:
X_test = std_sca.transform(X_test)

In [21]:
X_test

array([[-3.43027587e-01, -2.22467007e+00, -9.42145198e-01,
        -8.60761806e-01, -2.10282327e+00, -7.28792625e-01,
        -1.09812675e+00],
       [ 4.71120167e-01, -5.41014617e-01, -7.62029204e-02,
         1.26350357e-01,  5.90341779e-01,  4.45079344e-01,
         9.10641693e-01],
       [-1.15717534e+00, -3.72649072e-01, -1.80808748e+00,
        -3.67205724e-01, -2.10282327e+00, -1.77039733e+00,
        -1.09812675e+00],
       [ 3.80659306e-01,  1.14264083e+00,  7.89739357e-01,
         6.19906438e-01,  1.66760780e+00,  9.08014768e-01,
        -1.09812675e+00],
       [ 1.19480706e+00,  9.74275286e-01,  1.65568163e+00,
         1.60701860e+00,  5.90341779e-01,  1.22214952e+00,
         9.10641693e-01],
       [ 8.32963614e-01,  1.47937192e+00, -7.62029204e-02,
         1.26350357e-01,  5.90341779e-01,  9.41081584e-01,
         9.10641693e-01],
       [-8.85792757e-01,  1.32447562e-01, -9.42145198e-01,
         6.19906438e-01,  5.17087690e-02, -1.43972917e+00,
        -1.0981267

In [22]:
!nvidia-smi

'nvidia-smi' is not recognized as an internal or external command,
operable program or batch file.


In [23]:
print(X)
print(y)

     GRE Score  TOEFL Score  University Rating  SOP  LOR  CGPA  Research
0        337.0        118.0                4.0  4.5  4.5  9.65         1
1        324.0        107.0                4.0  4.0  4.5  8.87         1
2        317.0        104.0                3.0  3.0  3.5  8.00         1
3        322.0        110.0                3.0  3.5  2.5  8.67         1
4        314.0        103.0                2.0  2.0  3.0  8.21         0
..         ...          ...                ...  ...  ...   ...       ...
495      332.0        108.0                5.0  4.5  4.0  9.02         1
496      337.0        117.0                5.0  5.0  5.0  9.87         1
497      330.0        120.0                5.0  4.5  5.0  9.56         1
498      312.0        103.0                4.0  4.0  5.0  8.43         0
499      327.0        113.0                4.0  4.5  4.5  9.04         0

[500 rows x 7 columns]
     Chance of Admit
0               0.92
1               0.76
2               0.72
3               

We are finding the better parameters by using optuna, thats why we writing like this.

In [24]:
def objective(trail, data=X, target=y):

    # splitting data into train&test
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.25, random_state=30)

    # Choosing best hyperparameter
    param = {
        "tree_method":"gpu_hist", 
        "reg_lambda" : trail.suggest.loguniform("reg_lambda", le-4, 10.0),
        "gamma" :trail.suggest_loguniform("gamma", le-4, 10.0),
        "colsample_bytree" : trail.suggest_categorical("colsample_bytree", [.1,.2,.3,.4,.5,.6,.7,.8,.9,1]),
        "subsample" : trail.suggest_categorical("subsample_bytree", [.1,.2,.3,.4,.5,.6,.7,.8,.9,1]),
        "learning_rate" : trail.suggest_categorical("learning_rate", [.00001,.00003,.008,.02,.01,1,8]),
        "n_estimator" : 300,
        "max_depth":trail.suggest_categorical("max_depth", [3,4,5,6,7,8,9,10,11,12]),
        "random_state":trail.suggest_categorical("random_state", [10,20,30,2000,3454,243123]),
        "min_child_weight":trail.suggest_int("min_child_weight", 1,200)
    }

    # model fitting
    Xgb_reg_model = xgb.XGBRegressor(**param) # we are passing the entire dictionary

    Xgb_reg_model.fit(train_x, train_y, set=[test_x, test_y], verbose=True)

    #model predicting
    pred_xgb = Xgb_reg_model.predict(test_x)

    mse = mean_squared_error(test_y, pred_xgb)

    return mse


In [25]:
find_params = optuna.create_study()
find_params.optimize(objective, n_trials=10)
find_params.best_trails.params

# It should work, I dont know why its showing error

[I 2023-07-31 11:30:17,989] A new study created in memory with name: no-name-37e59397-8f8d-483b-95a0-1f3b7b7b84dc
[W 2023-07-31 11:30:17,995] Trial 0 failed with parameters: {} because of the following error: AttributeError("'Trial' object has no attribute 'suggest'").
Traceback (most recent call last):
  File "c:\Users\msant\anaconda3\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\msant\AppData\Local\Temp/ipykernel_4932/1943312502.py", line 9, in objective
    "reg_lambda" : trail.suggest.loguniform("reg_lambda", le-4, 10.0),
AttributeError: 'Trial' object has no attribute 'suggest'
[W 2023-07-31 11:30:18,001] Trial 0 failed with value None.


AttributeError: 'Trial' object has no attribute 'suggest'

Until now, we are train our model with different different hyper parameter, by using optuna

In [32]:
find_params.trials_dataframe()

# with respect to each and every trails we can find out the value here

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_colsample_bytree,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_random_state,params_subsample,state
0,0,,2023-07-31 11:33:56.389910,2023-07-31 11:33:56.414872,0 days 00:00:00.024962,4.390767,0.7,0.725202,0.008,6,192,20,0.4,FAIL


In [34]:
optuna.visualization.plot_optimization_history(find_params)

[W 2023-07-31 11:38:14,132] There are no complete trials.


In [35]:
optuna.visualization.plot_slice(find_params)

[W 2023-07-31 11:41:49,666] Your study does not have any completed trials.


In [36]:
optuna.visualization.plot_contour(find_params,params=['alpha','lambda'])

[W 2023-07-31 11:42:18,538] Your study does not have any completed trials.


In [37]:
best_params={'lambda': 0.7816338595163784,
 'alpha': 0.25683131748959687,
 'colsample_bytree': 0.1,
 'subsample': 0.9,
 'learning_rate': 1,
 'max_depth': 10,
 'random_state': 20,
 'min_child_weight': 124}

In [38]:
model=xgb.XGBRegressor(**best_params)

In [40]:
model.fit(X_train,y_train)

XGBRegressor(alpha=0.25683131748959687, base_score=None, booster=None,
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.1, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, lambda=0.7816338595163784,
             learning_rate=1, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=10,
             max_leaves=None, min_child_weight=124, missing=nan,
             monotone_constraints=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, ...)

In [41]:
y_pred=model.predict(X_test)

In [42]:
r2score=[0,1]
from sklearn.metrics import r2_score

r2_score(y_test,y_pred)

0.7615024365974317

In [43]:
from sklearn.ensemble import RandomForestRegressor
model2=RandomForestRegressor()
model2.fit(X_train,y_train)
y_pred2=model2.predict(X_test)
r2_score(y_test,y_pred2)



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



0.8212176612627666