In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import neighbors
from sklearn.svm import SVR

In [2]:
# Load the Hitters data and discard every row that contains a missing value.
df = pd.read_csv("Hitters.csv").dropna()

# Regression target.
y = df["Salary"]

# One-hot encode the three categorical columns; only one indicator column per
# categorical is carried into the feature matrix below.
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])

# Numeric predictors (target and raw categoricals removed), cast to float64,
# then joined column-wise with the selected indicator columns.
X_ = df.drop(columns=['Salary', 'League', 'Division', 'NewLeague']).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)

# Hold out 25% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.7/98.7 MB 7.7 MB/s eta 0:00:00
Installing collected packages: catboost
Successfully installed catboost-1.2.7


In [4]:
from catboost import CatBoostRegressor

In [5]:
# Baseline model: CatBoost constructed with no explicit hyperparameters,
# fitted on the training split.
catb_model = (
    CatBoostRegressor()
    .fit(X_train, y_train)
)

Learning rate set to 0.031674
0:	learn: 437.6430699	total: 49.4ms	remaining: 49.3s
1:	learn: 431.3923642	total: 51.6ms	remaining: 25.7s
2:	learn: 424.8820360	total: 53.8ms	remaining: 17.9s
3:	learn: 418.2514904	total: 57.5ms	remaining: 14.3s
4:	learn: 412.6394021	total: 59.6ms	remaining: 11.9s
5:	learn: 406.6247020	total: 61.7ms	remaining: 10.2s
6:	learn: 400.5321206	total: 63.8ms	remaining: 9.05s
7:	learn: 394.6683437	total: 65.8ms	remaining: 8.16s
8:	learn: 388.2496484	total: 67.8ms	remaining: 7.46s
9:	learn: 382.9448842	total: 69.8ms	remaining: 6.91s
10:	learn: 377.2600080	total: 71.9ms	remaining: 6.46s
11:	learn: 372.4829606	total: 73.9ms	remaining: 6.08s
12:	learn: 366.6823437	total: 76ms	remaining: 5.77s
13:	learn: 362.6076230	total: 78ms	remaining: 5.49s
14:	learn: 358.0107745	total: 80ms	remaining: 5.25s
15:	learn: 353.2802665	total: 82ms	remaining: 5.04s
16:	learn: 348.5646265	total: 84.3ms	remaining: 4.87s
17:	learn: 343.6407912	total: 86.3ms	remaining: 4.71s
18:	learn: 339.2

In [6]:
# Predict on the held-out split with the baseline model; the cell's displayed
# value is the test-set RMSE (square root of the mean squared error).
y_pred = catb_model.predict(X_test)
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

351.194631344607

In [7]:
# Model tuning: grid-search CatBoost hyperparameters with cross-validation

In [11]:
# Hyperparameter grid for the search: every combination of the values below
# is evaluated (2 x 2 x 2 = 8 candidates).
catb_params = dict(
    learning_rate=[0.01, 0.1],
    depth=[4, 6],
    iterations=[100, 200],
)

In [12]:
# Fresh, unfitted estimator to hand to the grid search. This rebinds
# `catb_model`, replacing the fitted baseline model created earlier.
catb_model = CatBoostRegressor()

In [13]:
# Exhaustive search over `catb_params` with 5-fold cross-validation on the
# training split, using all available cores (n_jobs=-1).
# NOTE(review): no `scoring` argument is given, so candidates are ranked by the
# estimator's own `score` method rather than by the RMSE reported elsewhere in
# this notebook — confirm this is intended.
catb_cv_model = GridSearchCV(
    estimator=catb_model,
    param_grid=catb_params,
    cv=5,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


  _data = np.array(data, dtype=dtype, copy=copy,


0:	learn: 422.3624834	total: 6.08ms	remaining: 1.21s
1:	learn: 403.8107955	total: 8.76ms	remaining: 867ms
2:	learn: 386.6923849	total: 12.4ms	remaining: 812ms
3:	learn: 369.1363930	total: 15.4ms	remaining: 756ms
4:	learn: 355.2627946	total: 17.7ms	remaining: 689ms
5:	learn: 341.2333201	total: 21.5ms	remaining: 696ms
6:	learn: 328.2582777	total: 25.6ms	remaining: 705ms
7:	learn: 315.7512689	total: 29.3ms	remaining: 704ms
8:	learn: 304.3563870	total: 31.6ms	remaining: 670ms
9:	learn: 295.0824320	total: 35ms	remaining: 665ms
10:	learn: 285.9264237	total: 38.3ms	remaining: 658ms
11:	learn: 277.6262209	total: 42.8ms	remaining: 670ms
12:	learn: 269.8797844	total: 46.1ms	remaining: 664ms
13:	learn: 262.5253392	total: 52.1ms	remaining: 692ms
14:	learn: 255.9397985	total: 55.1ms	remaining: 679ms
15:	learn: 249.7140221	total: 58.4ms	remaining: 671ms
16:	learn: 243.5437215	total: 61.7ms	remaining: 664ms
17:	learn: 237.2205711	total: 65.1ms	remaining: 658ms
18:	learn: 230.8696751	total: 68.3ms	rem

In [14]:
# Display the hyperparameter combination the grid search selected as best.
catb_cv_model.best_params_

{'depth': 6, 'iterations': 200, 'learning_rate': 0.1}

In [15]:
# Fit the final model on the full training split with the hyperparameters the
# grid search selected. They are read from `catb_cv_model.best_params_` instead
# of being re-typed by hand, so this cell stays in sync with the search result
# if the grid or the data changes.
catb_tuned = CatBoostRegressor(**catb_cv_model.best_params_).fit(X_train, y_train)

0:	learn: 422.3624834	total: 3.64ms	remaining: 725ms
1:	learn: 403.8107955	total: 6.31ms	remaining: 625ms
2:	learn: 386.6923849	total: 8.88ms	remaining: 583ms
3:	learn: 369.1363930	total: 11.3ms	remaining: 556ms
4:	learn: 355.2627946	total: 14.5ms	remaining: 564ms
5:	learn: 341.2333201	total: 16.9ms	remaining: 546ms
6:	learn: 328.2582777	total: 21.5ms	remaining: 594ms
7:	learn: 315.7512689	total: 25.4ms	remaining: 609ms
8:	learn: 304.3563870	total: 29.3ms	remaining: 622ms
9:	learn: 295.0824320	total: 32.2ms	remaining: 612ms
10:	learn: 285.9264237	total: 34.2ms	remaining: 588ms
11:	learn: 277.6262209	total: 38ms	remaining: 595ms
12:	learn: 269.8797844	total: 39.9ms	remaining: 574ms
13:	learn: 262.5253392	total: 42.8ms	remaining: 568ms
14:	learn: 255.9397985	total: 46.5ms	remaining: 574ms
15:	learn: 249.7140221	total: 49.5ms	remaining: 569ms
16:	learn: 243.5437215	total: 51.5ms	remaining: 554ms
17:	learn: 237.2205711	total: 53.6ms	remaining: 542ms
18:	learn: 230.8696751	total: 57.3ms	rem

In [16]:
# Predict on the held-out split with the tuned model; the cell's displayed
# value is the test-set RMSE (square root of the mean squared error).
y_pred = catb_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

354.65431436332744