# CatBoost Algorithm

- A boosting ml algorithm which focuses on categorical features mainly for prediction.
- A recommended algorithm when trying to build a ml model which works on data with various categorical input features.
- Encodes  categorical features automatically.

In [65]:
# import necessary libraries 
import seaborn as sb 
from catboost import CatBoostRegressor,CatBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,accuracy_score,classification_report

In [66]:
# load the titanic dataset 
data = sb.load_dataset('titanic') 
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [67]:
# check for missing values 
data.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [68]:
# dropping deck column as it has too many missing values 
data = data.drop(['deck','alive'],axis=1) 
# imputing missing values for age, embark_town and embarked columns 
cat_imputer = SimpleImputer(strategy='most_frequent') 
data[['embark_town','embarked']] = cat_imputer.fit_transform(data[['embark_town','embarked']]) 
num_imputer = SimpleImputer(strategy='median') 
data[['age']] = num_imputer.fit_transform(data[['age']])
data.isnull().sum() 

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alone          0
dtype: int64

In [69]:
# seperating features and target variable 
X = data.drop('survived',axis=1) 
y = data['survived'] 
# train test split 
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42) 
# defining categorical features 
cat_features = [x for x in X_train.columns if X_train[x].dtype == 'object' or  X_train[x].dtype == 'category']
cat_features

['sex', 'embarked', 'class', 'who', 'embark_town']

In [70]:
# initializing CatBoostClassifier 
catboost_classifier = CatBoostClassifier(iterations=1000,learning_rate=0.01,depth=6,cat_features=cat_features,verbose=100)
# fitting the model 
catboost_classifier.fit(X_train,y_train)

0:	learn: 0.6862334	total: 54.1ms	remaining: 54.1s
100:	learn: 0.4298078	total: 4.24s	remaining: 37.7s
200:	learn: 0.3888754	total: 8.34s	remaining: 33.1s
300:	learn: 0.3709319	total: 12.8s	remaining: 29.6s
400:	learn: 0.3585788	total: 16.4s	remaining: 24.5s
500:	learn: 0.3488760	total: 19.1s	remaining: 19s
600:	learn: 0.3400971	total: 21.8s	remaining: 14.5s
700:	learn: 0.3291671	total: 24.6s	remaining: 10.5s
800:	learn: 0.3212338	total: 28.6s	remaining: 7.1s
900:	learn: 0.3096680	total: 33.5s	remaining: 3.68s
999:	learn: 0.2985733	total: 38s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1c1d1570950>

In [71]:
# classification metrics
y_pred = catboost_classifier.predict(X_test) 
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.8212290502793296
              precision    recall  f1-score   support

           0       0.81      0.90      0.86       105
           1       0.84      0.70      0.76        74

    accuracy                           0.82       179
   macro avg       0.83      0.80      0.81       179
weighted avg       0.82      0.82      0.82       179



In [74]:
# seperating target and features for regression task
X = data.drop('fare',axis=1) 
y = data['fare'] 
# train test split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42) 
# initializing CatBoostRegressor
catboost_regressor = CatBoostRegressor(iterations=1000,learning_rate=0.001,depth=6,cat_features=cat_features,verbose=100)
# fitting the model 
catboost_regressor.fit(X_train,y_train)

0:	learn: 51.9118405	total: 37.9ms	remaining: 37.9s
100:	learn: 49.9321247	total: 3.44s	remaining: 30.6s
200:	learn: 48.1742423	total: 6.85s	remaining: 27.2s
300:	learn: 46.5816058	total: 10.5s	remaining: 24.4s
400:	learn: 45.1646156	total: 14s	remaining: 20.9s
500:	learn: 43.9122793	total: 17.3s	remaining: 17.3s
600:	learn: 42.7781713	total: 20.7s	remaining: 13.8s
700:	learn: 41.7667228	total: 24.1s	remaining: 10.3s
800:	learn: 40.8473672	total: 27.8s	remaining: 6.9s
900:	learn: 40.0076370	total: 31.6s	remaining: 3.47s
999:	learn: 39.2616343	total: 35.2s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1c1cd11b230>

In [75]:
# regression metrics 
y_pred = catboost_regressor.predict(X_test) 
print("MAE:",mean_absolute_error(y_test,y_pred)) 
print("MSE:",mean_squared_error(y_test,y_pred)) 
print("R2 Score:",r2_score(y_test,y_pred))

MAE: 16.904755696554005
MSE: 891.4658490154071
R2 Score: 0.4239053991075413
