In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [2]:
#  import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report


In [7]:
#  import titanic dataset using seaborn

df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [10]:
#  impute missing values in age and fare column using knn imputer

from sklearn.impute import KNNImputer

# Define the KNN imputer
knn_imputer = KNNImputer(n_neighbors=5)

# Impute missing values in age and fare columns
df['age'] = knn_imputer.fit_transform(df['age'].values.reshape(-1, 1))
df['fare'] = knn_imputer.fit_transform(df['fare'].values.reshape(-1, 1))


In [11]:
#  impute missing values in embark_town and embarked using mode

# Impute missing values in embark_town and embarked columns using mode
df['embark_town'] = df['embark_town'].fillna(df['embark_town'].mode()[0])
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])


In [12]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
deck           688
embark_town      0
alive            0
alone            0
dtype: int64

In [13]:
#  drop the deck column

df = df.drop(columns=['deck'])


In [20]:
# convert each category column to category  and add this as a new column in DataFrame
categorical_cols=df.select_dtypes(include=['category','object']).columns
df[categorical_cols]=df[categorical_cols].astype('category')

In [21]:
df.info(

)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   survived         891 non-null    int64   
 1   pclass           891 non-null    int64   
 2   sex              891 non-null    category
 3   age              891 non-null    float64 
 4   sibsp            891 non-null    int64   
 5   parch            891 non-null    int64   
 6   fare             891 non-null    float64 
 7   embarked         891 non-null    category
 8   class            891 non-null    category
 9   who              891 non-null    category
 10  adult_male       891 non-null    bool    
 11  embark_town      891 non-null    category
 12  alive            891 non-null    category
 13  alone            891 non-null    bool    
 14  sex_cat          891 non-null    category
 15  embarked_cat     891 non-null    category
 16  who_cat          891 non-null    category
 1

In [22]:
#  split data into X and y

# Separate features and target
X = df.drop('survived', axis=1)
y = df['survived']


In [24]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [28]:
from catboost import CatBoostClassifier

# Assuming categorical_cols is a pandas Index object containing the names of categorical columns
cat_features = categorical_cols.tolist()

model = CatBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=42)
model.fit(X_train, y_train, cat_features=cat_features)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)


0:	learn: 0.6091715	total: 57.2ms	remaining: 5.66s
1:	learn: 0.5341322	total: 59.5ms	remaining: 2.92s
2:	learn: 0.4778213	total: 63.5ms	remaining: 2.05s
3:	learn: 0.4281978	total: 68.1ms	remaining: 1.63s
4:	learn: 0.3898390	total: 73.9ms	remaining: 1.4s
5:	learn: 0.3502180	total: 78.5ms	remaining: 1.23s
6:	learn: 0.3171075	total: 83.9ms	remaining: 1.11s
7:	learn: 0.2877822	total: 88.5ms	remaining: 1.02s
8:	learn: 0.2607131	total: 93.8ms	remaining: 948ms
9:	learn: 0.2364596	total: 96.5ms	remaining: 868ms
10:	learn: 0.2186680	total: 101ms	remaining: 814ms
11:	learn: 0.1980051	total: 107ms	remaining: 781ms
12:	learn: 0.1782319	total: 108ms	remaining: 724ms
13:	learn: 0.1636687	total: 112ms	remaining: 691ms
14:	learn: 0.1498279	total: 119ms	remaining: 672ms
15:	learn: 0.1378836	total: 124ms	remaining: 650ms
16:	learn: 0.1268104	total: 126ms	remaining: 616ms
17:	learn: 0.1185019	total: 131ms	remaining: 598ms
18:	learn: 0.1086723	total: 133ms	remaining: 565ms
19:	learn: 0.0993018	total: 133m

In [29]:
#  predict and evaluate the model

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print the classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print(confusion_matrix(y_test, y_pred))


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       105
           1       1.00      1.00      1.00        74

    accuracy                           1.00       179
   macro avg       1.00      1.00      1.00       179
weighted avg       1.00      1.00      1.00       179

[[105   0]
 [  0  74]]
