In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('treated_df.csv').loc[:, 'age':]

In [3]:
df.groupby('has_cat').size()

has_cat
0    289
1     65
dtype: int64

In [4]:
filtered_df = df[df['has_cat'] == 0]

df = pd.concat([filtered_df.groupby('has_cat').sample(frac=0.23), df.loc[df['has_cat'] == 1]], axis=0)

In [5]:
x = df.drop(columns=['has_cat'])
y = df['has_cat'].values

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [7]:
params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': 42,
    'l2_leaf_reg': 3,
    'logging_level': 'Verbose',
    'od_type': 'Iter',
    'od_wait': 50
}

In [8]:
model = CatBoostClassifier(**params)

model.fit(x_train, y_train)

0:	total: 141ms	remaining: 2m 20s
1:	total: 142ms	remaining: 1m 10s
2:	total: 143ms	remaining: 47.7s
3:	total: 146ms	remaining: 36.3s
4:	total: 148ms	remaining: 29.4s
5:	total: 149ms	remaining: 24.7s
6:	total: 151ms	remaining: 21.4s
7:	total: 153ms	remaining: 18.9s
8:	total: 154ms	remaining: 16.9s
9:	total: 155ms	remaining: 15.4s
10:	total: 156ms	remaining: 14.1s
11:	total: 158ms	remaining: 13s
12:	total: 160ms	remaining: 12.1s
13:	total: 163ms	remaining: 11.5s
14:	total: 164ms	remaining: 10.8s
15:	total: 166ms	remaining: 10.2s
16:	total: 167ms	remaining: 9.68s
17:	total: 169ms	remaining: 9.2s
18:	total: 170ms	remaining: 8.76s
19:	total: 171ms	remaining: 8.37s
20:	total: 172ms	remaining: 8.02s
21:	total: 173ms	remaining: 7.71s
22:	total: 175ms	remaining: 7.45s
23:	total: 178ms	remaining: 7.25s
24:	total: 180ms	remaining: 7.01s
25:	total: 181ms	remaining: 6.77s
26:	total: 182ms	remaining: 6.56s
27:	total: 183ms	remaining: 6.36s
28:	total: 184ms	remaining: 6.17s
29:	total: 186ms	remainin

<catboost.core.CatBoostClassifier at 0x19b5b47fcd0>

In [9]:
result = model.predict(x_test)

print(classification_report(y_pred=result, y_true=y_test))

              precision    recall  f1-score   support

           0       1.00      0.79      0.88        19
           1       0.78      1.00      0.88        14

    accuracy                           0.88        33
   macro avg       0.89      0.89      0.88        33
weighted avg       0.91      0.88      0.88        33



In [10]:
all_features = []

for i in range(len(model.feature_importances_)):
    dicionario = {'feature_name': model.feature_names_[i], 'feature_importance': model.feature_importances_[i]}
    all_features.append(dicionario)

pd.DataFrame.from_dict(all_features).sort_values(by='feature_importance', ascending=False)

Unnamed: 0,feature_name,feature_importance
6,number_of_pets,29.40363
7,has_dog,25.717211
5,has_pets,14.69354
10,report_abandoned,6.821573
9,forgets,4.081504
1,gender,3.510364
12,would_use,3.178872
2,time_in_social_media,3.062983
11,feeling,2.699511
4,social_class,2.49802


In [11]:
x.columns

Index(['age', 'gender', 'time_in_social_media', 'people_living_together',
       'social_class', 'has_pets', 'number_of_pets', 'has_dog', 'has_others',
       'forgets', 'report_abandoned', 'feeling', 'would_use'],
      dtype='object')

In [12]:
import pickle as pkl
with open('../FORMS 2/model_cats.pkl', 'wb') as f:
    pkl.dump(model, f)