In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('treated_df.csv').loc[:, 'age':]

In [3]:
df.groupby('has_cat').size()

has_cat
0    289
1     65
dtype: int64

In [4]:
filtered_df = df[df['has_cat'] == 0]

df = pd.concat([filtered_df.groupby('has_cat').sample(frac=0.23), df.loc[df['has_cat'] == 1]], axis=0)

In [5]:
x = df.drop(columns=['has_cat'])
y = df['has_cat'].values

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [7]:
params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': 42,
    'l2_leaf_reg': 3,
    'logging_level': 'Verbose',
    'od_type': 'Iter',
    'od_wait': 50
}

In [8]:
model = CatBoostClassifier(**params)

model.fit(x_train, y_train)

0:	total: 151ms	remaining: 2m 31s
1:	total: 155ms	remaining: 1m 17s
2:	total: 159ms	remaining: 52.9s
3:	total: 166ms	remaining: 41.4s
4:	total: 170ms	remaining: 33.8s
5:	total: 175ms	remaining: 28.9s
6:	total: 180ms	remaining: 25.6s
7:	total: 187ms	remaining: 23.2s
8:	total: 195ms	remaining: 21.5s
9:	total: 200ms	remaining: 19.8s
10:	total: 203ms	remaining: 18.3s
11:	total: 221ms	remaining: 18.2s
12:	total: 229ms	remaining: 17.4s
13:	total: 235ms	remaining: 16.5s
14:	total: 240ms	remaining: 15.8s
15:	total: 245ms	remaining: 15.1s
16:	total: 251ms	remaining: 14.5s
17:	total: 259ms	remaining: 14.1s
18:	total: 264ms	remaining: 13.6s
19:	total: 267ms	remaining: 13.1s
20:	total: 277ms	remaining: 12.9s
21:	total: 288ms	remaining: 12.8s
22:	total: 296ms	remaining: 12.6s
23:	total: 306ms	remaining: 12.4s
24:	total: 310ms	remaining: 12.1s
25:	total: 319ms	remaining: 11.9s
26:	total: 323ms	remaining: 11.6s
27:	total: 331ms	remaining: 11.5s
28:	total: 334ms	remaining: 11.2s
29:	total: 338ms	remai

<catboost.core.CatBoostClassifier at 0x2615484f990>

In [9]:
result = model.predict(x_test)

print(classification_report(y_pred=result, y_true=y_test))

              precision    recall  f1-score   support

           0       0.88      0.74      0.80        19
           1       0.71      0.86      0.77        14

    accuracy                           0.79        33
   macro avg       0.79      0.80      0.79        33
weighted avg       0.80      0.79      0.79        33



In [10]:
all_features = []

for i in range(len(model.feature_importances_)):
    dicionario = {'feature_name': model.feature_names_[i], 'feature_importance': model.feature_importances_[i]}
    all_features.append(dicionario)

pd.DataFrame.from_dict(all_features).sort_values(by='feature_importance', ascending=False)

Unnamed: 0,feature_name,feature_importance
7,has_dog,26.475821
6,number_of_pets,22.774202
5,has_pets,10.944875
2,time_in_social_media,9.62916
11,feeling,7.201791
12,would_use,6.18707
10,report_abandoned,4.897414
1,gender,3.119303
0,age,3.000356
3,people_living_together,2.771197


In [11]:
x.columns

Index(['age', 'gender', 'time_in_social_media', 'people_living_together',
       'social_class', 'has_pets', 'number_of_pets', 'has_dog', 'has_others',
       'forgets', 'report_abandoned', 'feeling', 'would_use'],
      dtype='object')

In [12]:
import pickle as pkl
with open('../FORMS 2/model_cats.pkl', 'wb') as f:
    pkl.dump(model, f)