In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

In [2]:
df = pd.read_csv('treated_df.csv').loc[:, 'age':]

In [3]:
filtered_df = df[df['has_dog'] == 0]

df = pd.concat([filtered_df.groupby('has_dog').sample(frac=0.92), df.loc[df['has_dog'] == 1]], axis=0)

In [4]:
x = df.drop(columns=['has_dog', 'has_cat'])
y = df['has_dog'].values

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [6]:
params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': 42,
    'l2_leaf_reg': 3,
    'logging_level': 'Verbose',
    'od_type': 'Iter',
    'od_wait': 50
}

In [7]:
model = CatBoostClassifier(**params)

In [8]:
# model = RandomForestClassifier()

In [9]:
model.fit(x_train, y_train)

0:	total: 149ms	remaining: 2m 28s
1:	total: 155ms	remaining: 1m 17s
2:	total: 178ms	remaining: 59.2s
3:	total: 186ms	remaining: 46.3s
4:	total: 191ms	remaining: 38s
5:	total: 193ms	remaining: 32s
6:	total: 196ms	remaining: 27.8s
7:	total: 198ms	remaining: 24.6s
8:	total: 201ms	remaining: 22.2s
9:	total: 203ms	remaining: 20.1s
10:	total: 208ms	remaining: 18.7s
11:	total: 210ms	remaining: 17.3s
12:	total: 217ms	remaining: 16.5s
13:	total: 221ms	remaining: 15.6s
14:	total: 224ms	remaining: 14.7s
15:	total: 230ms	remaining: 14.2s
16:	total: 234ms	remaining: 13.5s
17:	total: 237ms	remaining: 12.9s
18:	total: 241ms	remaining: 12.4s
19:	total: 246ms	remaining: 12.1s
20:	total: 251ms	remaining: 11.7s
21:	total: 253ms	remaining: 11.2s
22:	total: 254ms	remaining: 10.8s
23:	total: 257ms	remaining: 10.4s
24:	total: 260ms	remaining: 10.1s
25:	total: 262ms	remaining: 9.82s
26:	total: 266ms	remaining: 9.57s
27:	total: 268ms	remaining: 9.29s
28:	total: 270ms	remaining: 9.03s
29:	total: 276ms	remaining

<catboost.core.CatBoostClassifier at 0x1ab375a77d0>

In [10]:
df.groupby('has_dog').size()

has_dog
0    169
1    170
dtype: int64

In [None]:
result = model.predict(x_test)

print(classification_report(y_pred=result, y_true=y_test))

              precision    recall  f1-score   support

           0       0.92      0.78      0.85        46
           1       0.78      0.92      0.85        39

    accuracy                           0.85        85
   macro avg       0.85      0.85      0.85        85
weighted avg       0.86      0.85      0.85        85



In [None]:
all_features = []

for i in range(len(model.feature_importances_)):
    dicionario = {'feature_name': model.feature_names_[i], 'feature_importance': model.feature_importances_[i]}
    all_features.append(dicionario)

pd.DataFrame.from_dict(all_features).sort_values(by='feature_importance', ascending=False)

Unnamed: 0,feature_name,feature_importance
6,number_of_pets,27.143919
0,age,10.456315
9,report_abandoned,9.698813
3,people_living_together,8.715426
2,time_in_social_media,7.855661
1,gender,7.283002
10,feeling,6.647517
5,has_pets,6.49233
4,social_class,6.247529
11,would_use,6.088368


In [13]:
import pickle as pkl
with open('../FORMS 2/model_dogs.pkl', 'wb') as f:
    pkl.dump(model, f)

In [14]:
x.columns

Index(['age', 'gender', 'time_in_social_media', 'people_living_together',
       'social_class', 'has_pets', 'number_of_pets', 'has_others', 'forgets',
       'report_abandoned', 'feeling', 'would_use'],
      dtype='object')