In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('treated_df.csv').loc[:, 'age':]

In [3]:
filtered_df = df[df['has_dog'] == 0]

df = pd.concat([filtered_df.groupby('has_dog').sample(frac=0.92), df.loc[df['has_dog'] == 1]], axis=0)

In [4]:
x = df.drop(columns=['has_dog', 'has_cat'])
y = df['has_dog'].values

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [6]:
params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': 42,
    'l2_leaf_reg': 3,
    'logging_level': 'Verbose',
    'od_type': 'Iter',
    'od_wait': 50
}

In [7]:
model = CatBoostClassifier(**params)

In [8]:
# model = RandomForestClassifier()

In [9]:
model.fit(x_train, y_train)

0:	total: 152ms	remaining: 2m 32s
1:	total: 157ms	remaining: 1m 18s
2:	total: 161ms	remaining: 53.6s
3:	total: 166ms	remaining: 41.4s
4:	total: 171ms	remaining: 34.1s
5:	total: 176ms	remaining: 29.1s
6:	total: 180ms	remaining: 25.5s
7:	total: 183ms	remaining: 22.7s
8:	total: 191ms	remaining: 21s
9:	total: 195ms	remaining: 19.3s
10:	total: 204ms	remaining: 18.3s
11:	total: 215ms	remaining: 17.7s
12:	total: 221ms	remaining: 16.8s
13:	total: 225ms	remaining: 15.9s
14:	total: 230ms	remaining: 15.1s
15:	total: 238ms	remaining: 14.7s
16:	total: 244ms	remaining: 14.1s
17:	total: 253ms	remaining: 13.8s
18:	total: 259ms	remaining: 13.4s
19:	total: 264ms	remaining: 12.9s
20:	total: 278ms	remaining: 12.9s
21:	total: 281ms	remaining: 12.5s
22:	total: 294ms	remaining: 12.5s
23:	total: 299ms	remaining: 12.1s
24:	total: 307ms	remaining: 12s
25:	total: 316ms	remaining: 11.8s
26:	total: 320ms	remaining: 11.5s
27:	total: 326ms	remaining: 11.3s
28:	total: 354ms	remaining: 11.8s
29:	total: 358ms	remaining

128:	total: 931ms	remaining: 6.29s
129:	total: 935ms	remaining: 6.26s
130:	total: 940ms	remaining: 6.24s
131:	total: 946ms	remaining: 6.22s
132:	total: 953ms	remaining: 6.21s
133:	total: 958ms	remaining: 6.19s
134:	total: 966ms	remaining: 6.19s
135:	total: 971ms	remaining: 6.17s
136:	total: 975ms	remaining: 6.14s
137:	total: 980ms	remaining: 6.12s
138:	total: 984ms	remaining: 6.09s
139:	total: 988ms	remaining: 6.07s
140:	total: 992ms	remaining: 6.04s
141:	total: 1000ms	remaining: 6.04s
142:	total: 1s	remaining: 6.03s
143:	total: 1.02s	remaining: 6.06s
144:	total: 1.03s	remaining: 6.08s
145:	total: 1.03s	remaining: 6.06s
146:	total: 1.04s	remaining: 6.06s
147:	total: 1.05s	remaining: 6.04s
148:	total: 1.05s	remaining: 6.01s
149:	total: 1.06s	remaining: 6s
150:	total: 1.07s	remaining: 6.03s
151:	total: 1.08s	remaining: 6.01s
152:	total: 1.08s	remaining: 6s
153:	total: 1.09s	remaining: 6.01s
154:	total: 1.1s	remaining: 5.99s
155:	total: 1.1s	remaining: 5.97s
156:	total: 1.11s	remaining: 5

<catboost.core.CatBoostClassifier at 0x26b7ae4a010>

In [10]:
df.groupby('has_dog').size()

has_dog
0    169
1    170
dtype: int64

In [11]:
result = model.predict(x_test)

print(classification_report(y_pred=result, y_true=y_test))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90        46
           1       0.89      0.87      0.88        39

    accuracy                           0.89        85
   macro avg       0.89      0.89      0.89        85
weighted avg       0.89      0.89      0.89        85



In [12]:
all_features = []

for i in range(len(model.feature_importances_)):
    dicionario = {'feature_name': model.feature_names_[i], 'feature_importance': model.feature_importances_[i]}
    all_features.append(dicionario)

pd.DataFrame.from_dict(all_features).sort_values(by='feature_importance', ascending=False)

Unnamed: 0,feature_name,feature_importance
6,number_of_pets,28.297361
0,age,11.22486
2,time_in_social_media,9.120702
9,report_abandoned,8.819978
3,people_living_together,8.593366
4,social_class,6.736188
1,gender,6.352397
10,feeling,6.17305
5,has_pets,4.939948
11,would_use,4.811265


In [13]:
import pickle as pkl
with open('../FORMS 2/model_dogs.pkl', 'wb') as f:
    pkl.dump(model, f)

In [14]:
x.columns

Index(['age', 'gender', 'time_in_social_media', 'people_living_together',
       'social_class', 'has_pets', 'number_of_pets', 'has_others', 'forgets',
       'report_abandoned', 'feeling', 'would_use'],
      dtype='object')