In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('treated_df.csv').loc[:, 'age':]

In [3]:
df.groupby('has_others').size()

has_others
0    334
1     20
dtype: int64

In [4]:
filtered_df = df[df['has_others'] == 0]

df = pd.concat([filtered_df.groupby('has_others').sample(frac=0.05), df.loc[df['has_others'] == 1]], axis=0)

In [5]:
x = df.drop(columns=['has_others', 'has_dog', 'has_cat'])
y = df['has_others'].values

In [6]:
x

Unnamed: 0,age,gender,time_in_social_media,people_living_together,social_class,has_pets,number_of_pets,forgets,report_abandoned,feeling,would_use
99,15,1,6,4,4,0,0,1,2,1,0.0
71,15,1,0,4,3,1,1,1,3,4,1.0
234,50,0,0,2,3,1,1,0,0,3,0.5
147,15,1,2,4,3,1,1,0,4,3,1.0
261,50,1,0,2,4,1,1,0,4,2,1.0
144,15,0,6,5,4,1,1,1,4,3,0.0
56,20,0,2,5,3,1,3,1,2,3,0.0
263,50,1,0,4,3,1,3,0,2,3,1.0
83,20,1,0,3,2,1,1,1,2,4,1.0
227,25,1,2,2,2,1,3,1,4,4,1.0


In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [8]:
params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': 42,
    'l2_leaf_reg': 3,
    'logging_level': 'Verbose',
    'od_type': 'Iter',
    'od_wait': 50
}

In [9]:
model = CatBoostClassifier(**params)

model.fit(x_train, y_train)

0:	total: 147ms	remaining: 2m 26s
1:	total: 148ms	remaining: 1m 13s
2:	total: 150ms	remaining: 49.7s
3:	total: 152ms	remaining: 37.7s
4:	total: 153ms	remaining: 30.4s
5:	total: 155ms	remaining: 25.6s
6:	total: 156ms	remaining: 22.1s
7:	total: 158ms	remaining: 19.6s
8:	total: 160ms	remaining: 17.6s
9:	total: 161ms	remaining: 15.9s
10:	total: 163ms	remaining: 14.6s
11:	total: 164ms	remaining: 13.5s
12:	total: 166ms	remaining: 12.6s
13:	total: 168ms	remaining: 11.8s
14:	total: 169ms	remaining: 11.1s
15:	total: 171ms	remaining: 10.5s
16:	total: 173ms	remaining: 9.98s
17:	total: 174ms	remaining: 9.49s
18:	total: 175ms	remaining: 9.05s
19:	total: 177ms	remaining: 8.66s
20:	total: 178ms	remaining: 8.31s
21:	total: 179ms	remaining: 7.97s
22:	total: 181ms	remaining: 7.68s
23:	total: 183ms	remaining: 7.42s
24:	total: 184ms	remaining: 7.17s
25:	total: 185ms	remaining: 6.95s
26:	total: 187ms	remaining: 6.73s
27:	total: 188ms	remaining: 6.54s
28:	total: 190ms	remaining: 6.36s
29:	total: 191ms	remai

<catboost.core.CatBoostClassifier at 0x1ff938db9d0>

In [10]:
result = model.predict(x_test)

print(classification_report(y_pred=result, y_true=y_test))

              precision    recall  f1-score   support

           0       0.75      0.75      0.75         4
           1       0.83      0.83      0.83         6

    accuracy                           0.80        10
   macro avg       0.79      0.79      0.79        10
weighted avg       0.80      0.80      0.80        10



In [11]:
all_features = []

for i in range(len(model.feature_importances_)):
    dicionario = {'feature_name': model.feature_names_[i], 'feature_importance': model.feature_importances_[i]}
    all_features.append(dicionario)

pd.DataFrame.from_dict(all_features).sort_values(by='feature_importance', ascending=False)

Unnamed: 0,feature_name,feature_importance
6,number_of_pets,41.463675
8,report_abandoned,9.793212
0,age,9.345845
3,people_living_together,8.210342
9,feeling,7.289805
4,social_class,6.658913
7,forgets,5.50058
2,time_in_social_media,4.824482
10,would_use,4.235182
1,gender,2.046337


In [12]:
import pickle as pkl
with open('../FORMS 2/model_others.pkl', 'wb') as f:
    pkl.dump(model, f)