In [13]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [14]:
df = pd.read_csv('treated_df.csv').loc[:, 'age':]

In [15]:
df.groupby('has_others').size()

has_others
0    334
1     20
dtype: int64

In [16]:
filtered_df = df[df['has_others'] == 0]

df = pd.concat([filtered_df.groupby('has_others').sample(frac=0.05), df.loc[df['has_others'] == 1]], axis=0)

In [17]:
x = df.drop(columns=['has_others', 'has_dog', 'has_cat'])
y = df['has_others'].values

In [18]:
x

Unnamed: 0,age,gender,time_in_social_media,people_living_together,social_class,has_pets,number_of_pets,forgets,report_abandoned,feeling,would_use
228,20,0,0,5,2,1,2,0,4,3,1.0
60,20,0,2,5,4,1,1,1,4,3,1.0
73,20,0,2,2,1,0,0,1,2,2,0.5
220,40,0,6,5,5,1,2,0,3,3,1.0
148,10,1,2,3,3,0,0,1,2,3,1.0
114,10,0,0,4,2,1,3,1,4,3,1.0
182,20,1,8,2,2,1,5,0,4,4,1.0
284,30,1,2,4,2,1,1,1,2,3,0.5
82,20,1,4,3,3,1,1,1,4,3,0.5
217,50,0,4,4,3,0,0,1,2,3,1.0


In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [20]:
params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': 42,
    'l2_leaf_reg': 3,
    'logging_level': 'Verbose',
    'od_type': 'Iter',
    'od_wait': 50
}

In [21]:
model = CatBoostClassifier(**params)

model.fit(x_train, y_train)

0:	total: 8.32ms	remaining: 8.32s
1:	total: 14.8ms	remaining: 7.39s
2:	total: 22.7ms	remaining: 7.54s
3:	total: 27.7ms	remaining: 6.91s
4:	total: 30.7ms	remaining: 6.11s
5:	total: 36ms	remaining: 5.96s
6:	total: 40.4ms	remaining: 5.73s
7:	total: 44.4ms	remaining: 5.51s
8:	total: 49.7ms	remaining: 5.47s
9:	total: 52.9ms	remaining: 5.24s
10:	total: 56.2ms	remaining: 5.05s
11:	total: 60ms	remaining: 4.94s
12:	total: 65.1ms	remaining: 4.94s
13:	total: 70.3ms	remaining: 4.95s
14:	total: 74.4ms	remaining: 4.88s
15:	total: 78.7ms	remaining: 4.84s
16:	total: 81.9ms	remaining: 4.74s
17:	total: 85.9ms	remaining: 4.68s
18:	total: 90.3ms	remaining: 4.66s
19:	total: 97.9ms	remaining: 4.79s
20:	total: 101ms	remaining: 4.71s
21:	total: 109ms	remaining: 4.84s
22:	total: 114ms	remaining: 4.82s
23:	total: 119ms	remaining: 4.85s
24:	total: 126ms	remaining: 4.9s
25:	total: 130ms	remaining: 4.85s
26:	total: 135ms	remaining: 4.87s
27:	total: 140ms	remaining: 4.88s
28:	total: 150ms	remaining: 5.01s
29:	total

116:	total: 851ms	remaining: 6.42s
117:	total: 854ms	remaining: 6.39s
118:	total: 863ms	remaining: 6.39s
119:	total: 866ms	remaining: 6.35s
120:	total: 870ms	remaining: 6.32s
121:	total: 878ms	remaining: 6.32s
122:	total: 881ms	remaining: 6.28s
123:	total: 886ms	remaining: 6.26s
124:	total: 894ms	remaining: 6.26s
125:	total: 904ms	remaining: 6.27s
126:	total: 909ms	remaining: 6.25s
127:	total: 912ms	remaining: 6.21s
128:	total: 917ms	remaining: 6.19s
129:	total: 926ms	remaining: 6.2s
130:	total: 930ms	remaining: 6.17s
131:	total: 933ms	remaining: 6.13s
132:	total: 940ms	remaining: 6.13s
133:	total: 943ms	remaining: 6.09s
134:	total: 947ms	remaining: 6.07s
135:	total: 960ms	remaining: 6.1s
136:	total: 971ms	remaining: 6.12s
137:	total: 978ms	remaining: 6.11s
138:	total: 982ms	remaining: 6.08s
139:	total: 986ms	remaining: 6.06s
140:	total: 989ms	remaining: 6.03s
141:	total: 993ms	remaining: 6s
142:	total: 1000ms	remaining: 5.99s
143:	total: 1s	remaining: 5.96s
144:	total: 1.01s	remaining

<catboost.core.CatBoostClassifier at 0x1779d5c2a50>

In [22]:
result = model.predict(x_test)

print(classification_report(y_pred=result, y_true=y_test))

              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.86      1.00      0.92         6

    accuracy                           0.90        10
   macro avg       0.93      0.88      0.89        10
weighted avg       0.91      0.90      0.90        10



In [23]:
all_features = []

for i in range(len(model.feature_importances_)):
    dicionario = {'feature_name': model.feature_names_[i], 'feature_importance': model.feature_importances_[i]}
    all_features.append(dicionario)

pd.DataFrame.from_dict(all_features).sort_values(by='feature_importance', ascending=False)

Unnamed: 0,feature_name,feature_importance
6,number_of_pets,48.694877
4,social_class,14.189125
10,would_use,6.747314
7,forgets,5.740591
2,time_in_social_media,5.728514
8,report_abandoned,5.032867
1,gender,4.886276
0,age,4.13779
3,people_living_together,2.879533
9,feeling,1.723293


In [24]:
import pickle as pkl
with open('../FORMS 2/model_others.pkl', 'wb') as f:
    pkl.dump(model, f)