In [54]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB

In [55]:
df = pd.read_csv('df_has_others.csv').loc[:, 'age':]
df

Unnamed: 0,age,gender,time_in_social_media,people_living_together,social_class,has_pets,number_of_pets,has_others,forgets,report_abandoned,feeling,would_use,has_dogs,is_lgbt
0,49,1,2.0,4,2,0,0.0,0,1,3,3.0,0,0,0
1,16,1,2.0,4,1,1,4.0,1,1,4,5.0,1,1,1
2,30,1,3.0,4,4,0,0.0,0,1,3,3.0,0,0,0
3,15,0,6.0,3,4,1,2.0,1,1,4,5.0,0,1,0
4,10,1,2.0,3,2,1,1.0,0,0,3,5.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,45,1,2.0,3,4,1,1.0,0,1,4,5.0,1,1,0
234,22,0,3.0,4,3,1,1.0,0,1,4,5.0,1,1,0
235,18,1,3.0,4,1,1,2.0,1,1,4,5.0,1,1,0
236,22,0,3.0,2,2,1,6.0,1,0,4,5.0,1,1,0


In [56]:
df = df.loc[~df['is_lgbt'].isna()]

In [57]:
filtered_df = df[df['is_lgbt'] == 0]

df = pd.concat([filtered_df.groupby('is_lgbt').sample(frac=0.18), df.loc[df['is_lgbt'] == 1]], axis=0)

In [58]:
x = df.drop(columns=['is_lgbt'])
y = df['is_lgbt']

In [59]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [60]:
params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': 42,
    'l2_leaf_reg': 3,
    'logging_level': 'Verbose',
    'od_type': 'Iter',
    'od_wait': 50
}

In [61]:
model = CatBoostClassifier()

model.fit(x_train, y_train)

Learning rate set to 0.002963
0:	learn: 0.6916181	total: 11.8ms	remaining: 11.8s
1:	learn: 0.6902514	total: 17.2ms	remaining: 8.6s
2:	learn: 0.6891313	total: 22.2ms	remaining: 7.37s
3:	learn: 0.6877740	total: 27.2ms	remaining: 6.77s
4:	learn: 0.6872726	total: 32.1ms	remaining: 6.38s
5:	learn: 0.6857047	total: 36ms	remaining: 5.96s
6:	learn: 0.6851014	total: 40.2ms	remaining: 5.7s
7:	learn: 0.6838419	total: 46.4ms	remaining: 5.75s
8:	learn: 0.6825604	total: 51.1ms	remaining: 5.62s
9:	learn: 0.6808285	total: 56.6ms	remaining: 5.6s
10:	learn: 0.6796078	total: 62.3ms	remaining: 5.6s


11:	learn: 0.6783891	total: 70.5ms	remaining: 5.8s
12:	learn: 0.6767622	total: 76ms	remaining: 5.77s
13:	learn: 0.6756461	total: 81.1ms	remaining: 5.71s
14:	learn: 0.6743736	total: 86.7ms	remaining: 5.7s
15:	learn: 0.6736040	total: 90.7ms	remaining: 5.58s
16:	learn: 0.6729002	total: 95.6ms	remaining: 5.53s
17:	learn: 0.6715995	total: 102ms	remaining: 5.56s
18:	learn: 0.6703559	total: 107ms	remaining: 5.5s
19:	learn: 0.6691389	total: 111ms	remaining: 5.46s
20:	learn: 0.6686588	total: 117ms	remaining: 5.47s
21:	learn: 0.6671674	total: 123ms	remaining: 5.46s
22:	learn: 0.6656699	total: 127ms	remaining: 5.38s
23:	learn: 0.6646444	total: 132ms	remaining: 5.38s
24:	learn: 0.6633738	total: 137ms	remaining: 5.33s
25:	learn: 0.6620268	total: 142ms	remaining: 5.33s
26:	learn: 0.6614091	total: 147ms	remaining: 5.31s
27:	learn: 0.6607287	total: 153ms	remaining: 5.32s
28:	learn: 0.6594771	total: 161ms	remaining: 5.39s
29:	learn: 0.6583040	total: 166ms	remaining: 5.37s
30:	learn: 0.6572697	total: 17

<catboost.core.CatBoostClassifier at 0x1b38c608f50>

In [62]:
result = model.predict(x_test)

print(classification_report(y_pred=result, y_true=y_test))

              precision    recall  f1-score   support

           0       0.71      0.45      0.56        11
           1       0.45      0.71      0.56         7

    accuracy                           0.56        18
   macro avg       0.58      0.58      0.56        18
weighted avg       0.61      0.56      0.56        18



In [63]:
all_features = []

for i in range(len(model.feature_importances_)):
    dicionario = {'feature_name': model.feature_names_[i], 'feature_importance': model.feature_importances_[i]}
    all_features.append(dicionario)

pd.DataFrame.from_dict(all_features).sort_values(by='feature_importance', ascending=False)

Unnamed: 0,feature_name,feature_importance
0,age,24.154814
1,gender,15.784309
2,time_in_social_media,10.759561
6,number_of_pets,9.365871
4,social_class,7.777982
10,feeling,7.049781
9,report_abandoned,6.659739
7,has_others,5.371373
3,people_living_together,4.895913
8,forgets,3.095657
