In [None]:
import warnings
warnings.filterwarnings('ignore')
import pyarrow.parquet as pq
import sklearn.metrics as m
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# --- Configuration -----------------------------------------------------------
SPLIT_SEED = 16
DATA_FILE = 'data_out/data_for_learn_parquet_last_1'
TARGET_FILE = 'data_in/public_train.pqt'
USER_EMB_FILE = 'data_in/sum50_count30_countdate30_regionsumreq30_usr_emb_f_i50.parquet'
# user_ids removed from the training set (the reason is not recorded in this notebook).
EXCLUDED_USER_IDS = [155670, 327408, 28719, 330397, 273762, 78276, 188466, 220333, 265327, 406073]

# --- Per-user feature table --------------------------------------------------
data = pq.read_table(DATA_FILE).to_pandas()

# Ratio of each part-of-day request average to the overall average.
# NOTE(review): a 0/0 ratio becomes NaN and is zero-filled below, but x/0 yields
# +/-inf, which fillna() leaves untouched - confirm the denominator is never 0
# while a numerator is non-zero.
for part_of_day in ('day', 'night', 'morning', 'evening'):
    part_col = f'{part_of_day}_avg_sum_date_request_cnt'
    data[f'{part_col}/avg_sum_date_request_cnt'] = data[part_col] / data['avg_sum_date_request_cnt']

all_usr_emb = pq.read_table(USER_EMB_FILE).to_pandas()
data = data.merge(all_usr_emb, how='left', on=['user_id'])

# Every missing value (including users without an embedding row) becomes 0.
data = data.fillna(0)

# --- Labelled training frame -------------------------------------------------
target = pq.read_table(TARGET_FILE).to_pandas()
data_t_is_male = target.merge(data, how='left', on=['user_id'])

# Drop rows whose label renders as 'None' or 'NA', then cast the label to int.
# The filtered frame is copied explicitly so the column assignment below does
# not write into a slice of the merged frame.
is_male_as_str = data_t_is_male['is_male'].map(str)
has_label = ~is_male_as_str.isin(['None', 'NA'])
data_t_is_male = data_t_is_male.loc[has_label].copy()
data_t_is_male['is_male'] = is_male_as_str.loc[has_label].map(int)
print(data_t_is_male['is_male'].value_counts())

# Every object-dtype column is passed to CatBoost as a categorical feature.
cat_features = list(data_t_is_male.select_dtypes(['object']).columns)
print(cat_features)

data_t_is_male = data_t_is_male.loc[~data_t_is_male['user_id'].isin(EXCLUDED_USER_IDS)]
print(data_t_is_male['is_male'].value_counts())

1    135332
0    128994
Name: is_male, dtype: int64
['top_1_url_sum_request_cnt', 'top_2_url_sum_request_cnt', 'top_3_url_sum_request_cnt', 'top_4_url_sum_request_cnt', 'top_5_url_sum_request_cnt', 'top_1_url_count_request_cnt', 'top_2_url_count_request_cnt', 'top_3_url_count_request_cnt', 'region_name', 'cpe_model_name']
1    135324
0    128992
Name: is_male, dtype: int64


In [3]:

# Stratified 80/20 split; the 'user_id', 'age' and 'is_male' columns are
# excluded from the feature matrix.
gender_features = data_t_is_male.drop(['user_id', 'age', 'is_male'], axis=1)
gender_labels = data_t_is_male['is_male']
x_train, x_test, y_train, y_test = train_test_split(
    gender_features,
    gender_labels,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=gender_labels,
)

catboost_params = dict(
    thread_count=5,
    iterations=7000,
    random_seed=SPLIT_SEED,
    learning_rate=0.01,
    early_stopping_rounds=100,
    eval_metric='AUC',
    depth=8,
    bootstrap_type='Bernoulli',  # alternatives: Bayesian, MVS
    loss_function='CrossEntropy',
    grow_policy='Depthwise',
    min_data_in_leaf=3,
)
clf_is_male = CatBoostClassifier(**catboost_params)

# NOTE(review): the hold-out split is also the early-stopping eval_set, so any
# metric computed later on x_test / y_test is likely optimistic - consider a
# separate validation split.
clf_is_male.fit(
    x_train,
    y_train,
    cat_features=cat_features,
    eval_set=(x_test, y_test),
    verbose=False,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7fe61d925bb0>

In [4]:
# Gini = 2 * ROC AUC - 1, from the predicted probability of class 1 on the hold-out split.
male_proba = clf_is_male.predict_proba(x_test)[:, 1]
gini_is_male = 2 * m.roc_auc_score(y_test, male_proba) - 1
print(f'GINI по полу {gini_is_male:2.3f}')

# Save the per-feature importance table, overwriting any previous file.
importance_table = clf_is_male.get_feature_importance(prettified=True)
importance_table.to_csv('data_out/clf_is_male_importance.csv', index=False, mode='w')

GINI по полу 0.700


In [5]:
# Persist the fitted classifier; the path is relative to the working directory.
GENDER_MODEL_FILE = 'catboost_clf_is_male.cbm'
clf_is_male.save_model(GENDER_MODEL_FILE)

In [6]:
# Build the gender submission: attach the feature table to the submission ids,
# predict with the fitted classifier and write the result to CSV.
submit = pq.read_table('data_in/submit_2.pqt').to_pandas()
submit_predict_data = submit.merge(data, how='left', on=['user_id'])
# The predictions are assigned back onto `submit`, so the merge must keep one row per submit row.
assert len(submit_predict_data) == len(submit), 'merge with `data` changed the number of submission rows'

# Pass exactly the columns the model was fitted on, in fitting order, instead of
# relying on "everything except user_id". The left merge produces NaN for users
# absent from `data`; they are zero-filled here, as `data` itself was, because
# CatBoost does not accept NaN in categorical columns.
submit_features = submit_predict_data[clf_is_male.feature_names_].fillna(0)

# NOTE(review): predict() writes hard 0/1 labels, while the hold-out Gini in this
# notebook is computed from predict_proba(). If the submission is scored with a
# ranking metric, predict_proba(...)[:, 1] is probably what should be written - confirm.
submit['is_male'] = clf_is_male.predict(submit_features)
submit.to_csv("data_in/male_submission.csv", index=False, mode='w')
submit.head(5)

Unnamed: 0,user_id,is_male
221301,221301,1
31271,31271,1
211594,211594,1
253119,253119,0
192578,192578,1
