In [1]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq
import scipy
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Seed passed to every train/test split and CatBoost model in this notebook.
SPLIT_SEED = 42
# Parquet dataset holding the per-user feature table.
DATA_FILE = 'data_out/data_for_learn_parquet_last_3'
# Parquet file with the training targets (age, is_male) per user_id.
TARGET_FILE = 'data_in/public_train.pqt'
# Parquet file listing the user_ids that must be scored for the submission.
SUBMISSION_FILE = 'data_in/submit_2.pqt'

In [6]:
# Load the per-user feature table; DATA_FILE is already a plain string path.
data = pq.read_table(DATA_FILE).to_pandas()

In [7]:
# Replace every missing value with 0. This is applied to all columns of `data`,
# not only to the numeric ones.
data = data.fillna(0)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415317 entries, 0 to 415316
Data columns (total 79 columns):
 #   Column                                   Non-Null Count   Dtype  
---  ------                                   --------------   -----  
 0   user_id                                  415317 non-null  int64  
 1   price                                    415317 non-null  float64
 2   max_request_cnt                          415317 non-null  int64  
 3   avg_request_cnt                          415317 non-null  float64
 4   count_request_cnt                        415317 non-null  int64  
 5   stddev_request_cnt                       415317 non-null  float64
 6   stddev_pop_request_cnt                   415317 non-null  float64
 7   max_night_request_cnt                    415317 non-null  float64
 8   stddev_night_request_cnt                 415317 non-null  float64
 9   stddev_pop_night_request_cnt             415317 non-null  float64
 10  avg_night_request_cnt           

In [9]:
# Attach the user embedding columns to the feature table and show the result.
# NOTE(review): this runs after the fillna(0) cell, so users without a matching
# embedding row would keep NaN in the embedding columns - confirm this is intended.
all_usr_emb = pq.read_table("data_in/all_usr_emb_f80_i40.parquet").to_pandas()
data = data.merge(all_usr_emb, on='user_id', how='left')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 415317 entries, 0 to 415316
Columns: 159 entries, user_id to 79
dtypes: float32(80), float64(59), int64(10), object(10)
memory usage: 380.2+ MB


In [10]:
# Load the training targets; TARGET_FILE is already a plain string path.
target = pq.read_table(TARGET_FILE).to_pandas()

In [11]:
# One row per target user, with that user's features joined on user_id.
data_t_is_male = target.merge(data, on='user_id', how='left')

In [12]:
# Normalise the label to text so the missing-value markers can be compared
# uniformly, drop the rows marked 'None' or 'NA', then convert the label to int.
data_t_is_male['is_male'] = data_t_is_male['is_male'].map(str)
has_gender_label = ~data_t_is_male['is_male'].isin(['None', 'NA'])
data_t_is_male = data_t_is_male[has_gender_label]
data_t_is_male['is_male'] = data_t_is_male['is_male'].map(int)
data_t_is_male['is_male'].value_counts()

1    135332
0    128994
Name: is_male, dtype: int64

In [93]:
# no_importance = ['count_region_name', 'top_3_url_count_request_cnt', 'count_day_date', 'count_morning_date', 'count_date', 'count_night_date', 'count_evening_date', 'min_sum_date_request_cnt', 'min_sum_date_evening_request_cnt', 'min_sum_date_day_request_cnt', 'min_sum_date_morning_request_cnt', 'min_sum_date_night_request_cnt', 'max_day_request_cnt', 'min_lag_date', 'max_morning_request_cnt', 'max_evening_request_cnt', 'max_request_cnt', 'max_night_request_cnt', 'min_count_part_of_day_date', 'max_count_part_of_day_date']

In [94]:
# data_t_is_male = data_t_is_male.drop(no_importance, axis = 1)

In [13]:
# Every object-dtype column is passed to CatBoost as a categorical feature.
cat_features = data_t_is_male.select_dtypes(['object']).columns.tolist()

In [14]:
print(cat_features)

['top_1_url_sum_request_cnt', 'top_2_url_sum_request_cnt', 'top_3_url_sum_request_cnt', 'top_4_url_sum_request_cnt', 'top_5_url_sum_request_cnt', 'top_1_url_count_request_cnt', 'top_2_url_count_request_cnt', 'top_3_url_count_request_cnt', 'region_name', 'cpe_model_name']


In [15]:
data_t_is_male.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264326 entries, 0 to 269999
Columns: 161 entries, age to 79
dtypes: float32(80), float64(60), int64(11), object(10)
memory usage: 246.0+ MB


## Получим оценку по полу

In [16]:
%%time
x_train, x_test, y_train, y_test = train_test_split(\
    data_t_is_male.drop(['user_id', 'age', 'is_male'], axis = 1), data_t_is_male['is_male'], test_size = 0.25, random_state = SPLIT_SEED)


CPU times: user 380 ms, sys: 91.3 ms, total: 472 ms
Wall time: 471 ms


In [17]:
# Gender classifier configuration; AUC is the metric tracked during training.
clf_is_male = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.05,
    depth=7,
    grow_policy='Depthwise',
    min_data_in_leaf=3,
    eval_metric='AUC',
    early_stopping_rounds=20,
    random_seed=SPLIT_SEED,
    thread_count=3,
)

In [18]:
%%time
clf_is_male.fit(x_train, y_train, verbose = False, cat_features=cat_features
                , plot=True
                )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CPU times: user 42min 43s, sys: 37.1 s, total: 43min 20s
Wall time: 14min 38s


<catboost.core.CatBoostClassifier at 0x7f7c65f6fa00>

In [19]:
# GINI = 2 * ROC AUC - 1 on the hold-out split (recorded run: 0.688).
male_probability = clf_is_male.predict_proba(x_test)[:, 1]
gini_is_male = 2 * m.roc_auc_score(y_test, male_probability) - 1
print(f'GINI по полу {gini_is_male:2.3f}')

GINI по полу 0.688


## Получим оценку по возрасту

In [31]:
# Lower bounds of age buckets 1..6; hoisted so the sequence is built only once.
_AGE_BUCKET_BOUNDS = (18, 25, 35, 45, 55, 65)


def age_bucket(x):
    """Map an age in years to an ordinal bucket number.

    Returns 0 for ages below 18, 1 for 18-24, 2 for 25-34, 3 for 35-44,
    4 for 45-54, 5 for 55-64 and 6 for 65 and above.

    NaN is mapped to 0. Every ordering comparison with NaN is false, so a
    plain ``bisect_right`` would walk to the end of the boundaries and report
    a missing age as bucket 6 (65+).
    """
    if x != x:  # true only for NaN
        return 0
    return bisect.bisect_right(_AGE_BUCKET_BOUNDS, x)

In [32]:
# One row per target user, with that user's features joined on user_id.
data_t_age = target.merge(data, on='user_id', how='left')

In [34]:
# Keep only rows with a usable age. A missing age can appear as the text
# markers 'None'/'NA' or as a float NaN; str(NaN) is 'nan', which a comparison
# against 'None'/'NA' alone lets through, and float('nan') then converts without
# error. NaN is therefore dropped explicitly after the conversion.
age_as_text = data_t_age['age'].map(str)
has_age_marker = ~age_as_text.isin(['None', 'NA'])
# .copy() so the column assignment below writes to an independent frame.
data_t_age = data_t_age[has_age_marker].copy()
data_t_age['age'] = age_as_text[has_age_marker].map(float)
data_t_age = data_t_age[data_t_age['age'].notna()]

In [35]:
# Replace the raw age with the bucket number returned by age_bucket.
data_t_age['age'] = data_t_age['age'].map(age_bucket)
# Disabled exploratory plot of the bucket distribution:
# sns.histplot(df['age'], bins = 7)

In [36]:
# Drop bucket 0, which age_bucket returns for ages below its first boundary (18).
data_t_age = data_t_age[data_t_age['age'] != 0]

In [37]:
data_t_age['age'].value_counts()

2    85212
3    84065
4    42083
1    25969
5    25420
6     6900
Name: age, dtype: int64

In [38]:
# In this frame 'is_male' is still an object column, so it shows up among the
# object-dtype columns. Instead of keeping a hand-maintained copy of the
# feature names, derive the categorical features by excluding the columns that
# are removed from the feature matrix (id and targets).
object_columns = list(data_t_age.select_dtypes(['object']).columns)
print(object_columns)
non_feature_columns = ('user_id', 'age', 'is_male')
cat_features = [col for col in object_columns if col not in non_feature_columns]

['is_male', 'top_1_url_sum_request_cnt', 'top_2_url_sum_request_cnt', 'top_3_url_sum_request_cnt', 'top_4_url_sum_request_cnt', 'top_5_url_sum_request_cnt', 'top_1_url_count_request_cnt', 'top_2_url_count_request_cnt', 'top_3_url_count_request_cnt', 'region_name', 'cpe_model_name']


In [41]:
# Hold out 25% of the users with a known age bucket; the id and both target
# columns are removed from the feature matrix.
x_train, x_test, y_train, y_test = train_test_split(
    data_t_age.drop(columns=['user_id', 'age', 'is_male']),
    data_t_age['age'],
    test_size=0.25,
    random_state=SPLIT_SEED,
)

# Six-class age-bucket classifier; the class labels are the bucket numbers 1..6.
clf = CatBoostClassifier(
    loss_function='MultiClass',
    classes_count=6,
    class_names=[1, 2, 3, 4, 5, 6],
    iterations=500,
    learning_rate=0.05,
    eval_metric='AUC',
    early_stopping_rounds=20,
    random_seed=SPLIT_SEED,
    thread_count=3,
)

In [42]:
%%time

clf.fit(x_train, y_train, verbose = False, plot=True, cat_features=cat_features)
print(m.classification_report(y_test, clf.predict(x_test), \
                            target_names = ['18-25','25-34', '35-44', '45-54', '55-65', '65+']))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

              precision    recall  f1-score   support

       18-25       0.53      0.19      0.27      6596
       25-34       0.49      0.62      0.55     21118
       35-44       0.41      0.62      0.49     21105
       45-54       0.36      0.13      0.19     10507
       55-65       0.41      0.20      0.27      6375
         65+       0.49      0.01      0.02      1712

    accuracy                           0.44     67413
   macro avg       0.45      0.29      0.30     67413
weighted avg       0.44      0.44      0.41     67413

CPU times: user 1h 36min 41s, sys: 3min, total: 1h 39min 42s
Wall time: 34min 10s


## Сабмит

In [117]:
# Load the users to score; SUBMISSION_FILE is already a plain string path.
submit = pq.read_table(SUBMISSION_FILE).to_pandas()

In [118]:
# Join the feature table onto the submission users by user_id.
submit_predict_data = submit.merge(data, on='user_id', how='left')

In [32]:
# submit_predict_data = submit_predict_data.fillna(0)

In [119]:
# Both models score the same feature frame (everything except the id), so it
# is built once. Gender is stored as the probability of class 1, age as the
# predicted bucket.
submit_features = submit_predict_data.drop(columns=['user_id'])
submit['is_male'] = clf_is_male.predict_proba(submit_features)[:, 1]
submit['age'] = clf.predict(submit_features)
submit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144724 entries, 221301 to 145315
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   user_id  144724 non-null  int64  
 1   is_male  144724 non-null  float64
 2   age      144724 non-null  int64  
dtypes: float64(1), int64(2)
memory usage: 4.4 MB


In [120]:
submit.head()

Unnamed: 0,user_id,is_male,age
221301,221301,0.946883,2
31271,31271,0.574551,3
211594,211594,0.459825,2
253119,253119,0.53922,2
192578,192578,0.726408,2


In [121]:
# Write the submission without the index column ('w' is to_csv's default mode).
submit.to_csv('data_out/sample_submission.csv', index=False)

In [46]:
# Save the age model's feature-importance table ('w' is to_csv's default mode).
age_importance = clf.get_feature_importance(prettified=True)
age_importance.to_csv('data_out/importance.csv', index=False)

In [20]:
# Keep the last 60 rows of the gender model's feature-importance table.
# NOTE(review): the displayed rows suggest the table is sorted by descending
# importance, which would make these the 60 least important features - confirm.
importance = clf_is_male.get_feature_importance(prettified=True).tail(60)

**Модель классификации по полу сильно ускорилась и немного улучшилась после удаления признаков с Importances < 0.3; после удаления признаков с Importances < 0.5 она ухудшилась.**

In [21]:
importance.head(60)

Unnamed: 0,Feature Id,Importances
74,65,0.558375
75,18,0.548302
76,count_morning_request_cnt,0.543132
77,max_sum_date_request_cnt,0.540781
78,53,0.527146
79,29,0.523092
80,50,0.522125
81,avg_sum_date_request_cnt,0.517819
82,avg_morning_request_cnt,0.509213
83,36,0.506564


In [64]:
# Names of the features, among the rows kept in `importance`, scoring below 0.3.
is_low_importance = importance['Importances'] < 0.3
no_importance = importance.loc[is_low_importance, 'Feature Id'].tolist()
print(no_importance)

['74', 'top_2_url_sum_request_cnt', '78', 'avg_sum_date_evening_request_cnt', 'avg_request_cnt', 'count_night_request_cnt', '54', '73', '69', 'region_name', '57', 'avg_sum_date_day_request_cnt', '66', 'count_evening_request_cnt', 'max_sum_date_day_request_cnt', 'top_3_url_sum_request_cnt', 'avg_night_request_cnt', '61', 'max_sum_date_morning_request_cnt', 'avg_evening_request_cnt', '72', 'top_4_url_sum_request_cnt', 'max_sum_date_evening_request_cnt', 'top_5_url_sum_request_cnt', 'max_lag_date', 'max_sum_date_night_request_cnt', 'count_region_name', 'top_3_url_count_request_cnt', 'count_day_date', 'count_morning_date', 'count_date', 'count_night_date', 'count_evening_date', 'min_sum_date_request_cnt', 'min_sum_date_evening_request_cnt', 'min_sum_date_day_request_cnt', 'min_sum_date_morning_request_cnt', 'min_sum_date_night_request_cnt', 'max_day_request_cnt', 'min_lag_date', 'max_morning_request_cnt', 'max_evening_request_cnt', 'max_request_cnt', 'max_night_request_cnt', 'min_count_par

In [None]:
# svm, 