In [1]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq
import scipy
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier

In [83]:
SPLIT_SEED = 42
DATA_FILE = 'data_out/data_for_learn_parquet_last_2'
TARGET_FILE = 'data_in/public_train.pqt'
SUBMISSION_FILE = 'data_in/submit_2.pqt'

In [84]:
data = pq.read_table(f"{DATA_FILE}").to_pandas()

In [85]:
data = data.fillna(0)

In [68]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415317 entries, 0 to 415316
Data columns (total 55 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   user_id                           415317 non-null  int64  
 1   price                             415317 non-null  float64
 2   max_request_cnt                   415317 non-null  int64  
 3   avg_request_cnt                   415317 non-null  float64
 4   count_request_cnt                 415317 non-null  int64  
 5   max_night_request_cnt             415317 non-null  float64
 6   avg_night_request_cnt             415317 non-null  float64
 7   count_night_request_cnt           415317 non-null  float64
 8   max_day_request_cnt               415317 non-null  float64
 9   avg_day_request_cnt               415317 non-null  float64
 10  count_day_request_cnt             415317 non-null  float64
 11  max_morning_request_cnt           415317 non-null  f

In [88]:
all_usr_emb = pq.read_table(f"data_in/all_usr_emb_f80_i40.parquet").to_pandas()
data = data.merge(all_usr_emb, how = 'left', on = ['user_id'])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 415317 entries, 0 to 415316
Columns: 215 entries, user_id to 79_y
dtypes: float32(160), float64(35), int64(10), object(10)
memory usage: 430.9+ MB


In [89]:
count_all_usr_emb = pq.read_table(f"data_in/count_usr_emb_f50_i30.parquet").to_pandas()
data = data.merge(count_all_usr_emb, how = 'left', on = ['user_id'])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 415317 entries, 0 to 415316
Columns: 265 entries, user_id to 49
dtypes: float32(210), float64(35), int64(10), object(10)
memory usage: 510.1+ MB


In [90]:
target = pq.read_table(f"{TARGET_FILE}").to_pandas()

In [91]:
data_t_is_male = target.merge(data, how = 'left', on = ['user_id'])

In [92]:
data_t_is_male['is_male'] = data_t_is_male['is_male'].map(str)
data_t_is_male = data_t_is_male[data_t_is_male['is_male'] != 'None']
data_t_is_male = data_t_is_male[data_t_is_male['is_male'] != 'NA']
data_t_is_male['is_male'] = data_t_is_male['is_male'].map(int)
data_t_is_male['is_male'].value_counts()

1    135332
0    128994
Name: is_male, dtype: int64

In [93]:
no_importance = ['count_region_name', 'top_3_url_count_request_cnt', 'count_day_date', 'count_morning_date', 'count_date', 'count_night_date', 'count_evening_date', 'min_sum_date_request_cnt', 'min_sum_date_evening_request_cnt', 'min_sum_date_day_request_cnt', 'min_sum_date_morning_request_cnt', 'min_sum_date_night_request_cnt', 'max_day_request_cnt', 'min_lag_date', 'max_morning_request_cnt', 'max_evening_request_cnt', 'max_request_cnt', 'max_night_request_cnt', 'min_count_part_of_day_date', 'max_count_part_of_day_date']

In [94]:
data_t_is_male = data_t_is_male.drop(no_importance, axis = 1)

In [95]:
cat_features = list(data_t_is_male.select_dtypes(['object']).columns)

In [96]:
print(cat_features)

['top_1_url_sum_request_cnt', 'top_2_url_sum_request_cnt', 'top_3_url_sum_request_cnt', 'top_4_url_sum_request_cnt', 'top_5_url_sum_request_cnt', 'top_1_url_count_request_cnt', 'top_2_url_count_request_cnt', 'region_name', 'cpe_model_name']


In [97]:
data_t_is_male.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264326 entries, 0 to 269999
Columns: 247 entries, age to 49
dtypes: float32(210), float64(23), int64(5), object(9)
memory usage: 288.4+ MB


## Получим оценку по полу

In [98]:
%%time
x_train, x_test, y_train, y_test = train_test_split(\
    data_t_is_male.drop(['user_id', 'age', 'is_male'], axis = 1), data_t_is_male['is_male'], test_size = 0.25, random_state = SPLIT_SEED)


CPU times: user 459 ms, sys: 74.6 ms, total: 534 ms
Wall time: 537 ms


In [99]:
clf_is_male = CatBoostClassifier(thread_count=3
                                 , iterations=2000
                                 , random_seed=SPLIT_SEED, learning_rate=0.05, early_stopping_rounds=20
                                 , eval_metric='AUC'
                                 , depth=7
                                 , grow_policy='Depthwise'
                                 , min_data_in_leaf = 3
                                 )

In [100]:
%%time
clf_is_male.fit(x_train, y_train, verbose = False, cat_features=cat_features
                , plot=True
                )
print(f'GINI по полу {2 * m.roc_auc_score(y_test, clf_is_male.predict_proba(x_test)[:,1]) - 1:2.3f}')
# 0.688

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

GINI по полу 0.691
CPU times: user 1h 4min 15s, sys: 48.6 s, total: 1h 5min 3s
Wall time: 22min 1s


## Получим оценку по возрасту

In [103]:
def age_bucket(x):
    return bisect.bisect_right([18,25,35,45,55,65], x)

In [104]:
data_t_age = target.merge(data, how = 'left', on = ['user_id'])

In [105]:
data_t_age['age'] = data_t_age['age'].map(str)
data_t_age = data_t_age[data_t_age['age'] != 'None']
data_t_age = data_t_age[data_t_age['age'] != 'NA']
data_t_age['age'] = data_t_age['age'].map(float)

In [106]:
data_t_age['age'] = data_t_age['age'].map(age_bucket)
# sns.histplot(df['age'], bins = 7)

In [107]:
data_t_age = data_t_age[data_t_age['age'] != 0]

In [108]:
data_t_age['age'].value_counts()

2    85212
3    84065
4    42083
1    25969
5    25420
6     6900
Name: age, dtype: int64

In [112]:
cat_features = list(data_t_age.select_dtypes(['object']).columns)
print(cat_features)
cat_features = ['top_1_url_sum_request_cnt', 'top_2_url_sum_request_cnt', 'top_3_url_sum_request_cnt', 'top_4_url_sum_request_cnt', 'top_5_url_sum_request_cnt', 'top_1_url_count_request_cnt', 'top_2_url_count_request_cnt', 'top_3_url_count_request_cnt', 'region_name', 'cpe_model_name']

['is_male', 'top_1_url_sum_request_cnt', 'top_2_url_sum_request_cnt', 'top_3_url_sum_request_cnt', 'top_4_url_sum_request_cnt', 'top_5_url_sum_request_cnt', 'top_1_url_count_request_cnt', 'top_2_url_count_request_cnt', 'top_3_url_count_request_cnt', 'region_name', 'cpe_model_name']


In [115]:
x_train, x_test, y_train, y_test = train_test_split(\
    data_t_age.drop(['user_id', 'age', 'is_male'], axis = 1), data_t_age['age'], test_size = 0.25, random_state = SPLIT_SEED)

clf = CatBoostClassifier(thread_count = 3
                         , iterations = 5000
                         , random_seed = SPLIT_SEED, learning_rate = 0.05, early_stopping_rounds = 20
                         , eval_metric = 'AUC', loss_function = 'MultiClass'
                         , classes_count = 6
                         , class_names = [1, 2, 3, 4, 5, 6]
                        )

In [116]:
%%time

clf.fit(x_train, y_train, verbose = False, plot=True, cat_features=cat_features)
print(m.classification_report(y_test, clf.predict(x_test), \
                            target_names = ['18-25','25-34', '35-44', '45-54', '55-65', '65+']))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

              precision    recall  f1-score   support

       18-25       0.52      0.27      0.36      6596
       25-34       0.52      0.62      0.56     21118
       35-44       0.43      0.60      0.50     21105
       45-54       0.38      0.19      0.26     10507
       55-65       0.40      0.25      0.31      6375
         65+       0.41      0.04      0.07      1712

    accuracy                           0.46     67413
   macro avg       0.44      0.33      0.34     67413
weighted avg       0.46      0.46      0.44     67413

CPU times: user 17h 12min 33s, sys: 37min 33s, total: 17h 50min 7s
Wall time: 6h 3min 41s


## Сабмит

In [117]:
submit = pq.read_table(f"{SUBMISSION_FILE}").to_pandas()

In [118]:
submit_predict_data = submit.merge(data, how = 'left', on = ['user_id'])

In [32]:
# submit_predict_data = submit_predict_data.fillna(0)

In [119]:
submit['is_male'] = clf_is_male.predict_proba(submit_predict_data.drop(['user_id', 'count_region_name', 
    'top_3_url_count_request_cnt', 'count_day_date', 'count_morning_date', 'count_date', 'count_night_date', 
    'count_evening_date', 'min_sum_date_request_cnt', 'min_sum_date_evening_request_cnt', 'min_sum_date_day_request_cnt', 
    'min_sum_date_morning_request_cnt', 'min_sum_date_night_request_cnt', 'max_day_request_cnt', 'min_lag_date', 
    'max_morning_request_cnt', 'max_evening_request_cnt', 'max_request_cnt', 'max_night_request_cnt', 'min_count_part_of_day_date', 
    'max_count_part_of_day_date'], axis = 1))[:,1]
submit['age'] = clf.predict(submit_predict_data.drop(['user_id'], axis = 1))
submit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144724 entries, 221301 to 145315
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   user_id  144724 non-null  int64  
 1   is_male  144724 non-null  float64
 2   age      144724 non-null  int64  
dtypes: float64(1), int64(2)
memory usage: 4.4 MB


In [120]:
submit.head()

Unnamed: 0,user_id,is_male,age
221301,221301,0.946883,2
31271,31271,0.574551,3
211594,211594,0.459825,2
253119,253119,0.53922,2
192578,192578,0.726408,2


In [121]:
submit.to_csv(f'data_out/sample_submission.csv', index = False, mode='w')

In [126]:
clf.get_feature_importance(prettified=True).head(60)

Unnamed: 0,Feature Id,Importances
0,cpe_model_name,2.228839
1,top_1_url_sum_request_cnt,2.035186
2,38,1.830543
3,top_2_url_sum_request_cnt,1.444013
4,25,1.400044
5,35,1.343871
6,price,1.317015
7,region_name,1.262874
8,top_1_url_count_request_cnt,1.237051
9,13,1.224975


In [20]:
importance = clf_is_male.get_feature_importance(prettified=True).tail(60)

**Модель классификации по полу сильно ускорилась и немного улучшилась после удаления признаков с Importances < 0.3, после удаления Importances < 0.5 ухудшилавь**

In [21]:
importance.head(60)

Unnamed: 0,Feature Id,Importances
74,65,0.558375
75,18,0.548302
76,count_morning_request_cnt,0.543132
77,max_sum_date_request_cnt,0.540781
78,53,0.527146
79,29,0.523092
80,50,0.522125
81,avg_sum_date_request_cnt,0.517819
82,avg_morning_request_cnt,0.509213
83,36,0.506564


In [64]:
no_importance = list(importance[importance['Importances'] < 0.3]['Feature Id'])
print(no_importance)

['74', 'top_2_url_sum_request_cnt', '78', 'avg_sum_date_evening_request_cnt', 'avg_request_cnt', 'count_night_request_cnt', '54', '73', '69', 'region_name', '57', 'avg_sum_date_day_request_cnt', '66', 'count_evening_request_cnt', 'max_sum_date_day_request_cnt', 'top_3_url_sum_request_cnt', 'avg_night_request_cnt', '61', 'max_sum_date_morning_request_cnt', 'avg_evening_request_cnt', '72', 'top_4_url_sum_request_cnt', 'max_sum_date_evening_request_cnt', 'top_5_url_sum_request_cnt', 'max_lag_date', 'max_sum_date_night_request_cnt', 'count_region_name', 'top_3_url_count_request_cnt', 'count_day_date', 'count_morning_date', 'count_date', 'count_night_date', 'count_evening_date', 'min_sum_date_request_cnt', 'min_sum_date_evening_request_cnt', 'min_sum_date_day_request_cnt', 'min_sum_date_morning_request_cnt', 'min_sum_date_night_request_cnt', 'max_day_request_cnt', 'min_lag_date', 'max_morning_request_cnt', 'max_evening_request_cnt', 'max_request_cnt', 'max_night_request_cnt', 'min_count_par

In [None]:
# svm, 