In [1]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq
import scipy
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Seed passed to every train/test split and to both CatBoost models.
SPLIT_SEED = 42

# Inputs: prepared feature table, labelled targets, and the users to score.
DATA_FILE = "data_out/data_for_learn_parquet_last_1"
TARGET_FILE = "data_in/public_train.pqt"
SUBMISSION_FILE = "data_in/submit_2.pqt"

In [9]:
# Read the prepared feature table from parquet into a DataFrame.
data = pq.read_table(DATA_FILE).to_pandas()

In [11]:
# Replace missing values with 0 in every column (numeric and object alike).
data = data.fillna(value=0)

In [12]:
# Print the schema summary (columns, non-null counts, dtypes) as a sanity check.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415317 entries, 0 to 415316
Data columns (total 51 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   user_id                           415317 non-null  int64  
 1   price                             415317 non-null  float64
 2   max_request_cnt                   415317 non-null  int64  
 3   avg_request_cnt                   415317 non-null  float64
 4   count_request_cnt                 415317 non-null  int64  
 5   max_night_request_cnt             415317 non-null  float64
 6   avg_night_request_cnt             415317 non-null  float64
 7   count_night_request_cnt           415317 non-null  float64
 8   max_day_request_cnt               415317 non-null  float64
 9   avg_day_request_cnt               415317 non-null  float64
 10  count_day_request_cnt             415317 non-null  float64
 11  max_morning_request_cnt           415317 non-null  f

In [13]:
# User embedding table; it is joined onto the features by user_id afterwards.
all_usr_emb = pq.read_table("data_in/all_usr_emb_f80_i40.parquet").to_pandas()

In [14]:
# Left join keeps every row of `data`. Rows without a matching embedding get
# NaN in the embedding columns, because this runs after the earlier fillna(0).
data = pd.merge(data, all_usr_emb, on=['user_id'], how='left')

In [15]:
# Re-check the schema after the embedding columns have been joined in.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 415317 entries, 0 to 415316
Columns: 131 entries, user_id to 79
dtypes: float32(80), float64(35), int64(10), object(6)
memory usage: 291.5+ MB


In [16]:
# Labelled users: provides the `age` and `is_male` targets keyed by user_id.
target = pq.read_table(TARGET_FILE).to_pandas()

In [17]:
# Attach features to every labelled user (left join on user_id).
data_t_is_male = pd.merge(target, data, on=['user_id'], how='left')

In [18]:
# Compare labels as text so both missing markers can be dropped in one pass:
# 'None' is what str() yields for a Python None, 'NA' is a literal marker.
is_male_as_text = data_t_is_male['is_male'].map(str)
has_label = ~is_male_as_text.isin(['None', 'NA'])

# Keep labelled rows only and store the label as an integer.
data_t_is_male = data_t_is_male.loc[has_label].assign(
    is_male=is_male_as_text.loc[has_label].map(int)
)
data_t_is_male['is_male'].value_counts()

1    135332
0    128994
Name: is_male, dtype: int64

In [24]:
# Object-dtype columns are the ones handed to CatBoost as categorical features.
cat_features = data_t_is_male.select_dtypes(include=['object']).columns.tolist()

In [25]:
# Show which columns will be treated as categorical.
print(cat_features)

['top_1_url_sum_request_cnt', 'top_2_url_sum_request_cnt', 'top_3_url_sum_request_cnt', 'top_1_url_count_request_cnt', 'top_2_url_count_request_cnt', 'top_3_url_count_request_cnt']


## Получим оценку по полу

In [50]:
%%time
x_train, x_test, y_train, y_test = train_test_split(\
    data_t_is_male.drop(['user_id', 'age', 'is_male'], axis = 1), data_t_is_male['is_male'], test_size = 0.25, random_state = SPLIT_SEED)


CPU times: user 294 ms, sys: 54.4 ms, total: 349 ms
Wall time: 349 ms


In [51]:
# Gender classifier: depth-7 depthwise trees with AUC as the tracked metric.
clf_is_male = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.05,
    depth=7,
    grow_policy='Depthwise',
    min_data_in_leaf=3,
    eval_metric='AUC',
    early_stopping_rounds=20,
    random_seed=SPLIT_SEED,
    thread_count=3,
)

In [52]:
%%time
clf_is_male.fit(x_train, y_train, verbose = False, cat_features=cat_features
                , plot=True
                )
print(f'GINI по полу {2 * m.roc_auc_score(y_test, clf_is_male.predict_proba(x_test)[:,1]) - 1:2.3f}')
# 0.691

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

GINI по полу 0.683
CPU times: user 37min 19s, sys: 36.8 s, total: 37min 56s
Wall time: 12min 50s


## Получим оценку по возрасту

In [32]:
def age_bucket(x):
    """Map an age in years to an ordinal age-bucket index.

    Buckets are half-open intervals on the boundaries 18, 25, 35, 45, 55, 65:
    0 = below 18 or missing, 1 = [18, 25), 2 = [25, 35), 3 = [35, 45),
    4 = [45, 55), 5 = [55, 65), 6 = 65 and above.

    Args:
        x: Age as an int or float; None / NaN are treated as missing.

    Returns:
        int: Bucket index in the range 0..6.
    """
    # NaN compares False against every boundary, so bisect_right would walk
    # to the end of the list and report bucket 6. Send missing ages to
    # bucket 0 instead of silently labelling them as the oldest group.
    if pd.isna(x):
        return 0
    return bisect.bisect_right((18, 25, 35, 45, 55, 65), x)

In [33]:
# Attach features to every labelled user for the age task (left join on user_id).
data_t_age = pd.merge(target, data, on=['user_id'], how='left')

In [34]:
# Compare ages as text so the 'None' / 'NA' markers can be dropped in one pass.
# Note: a float NaN stringifies to 'nan' and is NOT removed by this filter.
age_as_text = data_t_age['age'].map(str)
has_age = ~age_as_text.isin(['None', 'NA'])

# Keep rows with an age and store it as a float.
data_t_age = data_t_age.loc[has_age].assign(
    age=age_as_text.loc[has_age].map(float)
)

In [35]:
# Replace the raw age with its ordinal bucket index computed by age_bucket.
data_t_age = data_t_age.assign(age=data_t_age['age'].map(age_bucket))

In [36]:
# Bucket 0 (ages that fall before the first boundary, 18) is excluded from training.
data_t_age = data_t_age.loc[data_t_age['age'].ne(0)]

In [57]:
# Class balance across the remaining age buckets.
data_t_age['age'].value_counts()

2    85212
3    84065
4    42083
1    25969
5    25420
6     6900
Name: age, dtype: int64

In [58]:
# 75/25 split for the age task; the label is the age-bucket index.
age_features = data_t_age.drop(columns=['user_id', 'age', 'is_male'])
age_labels = data_t_age['age']
x_train, x_test, y_train, y_test = train_test_split(
    age_features, age_labels, test_size=0.25, random_state=SPLIT_SEED
)

# Multi-class age-bucket classifier.
# NOTE(review): classes_count and class_names are both supplied; class_names
# alone may be enough — confirm against the CatBoost docs before simplifying.
clf = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='AUC',
    classes_count=6,
    class_names=[1, 2, 3, 4, 5, 6],
    iterations=1000,
    learning_rate=0.05,
    early_stopping_rounds=20,
    random_seed=SPLIT_SEED,
    thread_count=3,
)

In [None]:
%%time

clf.fit(x_train, y_train, verbose = False, plot=True, cat_features=cat_features)
print(m.classification_report(y_test, clf.predict(x_test), \
                            target_names = ['18-25','25-34', '35-44', '45-54', '55-65', '65+']))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Found only 6 unique classes in the data, but have defined 7 classes. Probably something is wrong with data.


## Сабмит

In [None]:
# Age report again, this time with the raw class labels (no target_names).
# The closing parenthesis of print() was missing, which made this cell a SyntaxError.
print(m.classification_report(y_test, clf.predict(x_test)))

In [43]:
# Users that need predictions for the submission.
submit = pq.read_table(SUBMISSION_FILE).to_pandas()

In [44]:
# Attach features to every submission user (left join on user_id).
submit_predict_data = pd.merge(submit, data, on=['user_id'], how='left')

In [32]:
# Disabled: a second zero-fill on the submission features. The base table was
# already passed through fillna(0) before the embedding merge, so any NaN
# introduced by the later left joins is left as-is when this stays commented out.
# submit_predict_data = submit_predict_data.fillna(0)

In [45]:
# Build the feature frame once; both models score the same columns.
submit_features = submit_predict_data.drop(columns=['user_id'])

# Gender is submitted as the probability column at index 1, age as the
# predicted bucket label.
submit['is_male'] = clf_is_male.predict_proba(submit_features)[:, 1]
submit['age'] = clf.predict(submit_features)
submit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144724 entries, 221301 to 145315
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   user_id  144724 non-null  int64  
 1   is_male  144724 non-null  float64
 2   age      144724 non-null  int64  
dtypes: float64(1), int64(2)
memory usage: 4.4 MB


In [34]:
# Preview the first rows of the submission frame.
submit.head()

Unnamed: 0,user_id,is_male,age
221301,221301,0.918824,2
31271,31271,0.472104,3
211594,211594,0.441421,3
253119,253119,0.474605,2
192578,192578,0.841219,2


In [46]:
# Write the submission as CSV, overwriting any previous file and omitting the index.
submit.to_csv('data_out/sample_submission.csv', mode='w', index=False)

In [42]:
# Top 60 rows of the feature-importance table for `clf` (the age model).
clf.get_feature_importance(prettified=True).head(60)

Unnamed: 0,Feature Id,Importances
0,47,2.675131
1,top_1_url_sum_request_cnt,2.667914
2,price,2.502949
3,63,2.463012
4,top_2_url_sum_request_cnt,2.083147
5,67,1.997584
6,45,1.962041
7,53,1.731931
8,43,1.646971
9,32,1.39457
