In [1]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq
import scipy
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [12]:
SPLIT_SEED = 42
DATA_FILE = 'data_out/data_for_learn_parquet_last'
TARGET_FILE = 'data_in/public_train.pqt'
SUBMISSION_FILE = 'data_in/submit.pqt'

In [13]:
data = pq.read_table(f"{DATA_FILE}").to_pandas()

In [14]:
all_usr_emb = pq.read_table(f"data_in/all_usr_emb_f80_i40.parquet").to_pandas()

In [15]:
data = data.merge(all_usr_emb, how = 'left', on = ['user_id'])

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 415317 entries, 0 to 415316
Columns: 125 entries, user_id to 79
dtypes: float32(80), float64(35), int64(10)
memory usage: 272.5 MB


In [17]:
target = pq.read_table(f"{TARGET_FILE}").to_pandas()

In [18]:
data_t_is_male = target.merge(data, how = 'left', on = ['user_id'])

In [19]:
data_t_is_male['is_male'] = data_t_is_male['is_male'].map(str)
data_t_is_male = data_t_is_male[data_t_is_male['is_male'] != 'None']
data_t_is_male = data_t_is_male[data_t_is_male['is_male'] != 'NA']
data_t_is_male['is_male'] = data_t_is_male['is_male'].map(int)
data_t_is_male = data_t_is_male.fillna(0)
data_t_is_male['is_male'].value_counts()

1    135332
0    128994
Name: is_male, dtype: int64

In [20]:
data.head(30)

Unnamed: 0,user_id,price,max_request_cnt,avg_request_cnt,count_request_cnt,max_night_request_cnt,avg_night_request_cnt,count_night_request_cnt,max_day_request_cnt,avg_day_request_cnt,...,70,71,72,73,74,75,76,77,78,79
0,0,2990.0,5,1.473,131,1.0,1.0,1.0,5.0,1.911,...,0.026467,0.0118,-0.00287,-0.00893,0.015715,0.031563,0.016494,-0.031078,0.004065,0.009447
1,6,8990.0,6,1.789,456,2.0,1.111,18.0,6.0,2.266,...,-0.001458,-0.039741,0.006562,0.008677,-0.027485,0.010693,-0.002047,-0.012734,0.017082,-0.025994
2,7,5490.0,6,1.612,461,6.0,1.801,146.0,5.0,1.477,...,0.011309,-0.014313,0.02642,-0.017382,0.003251,-0.00464,0.005185,-0.029093,0.011038,-0.010447
3,19,16478.0,6,1.76,375,3.0,2.0,8.0,6.0,2.024,...,0.017102,0.010049,-0.003874,-0.02837,0.00873,0.007202,0.015157,-0.008177,0.026108,-0.003416
4,22,32665.0,12,1.711,643,4.0,1.608,51.0,12.0,2.014,...,0.007499,-0.008379,-0.017514,-0.023724,0.040093,0.011489,-0.001925,0.03693,-0.013211,0.029407
5,25,33990.0,5,1.429,252,2.0,1.143,7.0,4.0,1.526,...,0.009868,-0.000728,-0.007041,-0.007483,-0.00144,0.030464,0.013476,-0.045049,-0.01076,0.010694
6,26,4990.0,6,1.411,1018,5.0,1.413,126.0,3.0,1.228,...,0.039601,0.009405,0.013775,-0.000411,0.007716,0.010244,-0.003905,0.010397,0.03479,0.006018
7,29,45696.0,6,1.49,1458,6.0,1.196,102.0,6.0,1.516,...,0.027952,0.008575,0.016422,0.005258,0.030686,0.045441,0.000492,0.023293,-0.021104,0.010385
8,31,22829.0,12,2.428,1104,12.0,2.211,294.0,10.0,2.429,...,-0.013547,-0.009483,0.031588,-0.034373,0.013365,-0.007253,0.007863,-0.042802,0.016877,-0.011459
9,32,15825.0,6,1.592,825,5.0,1.64,89.0,6.0,1.803,...,0.021533,-0.016075,-0.015951,0.005163,0.024261,0.03191,0.013938,-0.005159,0.021881,0.013032


## Получим оценку по полу

In [21]:
%%time
x_train, x_test, y_train, y_test = train_test_split(\
    data_t_is_male.drop(['user_id', 'age', 'is_male'], axis = 1), data_t_is_male['is_male'], test_size = 0.25, random_state = SPLIT_SEED)


CPU times: user 248 ms, sys: 62.2 ms, total: 310 ms
Wall time: 308 ms


In [23]:
%%time
clf = CatBoostClassifier(thread_count=3)
clf.fit(x_train, y_train, verbose = False)
print(f'GINI по полу {2 * m.roc_auc_score(y_test, clf.predict_proba(x_test)[:,1]) - 1:2.3f}')

GINI по полу 0.679
CPU times: user 3min 55s, sys: 4.11 s, total: 3min 59s
Wall time: 1min 21s


In [22]:
clf_ran_forest = RandomForestClassifier(max_depth=2, random_state=17)
clf_ran_forest.fit(x_train, y_train)
print(f'GINI по полу {2 * m.roc_auc_score(y_test, clf_ran_forest.predict_proba(x_test)[:,1]) - 1:2.3f}')

GINI по полу 0.405


## Получим оценку по возрасту

In [24]:
def age_bucket(x):
    return bisect.bisect_right([18,25,35,45,55,65], x)

In [25]:
data_t_age = target.merge(data, how = 'left', on = ['user_id'])

In [26]:
data_t_age['age'] = data_t_age['age'].map(str)
data_t_age = data_t_age[data_t_age['age'] != 'None']
data_t_age = data_t_age[data_t_age['age'] != 'NA']
data_t_age['age'] = data_t_age['age'].map(float)
data_t_age = data_t_age.fillna(0)

In [27]:
data_t_age['age'] = data_t_age['age'].map(age_bucket)
# sns.histplot(df['age'], bins = 7)

In [28]:
data_t_age = data_t_age[data_t_age['age'] != 0]

In [29]:
data_t_age['age'].value_counts()

2    85212
3    84065
4    42083
1    25969
5    25420
6     6899
Name: age, dtype: int64

In [30]:
%%time
x_train, x_test, y_train, y_test = train_test_split(\
    data_t_age.drop(['user_id', 'age', 'is_male'], axis = 1), data_t_age['age'], test_size = 0.25, random_state = SPLIT_SEED)

clf = CatBoostClassifier(thread_count=3)
clf.fit(x_train, y_train, verbose = False)
print(m.classification_report(y_test, clf.predict(x_test), \
                            target_names = ['18-25','25-34', '35-44', '45-54', '55-65', '65+']))

              precision    recall  f1-score   support

       18-25       0.50      0.21      0.30      6503
       25-34       0.50      0.61      0.55     21176
       35-44       0.42      0.61      0.50     20971
       45-54       0.37      0.16      0.23     10530
       55-65       0.41      0.23      0.29      6480
         65+       0.40      0.03      0.06      1752

    accuracy                           0.45     67412
   macro avg       0.43      0.31      0.32     67412
weighted avg       0.44      0.45      0.42     67412

CPU times: user 23min 47s, sys: 3.63 s, total: 23min 51s
Wall time: 8min 8s


In [109]:
clf.fit(df.drop(['user_id', 'age', 'is_male'], axis = 1), df['age'], verbose = False)
id_to_submit['age'] = clf.predict(id_to_submit[['user_id']].merge(usr_emb, how = 'inner', on = ['user_id']))

## Сабмит

In [110]:
id_to_submit.head()

Unnamed: 0,user_id,is_male,age
0,6,0.330467,2
1,11,0.725477,5
2,19,0.24019,1
3,27,0.536798,2
4,32,0.471325,3


In [111]:
id_to_submit.to_csv(f'{LOCAL_DATA_PATH}/submission.csv', index = False)

In [112]:
! head $LOCAL_DATA_PATH/submission.csv

user_id,is_male,age
6,0.330467150589351,2
11,0.7254769930049977,5
19,0.24019020466489424,1
27,0.5367979653267113,2
32,0.4713251899911531,3
37,0.2810748555581949,2
43,0.6659790932425269,2
44,0.9189155263784968,1
46,0.5166941298660128,3


# Скор на лидерборде

In [151]:
context_scorer(submission, answers)

1.4715992278434493