In [2]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq
import scipy
# import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
SPLIT_SEED = 42
DATA_FILE = 'data_out/data_for_learn_parquet_last'
TARGET_FILE = 'data_in/public_train.pqt'
SUBMISSION_FILE = 'data_in/submit.pqt'

In [5]:
data = pq.read_table(f"{DATA_FILE}").to_pandas()

In [6]:
all_usr_emb = pq.read_table(f"data_in/all_usr_emb.parquet").to_pandas()

In [7]:
data = data.merge(all_usr_emb, how = 'left', on = ['user_id'])

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270000 entries, 0 to 269999
Data columns (total 96 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   user_id                           270000 non-null  int64  
 1   is_male                           269958 non-null  object 
 2   age                               269999 non-null  float64
 3   max_request_cnt                   270000 non-null  int64  
 4   avg_request_cnt                   270000 non-null  float64
 5   count_request_cnt                 270000 non-null  int64  
 6   max_night_request_cnt             219789 non-null  float64
 7   avg_night_request_cnt             219789 non-null  float64
 8   count_night_request_cnt           219789 non-null  float64
 9   max_day_request_cnt               262528 non-null  float64
 10  avg_day_request_cnt               262528 non-null  float64
 11  count_day_request_cnt             262528 non-null  f

In [74]:
data['is_male'] = data['is_male'].map(str)
data = data[data['is_male'] != 'None']
data = data[data['is_male'] != 'NA']
data['is_male'] = data['is_male'].map(int)
data = data.fillna(0)
data['is_male'].value_counts()

1    135332
0    128994
Name: is_male, dtype: int64

In [73]:
data.head(30)

Unnamed: 0,user_id,is_male,age,max_request_cnt,avg_request_cnt,count_request_cnt,max_night_request_cnt,avg_night_request_cnt,count_night_request_cnt,max_day_request_cnt,...,40,41,42,43,44,45,46,47,48,49
0,0,0,35.0,5,1.473,131,1.0,1.0,1.0,5.0,...,0.022853,-0.006094,0.004465,-0.001786,0.060139,0.026925,0.05864,0.017299,0.026311,0.03082
1,1,0,41.0,6,1.496,700,3.0,1.088,34.0,6.0,...,-0.006913,-0.015512,0.002404,0.019383,-0.021763,-0.000559,-0.008278,0.036079,0.006483,0.019893
2,5,0,30.0,6,1.75,2519,6.0,1.714,238.0,6.0,...,-0.015745,-0.015845,0.000416,0.02096,-0.027603,-0.016322,-0.038254,0.026395,-0.024694,0.005446
3,22,1,41.0,12,1.711,643,4.0,1.608,51.0,12.0,...,-0.002526,-0.029195,0.002286,0.056949,0.008004,0.004535,0.001285,0.010217,-0.015897,-0.010648
4,25,0,26.0,5,1.429,252,2.0,1.143,7.0,4.0,...,0.017543,-0.01229,0.040848,0.01344,0.030002,0.018587,0.026283,0.016421,0.027815,-0.022683
5,26,0,59.0,6,1.411,1018,5.0,1.413,126.0,3.0,...,0.043306,-0.042662,0.04241,0.009777,-0.017051,0.018077,-0.014573,0.001667,0.022283,0.008028
6,28,1,26.0,6,1.827,1424,5.0,1.746,346.0,6.0,...,-0.031695,0.009417,0.018398,0.003226,0.009575,-0.001871,-0.022389,0.028857,-0.005008,0.013525
7,29,0,30.0,6,1.49,1458,6.0,1.196,102.0,6.0,...,0.013488,-0.000465,0.062412,0.007334,0.000374,-0.034019,-0.036064,-0.0026,0.022818,-0.022738
8,31,1,36.0,12,2.428,1104,12.0,2.211,294.0,10.0,...,0.006185,0.019309,0.007711,-0.024464,0.014273,0.001373,-0.002627,0.004056,0.005597,0.002939
9,33,0,42.0,6,1.641,730,5.0,1.321,53.0,6.0,...,-0.009507,0.01002,-0.007985,-0.008891,0.024466,0.000404,0.030166,0.020637,0.014987,0.015431


## Получим оценку по полу

In [20]:
%%time
x_train, x_test, y_train, y_test = train_test_split(\
    data.drop(['user_id', 'age', 'is_male'], axis = 1), data['is_male'], test_size = 0.25, random_state = SPLIT_SEED)


GINI по полу 0.672
CPU times: user 5min 14s, sys: 16.5 s, total: 5min 30s
Wall time: 45.2 s


In [None]:
%%time
clf = CatBoostClassifier()
clf.fit(x_train, y_train, verbose = False)
print(f'GINI по полу {2 * m.roc_auc_score(y_test, clf.predict_proba(x_test)[:,1]) - 1:2.3f}')

In [21]:
clf_log_reg = LogisticRegression(random_state=17)
clf_log_reg.fit(x_train, y_train)
print(f'GINI по полу {2 * m.roc_auc_score(y_test, clf_log_reg.predict_proba(x_test)[:,1]) - 1:2.3f}')

GINI по полу 0.182


In [22]:
clf_ran_forest = RandomForestClassifier(max_depth=2, random_state=17)
clf_ran_forest.fit(x_train, y_train)
print(f'GINI по полу {2 * m.roc_auc_score(y_test, clf_ran_forest.predict_proba(x_test)[:,1]) - 1:2.3f}')

GINI по полу 0.397


## Получим оценку по возрасту

In [9]:
def age_bucket(x):
    return bisect.bisect_right([18,25,35,45,55,65], x)

In [11]:
data['age'] = data['age'].map(str)
data = data[data['age'] != 'None']
data = data[data['age'] != 'NA']
data['age'] = data['age'].map(float)
data = data.fillna(0)
data['age'].value_counts()

34.0    10004
33.0     9948
35.0     9456
36.0     9401
37.0     9341
        ...  
84.0        9
83.0        5
91.0        3
86.0        1
88.0        1
Name: age, Length: 76, dtype: int64

In [12]:
data['age'] = data['age'].map(age_bucket)
# sns.histplot(df['age'], bins = 7)

In [13]:
data['age'].value_counts()

2    85212
3    84065
4    42083
1    25969
5    25420
6     6899
0      352
Name: age, dtype: int64

In [56]:
x_train, x_test, y_train, y_test = train_test_split(\
    data.drop(['user_id', 'age', 'is_male'], axis = 1), data['age'], test_size = 0.25, random_state = SPLIT_SEED)

clf = CatBoostClassifier()
clf.fit(x_train, y_train, verbose = False)
print(m.classification_report(y_test, clf.predict(x_test), \
                            target_names = ['<18', '18-25','25-34', '35-44', '45-54', '55-65', '65+']))

CatBoostError: Input data must have at least one feature

In [109]:
clf.fit(df.drop(['user_id', 'age', 'is_male'], axis = 1), df['age'], verbose = False)
id_to_submit['age'] = clf.predict(id_to_submit[['user_id']].merge(usr_emb, how = 'inner', on = ['user_id']))

## Сабмит

In [110]:
id_to_submit.head()

Unnamed: 0,user_id,is_male,age
0,6,0.330467,2
1,11,0.725477,5
2,19,0.24019,1
3,27,0.536798,2
4,32,0.471325,3


In [111]:
id_to_submit.to_csv(f'{LOCAL_DATA_PATH}/submission.csv', index = False)

In [112]:
! head $LOCAL_DATA_PATH/submission.csv

user_id,is_male,age
6,0.330467150589351,2
11,0.7254769930049977,5
19,0.24019020466489424,1
27,0.5367979653267113,2
32,0.4713251899911531,3
37,0.2810748555581949,2
43,0.6659790932425269,2
44,0.9189155263784968,1
46,0.5166941298660128,3


# Скор на лидерборде

In [151]:
context_scorer(submission, answers)

1.4715992278434493