In [1]:
import warnings
warnings.filterwarnings('ignore')
import pyarrow.parquet as pq
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

In [4]:
# --- Configuration ---------------------------------------------------------
SPLIT_SEED = 42
DATA_FILE = 'data_out/data_for_learn_parquet_last_3'
TARGET_FILE = 'data_in/public_train.pqt'

# --- Per-user feature table ------------------------------------------------
data = pq.read_table(DATA_FILE).to_pandas()

# Ratio of each day-part column to the overall `avg_sum_date_request_cnt`
# column; the new columns are appended in the order day, night, morning, evening.
total_avg_col = 'avg_sum_date_request_cnt'
for day_part in ('day', 'night', 'morning', 'evening'):
    part_avg_col = f'{day_part}_{total_avg_col}'
    data[f'{part_avg_col}/{total_avg_col}'] = data[part_avg_col] / data[total_avg_col]

# Replace missing values (including NaN ratios produced above) with 0.
# NOTE(review): this runs before the embedding merges below, so columns brought
# in by those left joins are not filled for user_ids absent from the embedding
# tables; infinite ratios (non-zero / 0) are not touched by fillna either.
data = data.fillna(0)

# --- User embeddings, left-joined on user_id -------------------------------
all_usr_emb = pq.read_table("data_in/all_usr_emb_f80_i40.parquet").to_pandas()
data = data.merge(all_usr_emb, on=['user_id'], how='left')
data_usr_emb = pq.read_table("data_in/data_usr_emb_f50_i40.parquet").to_pandas()
data = data.merge(data_usr_emb, on=['user_id'], how='left')

# --- Training targets -------------------------------------------------------
target = pq.read_table(TARGET_FILE).to_pandas()

In [5]:
def age_bucket(x):
    """Map an age in years to an ordinal age-group code.

    The groups are delimited by the edges 18, 25, 35, 45, 55 and 65, each edge
    belonging to the group that starts at it:

        0 -> below 18 (or missing), 1 -> [18, 25), 2 -> [25, 35),
        3 -> [35, 45), 4 -> [45, 55), 5 -> [55, 65), 6 -> 65 and above.

    Parameters
    ----------
    x : int or float
        Age in years.

    Returns
    -------
    int
        Group code in the range 0..6. NaN yields 0 so that a missing age ends
        up in the "no valid group" code instead of the oldest group.

    Raises
    ------
    TypeError
        If ``x`` cannot be ordered against the numeric edges (e.g. ``None``).
    """
    # NaN is the only value that is not equal to itself. Every comparison with
    # NaN is False, which would make bisect_right walk to the far right end and
    # report group 6, so it is handled explicitly.
    if x != x:
        return 0
    return bisect.bisect_right([18, 25, 35, 45, 55, 65], x)

# Attach the feature table to every labelled user.
data_t_age = target.merge(data, how='left', on=['user_id'])

# Parse the raw age labels. 'None' and 'NA' are the textual missing markers;
# a float NaN stringifies to 'nan', parses back to NaN and is removed by
# dropna() so it can never be bucketed as a real age.
age_text = data_t_age['age'].map(str)
age_years = age_text[~age_text.isin(['None', 'NA'])].map(float).dropna()

# Convert years to age-group codes; code 0 (below the first edge) is discarded.
age_group = age_years.map(age_bucket)
age_group = age_group[age_group != 0]

# Keep only rows with a usable age group. `assign` aligns on the index and
# returns a new frame, so no slice of the merged frame is mutated in place.
data_t_age = data_t_age.loc[age_group.index].assign(age=age_group)
print(data_t_age['age'].value_counts())

# Categorical features are the object-typed columns among the model inputs,
# i.e. after removing the identifier and both target columns.
cat_features = list(
    data_t_age.drop(columns=['user_id', 'age', 'is_male'])
    .select_dtypes(['object'])
    .columns
)
print(cat_features)

2    85212
3    84065
4    42083
1    25969
5    25420
6     6900
Name: age, dtype: int64
['top_1_url_sum_request_cnt', 'top_2_url_sum_request_cnt', 'top_3_url_sum_request_cnt', 'top_4_url_sum_request_cnt', 'top_5_url_sum_request_cnt', 'top_1_url_count_request_cnt', 'top_2_url_count_request_cnt', 'top_3_url_count_request_cnt', 'region_name', 'cpe_model_name']


In [6]:
# Hold out 25% of the labelled users. Model inputs are every column except the
# identifier and the two target columns; the label is the age-group code.
# NOTE(review): the split is not stratified on the age group.
x_train, x_test, y_train, y_test = train_test_split(
    data_t_age.drop(columns=['user_id', 'age', 'is_male']),
    data_t_age['age'],
    test_size=0.25,
    random_state=SPLIT_SEED,
)

In [10]:
# One-vs-all multiclass model over the six age-group codes 1..6.
clf = CatBoostClassifier(
    loss_function='MultiClassOneVsAll',
    eval_metric='TotalF1',
    classes_count=6,
    class_names=[1, 2, 3, 4, 5, 6],
    iterations=2000,
    learning_rate=0.05,
    early_stopping_rounds=100,
    random_seed=SPLIT_SEED,
    thread_count=3,
)

In [None]:
# NOTE(review): eval_set is the same (x_test, y_test) hold-out that the
# classification report is computed on, and the classifier is configured with
# early_stopping_rounds, so the reported scores are likely optimistic.
clf.fit(
    x_train,
    y_train,
    cat_features=cat_features,
    eval_set=(x_test, y_test),
    verbose=False,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [7]:
# Age-group codes and their display names. The names follow the bucket edges
# 18/25/35/45/55/65, where each edge opens the next group (so code 1 ends at 24
# and code 5 ends at 64).
age_group_ids = [1, 2, 3, 4, 5, 6]
age_group_names = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']

# `labels` is passed explicitly so every name stays attached to its own code
# even if some group is absent from both y_test and the predictions.
print(m.classification_report(
    y_test,
    clf.predict(x_test),
    labels=age_group_ids,
    target_names=age_group_names,
))

              precision    recall  f1-score   support

       18-25       0.53      0.22      0.31      6596
       25-34       0.50      0.62      0.55     21118
       35-44       0.42      0.62      0.50     21105
       45-54       0.38      0.15      0.21     10507
       55-65       0.41      0.22      0.29      6375
         65+       0.45      0.02      0.03      1712

    accuracy                           0.45     67413
   macro avg       0.45      0.31      0.32     67413
weighted avg       0.45      0.45      0.42     67413



In [8]:
# Persist the feature-importance table and the trained model.
importance_table = clf.get_feature_importance(prettified=True)
importance_table.to_csv('data_out/clf_age_importance.csv', index=False)
clf.save_model('catboost_clf_age.cbm')

In [12]:
# Users to score, joined with the same feature table used for training.
submit = pq.read_table('data_in/submit_2.pqt').to_pandas()
submit_predict_data = submit.merge(data, how='left', on=['user_id'])

# Predictions are written back to `submit` by position, so the join must not
# have duplicated or dropped any row.
assert len(submit_predict_data) == len(submit), \
    "merge with the feature table changed the number of submission rows"

# Select exactly the columns the model was fitted on, in the fitted order,
# rather than assuming `submit` contributes nothing but user_id. The prediction
# array is flattened to one value per row before being stored as a column.
submit['age'] = clf.predict(submit_predict_data[clf.feature_names_]).ravel()

# Combine with the previously produced is_male predictions.
submit_is_male = pd.read_csv('data_out/male_submission.csv')
samp_submit = submit_is_male.merge(submit, how='left', on=['user_id'])
samp_submit.head()

Unnamed: 0,user_id,is_male,age
0,221301,0.995107,2
1,31271,0.58021,3
2,211594,0.19488,3
3,253119,0.39269,2
4,192578,0.965791,2


In [13]:
# Write the combined submission, overwriting any existing file.
# NOTE(review): this is written under data_in/, unlike the other outputs of
# this notebook, which go to data_out/ — confirm the location is intended.
samp_submit.to_csv("data_in/sample_submission.csv", index=False)