In [3]:
import warnings
warnings.filterwarnings('ignore')
import pyarrow.parquet as pq
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split
import pandas as pd

In [4]:
SPLIT_SEED = 16
DATA_FILE = 'data_out/data_for_learn_parquet_last_1'
TARGET_FILE = 'data_in/public_train.pqt'

data = pq.read_table(f"{DATA_FILE}").to_pandas()


data['day_avg_sum_date_request_cnt/avg_sum_date_request_cnt'] = data['day_avg_sum_date_request_cnt'] / data['avg_sum_date_request_cnt']
data['night_avg_sum_date_request_cnt/avg_sum_date_request_cnt'] = data['night_avg_sum_date_request_cnt'] / data['avg_sum_date_request_cnt']
data['morning_avg_sum_date_request_cnt/avg_sum_date_request_cnt'] = data['morning_avg_sum_date_request_cnt'] / data['avg_sum_date_request_cnt']
data['evening_avg_sum_date_request_cnt/avg_sum_date_request_cnt'] = data['evening_avg_sum_date_request_cnt'] / data['avg_sum_date_request_cnt']

data = data.fillna(0)

all_usr_emb = pq.read_table(f"data_in/sum50_count30_countdate30_regionsumreq30_usr_emb_f_i50.parquet").to_pandas()
data = data.merge(all_usr_emb, how = 'left', on = ['user_id'])

data = data.fillna(0)

In [5]:
target = pq.read_table(f"{TARGET_FILE}").to_pandas()


def age_bucket(x):
    return bisect.bisect_right([19,26,36,46,56,66], x)

data_t_age = target.merge(data, how = 'left', on = ['user_id'])
data_t_age['age'] = data_t_age['age'].map(str)
data_t_age = data_t_age[data_t_age['age'] != 'None']
data_t_age = data_t_age[data_t_age['age'] != 'NA']
data_t_age['age'] = data_t_age['age'].map(float)
data_t_age['age'] = data_t_age['age'].map(age_bucket)
data_t_age = data_t_age[data_t_age['age'] != 0]
print(data_t_age['age'].value_counts())
cat_features = list(data_t_age.drop('is_male', axis = 1).select_dtypes(['object']).columns)
print(cat_features)
data_t_age = data_t_age.loc[~data_t_age['user_id'].isin([155670, 327408, 28719, 330397, 273762, 78276, 188466, 220333, 265327, 406073])]
data_t_age = data_t_age.loc[data_t_age['price'] != 0]
print(data_t_age['age'].value_counts())

2    87270
3    77486
4    42442
1    32641
5    23580
6     5504
Name: age, dtype: int64
['top_1_url_sum_request_cnt', 'top_2_url_sum_request_cnt', 'top_3_url_sum_request_cnt', 'top_4_url_sum_request_cnt', 'top_5_url_sum_request_cnt', 'top_1_url_count_request_cnt', 'top_2_url_count_request_cnt', 'top_3_url_count_request_cnt', 'region_name', 'cpe_model_name']
2    85482
3    75239
4    41021
1    32183
5    22771
6     5320
Name: age, dtype: int64


In [4]:
# X = data_t_age.drop(['user_id', 'age', 'is_male'], axis = 1)
# y = data_t_age['age']

x_train, x_test, y_train, y_test = train_test_split(
    data_t_age.drop(['user_id', 'age', 'is_male'], axis = 1)
        , data_t_age['age'], test_size = 0.25, random_state = SPLIT_SEED, stratify = data_t_age['age'])

In [37]:
clf = CatBoostClassifier(thread_count = 6
                         , iterations = 5000
                         , random_seed = SPLIT_SEED, learning_rate = 0.01, early_stopping_rounds = 100
                         , eval_metric = 'TotalF1'
                         , loss_function = 'MultiClassOneVsAll' # MultiClassOneVsAll
                         , classes_count = 6
                         , class_names = [1, 2, 3, 4, 5, 6]
                         , use_best_model = True
#                          , auto_class_weights = 'SqrtBalanced'
#                          , bootstrap_type = 'Bernoulli'
#                          , depth = 8
#                          , max_ctr_complexity = 8
#                          , grow_policy = "Depthwise"  # [SymmetricTree,Lossguide,Depthwise], [default=SymmetricTree]
                        )

In [10]:
clf.fit(x_train, y_train, verbose = False, eval_set=(x_test, y_test), categorical_feature=cat_features)

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: top_1_url_sum_request_cnt, top_2_url_sum_request_cnt, top_3_url_sum_request_cnt, top_4_url_sum_request_cnt, top_5_url_sum_request_cnt, top_1_url_count_request_cnt, top_2_url_count_request_cnt, top_3_url_count_request_cnt, region_name, cpe_model_name

In [39]:
print(m.classification_report(y_test, clf.predict(x_test), \
                            target_names = ['19-25','26-35', '36-45', '46-55', '56-65', '66+']))

              precision    recall  f1-score   support

       19-25       0.55      0.32      0.40      8046
       26-35       0.49      0.66      0.56     21370
       36-45       0.39      0.52      0.45     18810
       46-55       0.38      0.18      0.24     10255
       56-65       0.43      0.17      0.25      5693
         66+       0.80      0.00      0.01      1330

    accuracy                           0.45     65504
   macro avg       0.51      0.31      0.32     65504
weighted avg       0.45      0.45      0.42     65504



In [40]:
# clf.get_feature_importance(prettified=True).to_csv(f'data_out/clf_age_importance.csv'
#                                                            , index = False, mode='w')
clf.save_model('catboost_clf_age_MultiClassOneVsAll_001.cbm')

In [7]:
clf = CatBoostClassifier()
clf.load_model("catboost_clf_age_MultiClass_Bernoulli_001.cbm")

<catboost.core.CatBoostClassifier at 0x7efd2ea453d0>

In [8]:
submit = pq.read_table('data_in/submit_2.pqt').to_pandas()
submit_predict_data = submit.merge(data, how = 'left', on = ['user_id'])

submit['age'] = clf.predict(submit_predict_data.drop(['user_id'], axis = 1))
submit_is_male = pd.read_csv('data_out/male_submission.csv')
samp_submit = submit_is_male.merge(submit, how = 'left', on = ['user_id'])
samp_submit.head()

Unnamed: 0,user_id,is_male,age
0,221301,0.995107,2
1,31271,0.58021,3
2,211594,0.19488,2
3,253119,0.39269,2
4,192578,0.965791,2


In [9]:
samp_submit.to_csv("data_in/sample_submission.csv", index = False, mode='w')