In [1]:
import sys
import os
import datetime
import warnings
os.environ['OPENBLAS_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq
import pyarrow as pa
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

In [3]:
LOCAL_DATA_PATH = './context_data/'
SPLIT_SEED = 42
DATA_FILE = 'competition_data_final_pqt'
TARGET_FILE = 'public_train.pqt'
SUBMISSION_FILE = 'submit.pqt'

In [4]:
id_to_submit = pq.read_table(f'{LOCAL_DATA_PATH}/{SUBMISSION_FILE}').to_pandas()

In [5]:
# pqf = pq.ParquetFile(f'{LOCAL_DATA_PATH}/{DATA_FILE}')
pqd = pq.ParquetDataset(f'{LOCAL_DATA_PATH}/{DATA_FILE}')
pqd.files

['./context_data//competition_data_final_pqt/part-00000-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet',
 './context_data//competition_data_final_pqt/part-00001-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet',
 './context_data//competition_data_final_pqt/part-00002-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet',
 './context_data//competition_data_final_pqt/part-00003-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet',
 './context_data//competition_data_final_pqt/part-00004-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet',
 './context_data//competition_data_final_pqt/part-00005-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet',
 './context_data//competition_data_final_pqt/part-00006-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet',
 './context_data//competition_data_final_pqt/part-00007-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet',
 './context_data//competition_data_final_pqt/part-00008-aba60f69-2b63-4cc1-95ca-

In [None]:
data_agg = None
for ifile in pqd.files :

    #data = pq.read_table(f'{LOCAL_DATA_PATH}/{DATA_FILE}/{ifile}')
    data = pq.read_table(f'{ifile}')
    #pd.DataFrame([(z.name, z.type) for z in data.schema], columns = [['field', 'type']])

    data_agg0 = data.select(['user_id', 'url_host', 'request_cnt']).\
        group_by(['user_id', 'url_host']).aggregate([('request_cnt', "sum"),
                                                     ('request_cnt', "max"),
                                                     ('request_cnt', "mean"),])
    
    data_agg1 = data.select(['user_id','date', 'part_of_day', 'request_cnt']).\
        group_by(['user_id', 'date', 'part_of_day']).aggregate([('request_cnt', "sum"),
                                                                ('request_cnt', "max"),
                                                                ('request_cnt', "mean")])
    
    data_agg2 = data.select(['user_id', 'city_name', 'region_name', 'cpe_model_os_type', 'price','request_cnt', \
                            'cpe_manufacturer_name','cpe_model_name','cpe_type_cd']).\
        group_by(['user_id', ]).\
        aggregate([('cpe_model_os_type', "max"), ('cpe_manufacturer_name','max'),\
                   ('cpe_model_name','max'),('cpe_type_cd','max'), \
                   ('price', "max"), ('city_name', "max"), \
                   ('region_name', "max"),('request_cnt',"mean"),('request_cnt',"sum")])
    
    if data_agg is None :
        data_agg  = data_agg0
        data_aggB = data_agg1
        data_aggZ = data_agg2
        print(data.to_pandas().head())
    else :
        data_agg  = pa.concat_tables([data_agg, data_agg0],promote=False)
        data_aggB = pa.concat_tables([data_aggB,data_agg1],promote=False)
        data_aggZ = pa.concat_tables([data_aggZ,data_agg2],promote=False)
    
    del data, data_agg0, data_agg1, data_agg2
    
data_agg  = data_agg.to_pandas()
data_aggB = data_aggB.to_pandas()
data_aggZ = data_aggZ.to_pandas().fillna(0)

Регион   
, населенный пункт   
, производиель устройства   
, модель устроства   
, домен, с которого пришел рекламный запрос   
, тип устройства (смартфон или что-то другое)   
, операционка на устройстве   
, оценка цены устройства   
, дата   
, время дня (утро, вечер ...)   
, число запросов   
, id пользователя   

In [None]:
data_agg.info(), data_aggB.info(), data_aggZ.info()

In [None]:
data_agg.columns, data_aggB.columns, data_aggZ.columns

In [None]:
n_urls = 450 #500

urls_hosts_max=data_agg[['url_host','user_id']].groupby(['url_host']).size().reset_index()
                #sort_values('user_id', axis=0, ascending=False)[:10].reset_index()
urls_hosts_max.columns = ['url_host','size']
url_hosts_max = urls_hosts_max.sort_values('size', ascending=False).head(n_urls).url_host
#url_hosts_max.to_list()

In [None]:
data_temp = data_agg[~data_agg['url_host'].isin(url_hosts_max.to_list())]
data_aggY = pd.concat([data_temp, \
      data_agg[data_agg['user_id']. \
               isin(list(set(data_agg.user_id.unique())-set(data_temp.user_id.unique())))]], \
      ignore_index=True)
data_aggY.shape, data_aggY.head()

In [None]:
data_agg.shape, data_aggY.shape, len(data_agg['user_id'].unique()), len(data_aggZ.user_id.unique())

In [None]:
if 1 :
    
    data_aggC = data_aggZ[['user_id','cpe_model_os_type_max', 'cpe_manufacturer_name_max', \
                           'cpe_model_name_max', 'cpe_type_cd_max', \
                           'region_name_max','city_name_max','price_max']]
    
    cat_features  = ['cpe_model_os_type_max', 'cpe_manufacturer_name_max', \
                     'cpe_model_name_max', 'cpe_type_cd_max', \
                     'region_name_max','city_name_max',]
    
    
    data_aggD = pd.pivot_table(data_aggB[['user_id','part_of_day','request_cnt_sum']],
                                index='user_id', columns='part_of_day', values='request_cnt_sum', 
                                aggfunc=np.mean, fill_value=0).reset_index()
    data_aggC = data_aggC.merge(data_aggD,how='inner',on=['user_id'])
    
    data_aggB['DW'] = 'DW' + pd.to_datetime(data_aggB.date).dt.day_of_week.astype(str)
    data_aggD = pd.pivot_table(data_aggB[['user_id','DW','request_cnt_sum']],
                                index='user_id', columns='DW', values='request_cnt_sum', 
                                aggfunc=np.mean, fill_value=0).reset_index()
    data_aggC = data_aggC.merge(data_aggD,how='inner',on=['user_id'])

    data_aggB['DW'] = 'DWPD'+pd.to_datetime(data_aggB.date).dt.day_of_week.astype(str)+'.'+data_aggB.part_of_day
    data_aggD = pd.pivot_table(data_aggB[['user_id','DW','request_cnt_sum']],
                                index='user_id', columns='DW', values='request_cnt_sum', 
                                aggfunc=np.mean, fill_value=0).reset_index()
    data_aggC = data_aggC.merge(data_aggD,how='inner',on=['user_id'])
    
    if 0 :
        data_a = data_aggB[['user_id','date','request_cnt_sum']].groupby(['user_id','date']).sum().reset_index()
        data_a = data_a[['user_id','request_cnt_sum']].groupby('user_id').\
                    agg([np.sum,np.mean,np.median,np.min,np.max,np.std]).fillna(0).reset_index()
        data_a.columns = [a+('' if len(b)==0 else '_date_')+b for a,b in data_a.columns];
        data_aggC = data_aggC.merge(data_a,how='inner',on=['user_id'])      
    

In [None]:
#
#  tf-idf
#
if 1 : # tf-idf
    
    #data_a = data_agg[['user_id','url_host']].groupby(['user_id']).size().reset_index()
    data_a = data_agg[['user_id','request_cnt_sum',]].groupby('user_id').sum().reset_index()
    data_a.columns=['user_id','summ']
    data_tf = data_agg.merge(data_a, how='left', on='user_id')
    data_tf['tf'] = data_tf.request_cnt_sum/data_tf.summ
    data_tf       = data_tf[['user_id','url_host','tf']]
    
    if 0 :
        print('tf')
        print(data_a.shape, data_tf.shape, data_aggY.shape)
        print(data_a.head())
        print(data_tf.head())
    
    data_a = data_agg[['url_host','user_id',]].groupby('url_host').size().reset_index()
    data_a.columns=['url_host','sizee']
    user_id_len = len(data_agg.user_id.unique())
    
    data_idf = data_a
    data_idf['idf'] = np.log(user_id_len/data_a.sizee)
    
    if 0 :
        print('idf')
        print(data_a.shape, data_idf.shape, data_agg.shape)
        print(data_a.head())
        print(data_idf.head())
    
    data_tf_idf = data_tf.merge(data_idf[['url_host','idf']], how='left', on='url_host')
    data_tf_idf['tf_idf'] = data_tf_idf.tf*data_tf_idf.idf
    
    if 0 :
        print('idf')
        print(data_tf_idf.shape)
        print(data_tf_idf.head())
    
    del data_tf, data_idf
    del data_a
    
    print(data_tf_idf.shape)

In [None]:
data_aggC.shape, data_aggC.columns

In [None]:
temp_old = data_aggC.copy()
temp = data_aggC.describe();
temp

In [None]:
data_aggC = temp_old.copy()

In [None]:
ccolumns = [cc for cc in data_aggC.columns if cc not in cat_features and cc != 'user_id']
ttemp = (data_aggC.loc[:,ccolumns]/temp.loc['mean',ccolumns]) #/temp.loc['std',ccolumns]
data_aggC[ccolumns]=ttemp[ccolumns]

In [None]:
ttemp.describe()
data_aggC.describe()

In [None]:

if 1 :
    data_aggC['A']=data_aggC.day+data_aggC.evening+data_aggC.morning+data_aggC.night
    data_aggC['A000']=data_aggC.day/data_aggC.A
    data_aggC['A001']=data_aggC.evening/data_aggC.A
    data_aggC['A002']=data_aggC.morning/data_aggC.A
    data_aggC['A003']=data_aggC.night/data_aggC.A
    
if 1 :
    B000m=data_aggC.day.mean()
    B001m=data_aggC.evening.mean()
    B002m=data_aggC.morning.mean()
    B003m=data_aggC.night.mean()
    data_aggC['B000']=data_aggC.day/B000m
    data_aggC['B001']=data_aggC.evening/B001m
    data_aggC['B002']=data_aggC.morning/B002m
    data_aggC['B003']=data_aggC.night/B003m
    #data_aggC = data_aggC.drop(['day','morning','evening'],axis=1)
        
if 1 :
    DWA0  =data_aggC.DW0.median()
    DWA1  =data_aggC.DW1.median()
    DWA2  =data_aggC.DW2.median()
    DWA3  =data_aggC.DW3.median()
    DWA4  =data_aggC.DW4.median()
    DWA5  =data_aggC.DW5.median()
    DWA6  =data_aggC.DW6.median()
    data_aggC['DWB000']=data_aggC.DW0/DWA0
    data_aggC['DWB001']=data_aggC.DW1/DWA1
    data_aggC['DWB002']=data_aggC.DW2/DWA2
    data_aggC['DWB003']=data_aggC.DW3/DWA3
    data_aggC['DWB004']=data_aggC.DW4/DWA4
    data_aggC['DWB005']=data_aggC.DW5/DWA5
    data_aggC['DWB006']=data_aggC.DW6/DWA6
    
if 1 :
    data_aggC['DWA']=data_aggC.DW0+data_aggC.DW1+data_aggC.DW2+data_aggC.DW3+ \
                        data_aggC.DW4+data_aggC.DW5+data_aggC.DW6
    data_aggC['DW000']=data_aggC.DW0/data_aggC.DWA
    data_aggC['DW001']=data_aggC.DW1/data_aggC.DWA
    data_aggC['DW002']=data_aggC.DW2/data_aggC.DWA
    data_aggC['DW003']=data_aggC.DW3/data_aggC.DWA
    data_aggC['DW004']=data_aggC.DW4/data_aggC.DWA
    data_aggC['DW005']=data_aggC.DW5/data_aggC.DWA
    data_aggC['DW006']=data_aggC.DW6/data_aggC.DWA
    
data_aggC.shape, data_aggC.columns

In [None]:
data_aggC.shape

In [None]:
if 0 :
    import matplotlib.pyplot as mp
    if 0 :
        mp.hist(data_aggC.day, 100)
        mp.hist(data_aggC.morning, 100)
        mp.hist(data_aggC.evening, 100)
        #mp.hist(data_aggC.A003, 100)
        #mp.hist(data_aggC.DWB004, 100)

        mp.show()

        mp.hist(data_aggC.price_max,100)
        mp.show()

    mp.plot(data_aggB[['user_id','request_cnt_sum']].groupby(['user_id'],sort=True).sem())
    mp.show()
    
if 0 :    
    aa1 = data_aggB[['user_id','date','request_cnt_sum']].groupby(['user_id','date']).sum().reset_index()
    aa2 = aa1[['user_id','request_cnt_sum']].groupby('user_id').\
                agg([np.sum,np.mean,np.median,np.min,np.max,np.std]).fillna(0).reset_index()

In [None]:
targets = pq.read_table(f'{LOCAL_DATA_PATH}/{TARGET_FILE}')
pd.DataFrame([(z.name, z.type) for z in targets.schema], columns = [['field', 'type']])

In [None]:
import faiss

def set1 (items) :
    item_set = set(items)
    #print(f'{len(items)} items')
    item_dict = {item: iditem for item, iditem in zip(item_set, range(len(item_set)))}
    items_new = np.array(items.map(item_dict))
    return(item_dict, items_new)
    
def alsals (values, rows, cols, factors=50, iterations=30, clusters=400) :
    
    #values = np.array(data_agg['request_cnt_sum'])
    #rows   = np.array(data_agg['user_id'].map(usr_dict))
    #cols   = np.array(data_agg['url_host'].map(url_dict))
    mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))
    als = implicit.approximate_als.FaissAlternatingLeastSquares(factors = factors, iterations = iterations, \
           use_gpu = False, nlist=clusters, \
           calculate_training_loss = False, regularization = 0.1)
           #calculate_training_loss = False, regularization = 0.1)
    als.fit(mat)
    u_factors = als.model.user_factors 
    d_factors = als.model.item_factors
    
    return u_factors

def doMatrix (users, items, values, clusters=400, factors=50, iterations=30) :
    users_dict, users_new = set1(users)
    items_dict, items_new = set1(items)

    uu = alsals(values,users_new,items_new, clusters=clusters, factors=factors, iterations=iterations)
    uu = pd.DataFrame(uu)

    inv_users_dict = {v: k for k, v in users_dict.items()}
    uu['user_id'] = uu.index.map(inv_users_dict)
    return uu

In [43]:
%%time

if 1 :
    u12= doMatrix(pd.concat([#data_agg.user_id, \
                         data_agg.user_id, \
                         #data_tf_idf.user_id, \
                         #data_tf_idf.user_id, \
                         #data_tf_idf.user_id, \
                         #data_aggB.user_id, \
                         #data_aggB.user_id, \
                        ], ignore_index=True),
              pd.concat([#data_agg.url_host,\
                         data_agg.url_host,\
                         #data_tf_idf.url_host,\
                         #data_tf_idf.url_host,\
                         #data_tf_idf.url_host, \
                         #data_aggB.part_of_day, \
                         #data_aggB.part_of_day, \
                         #data_aggB.DW,
                         #data_aggB.DW,
                        ], ignore_index=True),\
              pd.concat([#data_agg.request_cnt_sum,\
                         data_agg.request_cnt_mean/np.mean(data_agg.request_cnt_mean)* \
                                  np.mean(data_agg.request_cnt_sum), \
                         #data_tf_idf.tf_idf/np.mean(data_tf_idf.tf_idf)*np.mean(data_agg.request_cnt_sum), \
                         #data_tf_idf.tf/np.mean(data_tf_idf.tf)*np.mean(data_agg.request_cnt_sum), \
                         #data_tf_idf.idf/np.mean(data_tf_idf.idf)*np.mean(data_agg.request_cnt_sum), \
                         #data_aggB.request_cnt_sum/np.mean(data_aggB.request_cnt_sum)* \
                         #  np.mean(data_agg.request_cnt_sum), \
                         #data_aggB.request_cnt_mean/np.mean(data_aggB.request_cnt_mean)* \
                         #  np.mean(data_agg.request_cnt_sum), \
                        ], ignore_index=True),
              #clusters=400, factors=50)
              #clusters=400, factors=350, iterations=30)     
              #clusters=400, factors=550, iterations=30)
              #clusters=400, factors=3000, iterations=30)
              #clusters=400, factors=150, iterations=30)
              clusters=400, factors=250, iterations=50)

if 1 :
    u12x1= doMatrix(pd.concat([#data_agg.user_id, \
                         #data_agg.user_id, \
                         data_tf_idf.user_id, \
                         data_tf_idf.user_id, \
                         #data_tf_idf.user_id, \
                         #data_aggB.user_id, \
                         #data_aggB.user_id, \
                        ], ignore_index=True),
              pd.concat([#data_agg.url_host,\
                         #data_agg.url_host,\
                         data_tf_idf.url_host,\
                         data_tf_idf.url_host,\
                         #data_tf_idf.url_host, \
                         #data_aggB.part_of_day, \
                         #data_aggB.part_of_day, \
                         #data_aggB.DW,
                         #data_aggB.DW,
                        ], ignore_index=True),\
              pd.concat([#data_agg.request_cnt_sum,\
                         #data_agg.request_cnt_mean/np.mean(data_agg.request_cnt_mean)* \
                         #         np.mean(data_agg.request_cnt_sum), \
                         data_tf_idf.tf_idf/np.mean(data_tf_idf.tf_idf)*np.mean(data_agg.request_cnt_sum), \
                         data_tf_idf.tf/np.mean(data_tf_idf.tf)*np.mean(data_agg.request_cnt_sum), \
                         #data_tf_idf.idf/np.mean(data_tf_idf.idf)*np.mean(data_agg.request_cnt_sum), \
                         #data_aggB.request_cnt_sum/np.mean(data_aggB.request_cnt_sum)* \
                         #  np.mean(data_agg.request_cnt_sum), \
                         #data_aggB.request_cnt_mean/np.mean(data_aggB.request_cnt_mean)* \
                         #  np.mean(data_agg.request_cnt_sum), \
                        ], ignore_index=True),
              #clusters=400, factors=50)
              #clusters=400, factors=350, iterations=30)     
              #clusters=400, factors=550, iterations=30)
              #clusters=400, factors=3000, iterations=30)
              #clusters=400, factors=150, iterations=30)
              clusters=400, factors=250, iterations=50)

data_aggB['DW'] = 'DW' + pd.to_datetime(data_aggB.date).dt.day_of_week.astype(str) + '.' + data_aggB.part_of_day
u12x2= doMatrix(pd.concat([#data_agg.user_id, \
                         #data_agg.user_id, \
                         #data_tf_idf.user_id, \
                         #data_tf_idf.user_id, \
                         #data_tf_idf.user_id, \
                         #data_aggB.user_id, \
                         data_aggB.user_id, \
                        ], ignore_index=True),
              pd.concat([#data_agg.url_host,\
                         #data_agg.url_host,\
                         #data_tf_idf.url_host,\
                         #data_tf_idf.url_host,\
                         #data_tf_idf.url_host, \
                         #data_aggB.part_of_day, \
                         #data_aggB.part_of_day, \
                         #data_aggB.DW,
                         data_aggB.DW,
                        ], ignore_index=True),\
              pd.concat([#data_agg.request_cnt_sum,\
                         #data_agg.request_cnt_mean/np.mean(data_agg.request_cnt_mean)* \
                         #         np.mean(data_agg.request_cnt_sum), \
                         #data_tf_idf.tf_idf/np.mean(data_tf_idf.tf_idf)*np.mean(data_agg.request_cnt_sum), \
                         #data_tf_idf.tf/np.mean(data_tf_idf.tf)*np.mean(data_agg.request_cnt_sum), \
                         #data_tf_idf.idf/np.mean(data_tf_idf.idf)*np.mean(data_agg.request_cnt_sum), \
                         #data_aggB.request_cnt_sum/np.mean(data_aggB.request_cnt_sum)* \
                         #  np.mean(data_agg.request_cnt_sum), \
                         data_aggB.request_cnt_mean/np.mean(data_aggB.request_cnt_mean)* \
                           np.mean(data_agg.request_cnt_sum), \
                        ], ignore_index=True),
              clusters=10, factors=150, iterations=50)



  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]



CPU times: user 3h 15min 11s, sys: 3min 37s, total: 3h 18min 48s
Wall time: 27min 5s


In [44]:
u12.shape

(415317, 251)

In [45]:
#data_tf_idf.tf_idf.describe(), data_agg.request_cnt_sum.describe(), \
#data_agg.request_cnt_mean.describe(), data_aggB.request_cnt_sum.describe(), \
#data_aggB.request_cnt_mean.describe()

## Получим оценку по полу

In [46]:
%%time

usr_emb = u12
usr_emb = usr_emb.merge(u12x1,     how = 'inner', on = ['user_id'], suffixes=('_x1','_x1x1'))
usr_emb = usr_emb.merge(u12x2,     how = 'inner', on = ['user_id'], suffixes=('_x1','_x1x2'))
usr_emb = usr_emb.merge(data_aggC, how = 'inner', on = ['user_id'], suffixes=('_u12','_aggC'))

cat_features  = ['cpe_model_os_type_max', 'cpe_manufacturer_name_max', \
                 'cpe_model_name_max', 'cpe_type_cd_max', \
                 'region_name_max','city_name_max',]

#usr_emb[cat_features] = usr_emb[cat_features].fillna(' ')
#usr_emb = usr_emb.fillna(-1)

usr_targets = targets.to_pandas()
df = usr_targets.merge(usr_emb,   how = 'inner', on = ['user_id'])
df = df[df['is_male'] != 'NA']
df = df.dropna()
df['is_male'] = df['is_male'].map(int)
df['is_male'].value_counts()

CPU times: user 7.29 s, sys: 1.34 s, total: 8.63 s
Wall time: 8.63 s


1    135331
0    128994
Name: is_male, dtype: int64

In [47]:
#df1 = df
#df = df.drop('price_max',axis=1)
df.shape, cat_features #,cat_feature_

((264325, 723),
 ['cpe_model_os_type_max',
  'cpe_manufacturer_name_max',
  'cpe_model_name_max',
  'cpe_type_cd_max',
  'region_name_max',
  'city_name_max'])

In [48]:
%%time
print(datetime.datetime.now())
x_train, x_test, y_train, y_test = train_test_split(\
    df.drop(['user_id', 'age', 'is_male'], axis = 1), df['is_male'], test_size = 0.15, random_state = SPLIT_SEED)

clf1 = CatBoostClassifier(iterations=3500, early_stopping_rounds=25, cat_features=cat_features, 
                          one_hot_max_size=10)

clf1.fit(x_train, y_train, verbose = False, eval_set=(x_test,y_test), use_best_model=True, plot=True)
print(f'GINI по полу {2 * m.roc_auc_score(y_test, clf1.predict_proba(x_test)[:,1]) - 1:2.3f}')

2023-03-05 14:52:46.494114


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

GINI по полу 0.735
CPU times: user 1h 18min 21s, sys: 1min 3s, total: 1h 19min 25s
Wall time: 10min 32s


In [49]:
#clf1.get_all_params()

In [50]:
%%time
print(datetime.datetime.now())
#clf.fit(df.drop(['user_id', 'age', 'is_male'], axis = 1), df['is_male'], verbose = False)
id_to_submit['is_male'] = clf1.predict_proba(id_to_submit[['user_id']].merge(usr_emb, how = 'inner', on = ['user_id']).
                                            drop(['user_id'], axis=1))[:,1]

2023-03-05 15:04:20.281588
CPU times: user 2.95 s, sys: 643 ms, total: 3.59 s
Wall time: 1.97 s


## Получим оценку по возрасту

In [None]:
%%time

u20 = doMatrix(pd.concat([data_aggY.user_id, \
                         data_aggY.user_id, \
                         data_tf_idf.user_id, \
                         data_tf_idf.user_id, \
                        ], ignore_index=True),
              pd.concat([data_aggY.url_host,\
                         data_aggY.url_host,\
                         data_tf_idf.url_host,\
                         data_tf_idf.url_host,\
                        ], ignore_index=True),\
              pd.concat([data_aggY.request_cnt_sum,\
                         data_aggY.request_cnt_mean/np.mean(data_aggY.request_cnt_mean)* \
                                  np.mean(data_aggY.request_cnt_sum), \
                         data_tf_idf.tf_idf/np.mean(data_tf_idf.tf_idf)*np.mean(data_aggY.request_cnt_sum), \
                         data_tf_idf.tf/np.mean(data_tf_idf.tf)*np.mean(data_aggY.request_cnt_sum), \
                        ], ignore_index=True),
              clusters=400, factors=350, iterations=50)

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
u20.shape

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

%matplotlib inline
sns.set_style('darkgrid')

In [None]:
def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

In [None]:
usr_embx2 = u20
usr_embx2 = usr_embx2.merge(data_aggC, how = 'inner', on = ['user_id'], suffixes=('_u12','_aggC'))

df = usr_targets.merge(usr_embx2, how = 'inner', on = ['user_id'])
df = df[df['age'] != 'NA']
df = df.dropna()
df['age'] = df['age'].map(age_bucket)
sns.histplot(df['age'], bins = 7)
print(usr_embx2.shape,df.shape)

In [None]:
%%time
print(datetime.datetime.now())
x_train, x_test, y_train, y_test = train_test_split(\
    #df.drop(['user_id', 'age', 'is_male'], axis = 1), df['age'], test_size = 0.33, random_state = SPLIT_SEED)
    df.drop(['user_id', 'age', 'is_male'], axis = 1), df['age'], test_size = 0.15, random_state = SPLIT_SEED)

clf = CatBoostClassifier(iterations=3500, early_stopping_rounds=25, one_hot_max_size=10, cat_features=cat_features)
#clf.fit(x_train, y_train, verbose = False)
clf.fit(x_train, y_train, verbose = False, eval_set=(x_test,y_test), use_best_model=True,plot=True)
print(m.classification_report(y_test, clf.predict(x_test), \
                            target_names = ['<18', '18-25','25-34', '35-44', '45-54', '55-65', '65+']))

f1 0.01 0.38 0.55 0.44 0.27 0.27 0.03 - 0.44 0.28 0.41 (1.3164)
f1 0.00 0.39 0.55 0.44 0.27 0.27 0.04 - 0.44 0.28 0.42 (1.3110) (2*aggY    + tf)
f1 0.00 0.35 0.53 0.43 0.25 0.26 0.02 - 0.42 0.26 0.40 (1.3396) (aggY+agg  + tf + idf)
f1 0.00 0.38 0.54 0.44 0.27 0.27 0.03 - 0.44 0.28 0.41 (1.3145) (aggY+aggY + tf + idf) -400urls
f1 0.01 0.39 0.55 0.45 0.27 0.27 0.03 - 0.44 0.28 0.42 (1.3078) (aggY+aggY + tf + idf) -600urls
f1 0.00 0.40 0.55 0.45 0.27 0.28 0.04 - 0.44 0.28 0.42 (1.3073) (aggY+aggY + tf + idf) -500urls
f1 0.01 0.40 0.55 0.45 0.27 0.28 0.02 - 0.44 0.28 0.42 (1.3068) (aggY+aggY + tf + idf) -450urls

f1 0.01 0.41 0.56 0.46 0.28 0.30 0.04 - 0.45 0.29 0.43 (1.2834(1901)) (aggY+aggY + tf + idf) -450urls + aggC
f1 0.02 0.42 0.56 0.47 0.30 0.30 0.03 - 0.46 0.30 0.44 (1.2750(2072)) (aggY+aggY+tf+idf) -450urls +aggC (250.30)


f1 0.01 0.42 0.56 0.47 0.30 0.30 0.04 - 0.46 0.30 0.44 (als = 250.30 without)
pr 0.25 0.53 0.50 0.42 0.39 0.41 0.41 - -    0.42 0.45
re 0.01 0.34 0.64 0.52 0.24 0.24 0.02 - -    0.29 0.46

In [None]:
#clf.get_all_params()
df.shape

In [None]:
%%time
print(datetime.datetime.now())
#clf.fit(df.drop(['user_id', 'age', 'is_male'], axis = 1), df['age'], verbose = False)
id_to_submit['age'] = clf.predict(id_to_submit[['user_id']].merge(usr_embx2, how = 'inner', on = ['user_id']).
                                                            drop(['user_id'], axis=1))

In [None]:
df.shape

## Сабмит

In [383]:
id_to_submit.head()

Unnamed: 0,user_id,is_male,age
221301,221301,0.976589,2
31271,31271,0.526905,3
211594,211594,0.670681,2
253119,253119,0.464538,3
192578,192578,0.83867,3


In [384]:
id_to_submit.to_csv(f'{LOCAL_DATA_PATH}/submission32.csv', index = False)

In [385]:
! head $LOCAL_DATA_PATH/submission32.csv

user_id,is_male,age
221301,0.9765885856661181,2
31271,0.5269049944238794,3
211594,0.6706807535502721,2
253119,0.46453768313247373,3
192578,0.8386696180028068,3
268458,0.17333877931700292,2
205507,0.5256870338017409,2
341343,0.35522474640614765,3
282144,0.0801721641754244,2


# Скор на лидерборде

In [187]:
#context_scorer(submission, answers)

In [277]:
xx=clf1.get_feature_importance(prettified=True)
xx.head(50)

Unnamed: 0,Feature Id,Importances
0,55,2.201168
1,134,2.002252
2,145,1.777475
3,244,1.667876
4,147,1.592607
5,18,1.487434
6,102,1.462556
7,231,1.414512
8,112,1.27282
9,138,1.222269


In [278]:
xx=clf.get_feature_importance(prettified=True)
xx.head(50)

Unnamed: 0,Feature Id,Importances
0,cpe_model_name_max,1.734384
1,198,1.2177
2,37,1.03114
3,56,0.980427
4,cpe_manufacturer_name_max,0.914864
5,134,0.910991
6,7,0.763424
7,50,0.748658
8,190,0.718061
9,248,0.708576


In [279]:
np.save('./u12-250.npy',u12)
np.save('./u12x2-250.npy',u12x2)

In [62]:
nnsubs = pd.read_csv(f'{LOCAL_DATA_PATH}/submission23.csv')

In [63]:
nnsubs.age = nnsubs.age+1
nnsubs.age = nnsubs.age.clip(1,6)

In [59]:
np.unique(nnsubs.age,return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]),
 array([    1, 10141, 60888, 53972, 13155,  6435,   132]))

In [64]:
np.unique(nnsubs.age,return_counts=True)

(array([1, 2, 3, 4, 5, 6]), array([    1, 10141, 60888, 53972, 13155,  6567]))

In [51]:
nnsubs.to_csv(f'{LOCAL_DATA_PATH}/submission23b.csv', index = False)

In [69]:
print(m.classification_report(y_test, clf.predict(x_test), \
                            target_names = ['<18', '18-25','25-34', '35-44', '45-54', '55-65', '65+']))

              precision    recall  f1-score   support

         <18       0.00      0.00      0.00       167
       18-25       0.54      0.32      0.40      4804
       25-34       0.49      0.64      0.56     13048
       35-44       0.41      0.52      0.46     11746
       45-54       0.38      0.21      0.27      6362
       55-65       0.40      0.20      0.27      3523
         65+       0.48      0.02      0.04       844

    accuracy                           0.45     40494
   macro avg       0.39      0.28      0.29     40494
weighted avg       0.44      0.45      0.43     40494



In [85]:
print(m.classification_report(y_test, (clf.predict(x_test)).clip(1,6), \
                            target_names = ['<18', '18-25','25-34', '35-44', '45-54', '55-65', '65+']))

              precision    recall  f1-score   support

         <18       0.00      0.00      0.00       167
       18-25       0.54      0.32      0.40      4804
       25-34       0.49      0.64      0.56     13048
       35-44       0.41      0.52      0.46     11746
       45-54       0.38      0.21      0.27      6362
       55-65       0.40      0.20      0.27      3523
         65+       0.48      0.02      0.04       844

    accuracy                           0.45     40494
   macro avg       0.39      0.28      0.29     40494
weighted avg       0.44      0.45      0.43     40494

