<a href="https://colab.research.google.com/github/SorokinMaksimArtemovich/MTS-ML-CUP/blob/main/data%20preprocessing/0.4_target_distribution_by_urls.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
import os
import warnings
# Cap OpenBLAS at a single thread. This is set before numpy/scipy/implicit are
# imported in the next cell — presumably so the limit is in place when BLAS loads.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# Silence every Python warning for the rest of the session (deprecation notices included).
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import time
import pyarrow as pa
import pyarrow.parquet as pq
import scipy
!pip install implicit
import implicit
import bisect
import sklearn.metrics as m
!pip install catboost
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

[0m

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

%matplotlib inline
sns.set_style('darkgrid')

In [None]:
!pip install feather-format >> none
!pip install faiss-cpu --no-cache

[0mCollecting faiss-cpu
  Downloading faiss_cpu-1.7.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m190.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.3
[0m

In [None]:
def age_bucket(x):
    """Return the index of the age bucket that ``x`` falls into.

    The buckets are delimited by the inclusive upper bounds 18, 25, 35, 45, 55
    and 65: values up to 18 map to 0, 19-25 to 1, 26-35 to 2, and so on, with
    anything above 65 mapping to 6.
    """
    upper_bounds = (18, 25, 35, 45, 55, 65)
    # The bucket index equals the number of upper bounds strictly below x.
    return sum(1 for bound in upper_bounds if bound < x)

In [None]:
# Load the training targets, drop rows with any missing value and store both
# target columns as integers.
targets = (
    pd.read_feather('/data/target_train.feather')
    .dropna()
    .astype({'is_male': int, 'age': int})
)
targets.dtypes

age        int64
is_male    int64
user_id    int64
dtype: object

In [None]:
# Load the full dataset into a DataFrame (all columns).
data = pd.read_feather('/data/dataset_full.feather')

In [None]:
# Size of the raw dataset as loaded.
data.shape

(322899435, 12)

In [None]:
# Reduce the dataset to the distinct (user_id, url_host) pairs.
# The other columns are dropped here, so `data` no longer holds the raw dataset.
data = data[['user_id', 'url_host']].drop_duplicates()

In [None]:
# Number of distinct (user_id, url_host) pairs.
data.shape

(32277669, 2)

In [None]:
# Attach age / is_male to every (user_id, url_host) pair; the inner join keeps
# only users that have a row in `targets`.
data_merge = pd.merge(data, targets, on='user_id', how='inner')
data_merge

Unnamed: 0,user_id,url_host,age,is_male
0,79395,ad.mail.ru,35,1
1,79395,i.ytimg.com,35,1
2,79395,node3.online.sberbank.ru,35,1
3,79395,play.google.com,35,1
4,79395,t.me,35,1
...,...,...,...,...
20592266,300964,youtube.com,57,0
20592267,300964,biosfera.kz,57,0
20592268,300964,chihuahuadog-ru.turbopages.org,57,0
20592269,300964,sun9-88.userapi.com,57,0


In [None]:
# Convert to a pyarrow Table for the group_by aggregation that follows.
# After this cell `data_merge` is a pyarrow Table, not a DataFrame: re-run the
# merge cell before re-running this one.
data_merge = pa.Table.from_pandas(data_merge)

### targets count, sum and mean aggregation by url_host

In [None]:
%%time
data_agg = data_merge.select(['user_id', 'url_host', 'age', 'is_male']).\
    group_by(['url_host']).aggregate([('age', 'sum'), ('is_male', 'sum'), ('age', 'count'), ('is_male', 'count'), ('age', 'mean'), ('is_male', 'mean')])

CPU times: user 878 ms, sys: 58.6 ms, total: 937 ms
Wall time: 972 ms


In [None]:
# One row per url_host present in the merged (target-bearing) data.
data_agg.shape

(170823, 7)

## ALS
We compute the mean age of the labelled users of each `url_host` and use it as the interaction weight in an ALS factorization of the `user_id` × `url_host` matrix.

In [None]:
# Attach each host's mean age to every (user_id, url_host) pair.
# No join_type is passed, so pyarrow's default join type applies.
data_to_als = (
    pa.Table.from_pandas(data)
    .select(['user_id', 'url_host'])
    .join(data_agg.select(['url_host', 'age_mean']), 'url_host')
)
data_to_als

pyarrow.Table
user_id: int32
url_host: dictionary<values=string, indices=int32, ordered=0>
age_mean: double
----
user_id: [[173283,173283,173283,173283,173283,...,383285,383285,383285,383285,383285],[383285,383285,383285,383285,383285,...,49521,49521,49521,49521,49521],...,[58269,58269,58269,58269,58269,...,120278,120278,120278,120278,120278],[120278,120278,120278,120278,120278,...,120399,328966,89467,211894,76755]]
url_host: [  -- dictionary:
["-1","0--stranger-livejournal-com.turbopages.org","0-1.ru","0-34.ru","0-50.ru",...,"экзон.рф","юбилей-на-бис.рф","южныйокруг.рф","явернусь.рф","яркнига24.рф"]  -- indices:
[64628,89338,118279,146560,147911,...,180836,17626,49054,113508,114154],  -- dictionary:
["-1","0--stranger-livejournal-com.turbopages.org","0-1.ru","0-34.ru","0-50.ru",...,"экзон.рф","юбилей-на-бис.рф","южныйокруг.рф","явернусь.рф","яркнига24.рф"]  -- indices:
[117489,195022,111474,5790,160869,...,160868,91521,160874,185709,160875],...,  -- dictionary:
["-1","0--stranger-live

In [None]:
# Row count of the ALS input table.
data_to_als.shape

(32277669, 3)

In [None]:
# Give every distinct url_host and user_id a dense 0-based index; these indices
# become the column / row coordinates of the sparse matrix.
url_set = set(data_to_als.select(['url_host']).to_pandas()['url_host'])
print(f'{len(url_set)} urls')
url_dict = {url: position for position, url in enumerate(url_set)}

usr_set = set(data_to_als.select(['user_id']).to_pandas()['user_id'])
print(f'{len(usr_set)} users')
usr_dict = {usr: position for position, usr in enumerate(usr_set)}

199683 urls
415317 users


In [None]:
%%time
values = np.array(data_to_als.select(['age_mean']).to_pandas()['age_mean'].fillna(0))
rows = np.array(data_to_als.select(['user_id']).to_pandas()['user_id'].map(usr_dict))
cols = np.array(data_to_als.select(['url_host']).to_pandas()['url_host'].map(url_dict))
mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))
als = implicit.approximate_als.FaissAlternatingLeastSquares(factors = 50, iterations = 30, use_gpu = False, \
       calculate_training_loss = False, regularization = 0.1)

CPU times: user 2.12 s, sys: 891 ms, total: 3.01 s
Wall time: 2.96 s


In [None]:
%%time
# Fit the ALS model on the user (rows) x url_host (columns) matrix.
# NOTE(review): which axis `fit` treats as "users" vs "items" appears to depend on
# the installed implicit release — confirm against the version actually installed.
als.fit(mat)

  0%|          | 0/30 [00:00<?, ?it/s]

CPU times: user 22min 31s, sys: 10.5 s, total: 22min 41s
Wall time: 5min 55s


In [None]:
# Pull the two learned factor matrices off the fitted model.
u_factors = als.user_factors 
d_factors = als.item_factors

In [None]:
# Map each dense matrix row index back to its original user_id.
inv_usr_map = {v: k for k, v in usr_dict.items()}

# `mat` has users as rows, so the embedding table must come from the factor
# matrix that has one row per user. Originally `d_factors` was always used; that
# is only right when implicit aligns item_factors with the rows of the fitted
# matrix, which is believed to vary between implicit releases (TODO confirm).
# Pick the matrix by row count so a different release cannot silently attach
# url_host factors to user ids. `d_factors` is checked first, so the result is
# unchanged whenever the original choice was valid.
n_users = len(usr_dict)
if d_factors.shape[0] == n_users:
    user_factor_matrix = d_factors
elif u_factors.shape[0] == n_users:
    user_factor_matrix = u_factors
else:
    raise ValueError(
        f'Neither factor matrix has {n_users} rows '
        f'(item_factors: {d_factors.shape[0]}, user_factors: {u_factors.shape[0]})'
    )

mean_age_emb = pd.DataFrame(user_factor_matrix)
mean_age_emb['user_id'] = mean_age_emb.index.map(inv_usr_map)

In [None]:
# Save the embedding columns together with `user_id` as CSV (index not written).
mean_age_emb.to_csv('/data/target_distribution_by_urls/mean_age_emb.csv', index = False)

## weighted 
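For each user, we sum the per-`url_host` target sums over all hosts the user visited and divide by the sum of the per-`url_host` labelled-user counts over the same hosts. The result is the average of the per-host target means, weighted by the number of labelled users that visited each host.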

In [None]:
# For every user, add up the per-host target sums and counts over all hosts the
# user visited. No join_type is passed, so pyarrow's default join type applies.
mean_table = (
    pa.Table.from_pandas(data)
    .join(
        data_agg.select(['url_host', 'age_count', 'is_male_count', 'age_sum', 'is_male_sum']),
        'url_host',
    )
    .group_by(['user_id'])
    .aggregate([
        ('age_count', 'sum'),
        ('is_male_count', 'sum'),
        ('age_sum', 'sum'),
        ('is_male_sum', 'sum'),
    ])
)

In [None]:
# Back to pandas for the column arithmetic below.
mean_table = mean_table.to_pandas()

In [None]:
# Per-user estimates: pooled target sum divided by pooled count.
mean_table['mean_age'] = mean_table['age_sum_sum'].div(mean_table['age_count_sum'])
mean_table['mean_sex'] = mean_table['is_male_sum_sum'].div(mean_table['is_male_count_sum'])
mean_table

Unnamed: 0,age_count_sum,is_male_count_sum,age_sum_sum,is_male_sum_sum,user_id,mean_age,mean_sex
0,11598571,11598571,433942779,5879406,173283,37.413469,0.506908
1,4795015,4795015,183473095,2484028,13315,38.263299,0.518044
2,6464977,6464977,242024525,3302585,209303,37.436255,0.510842
3,1174563,1174563,45250693,601970,138828,38.525556,0.512506
4,3200725,3200725,122704329,1642214,86581,38.336417,0.513076
...,...,...,...,...,...,...,...
415312,4173922,4173922,159576772,2145918,369206,38.231853,0.514125
415313,5858087,5858087,225772347,3009956,61441,38.540286,0.513812
415314,4804520,4804520,183549220,2461060,271514,38.203446,0.512238
415315,4032664,4032664,153033926,2073617,188339,37.948593,0.514205


In [None]:
# Drop the intermediate sum/count columns now that the ratios are computed.
mean_table = mean_table.drop(
    columns=['age_count_sum', 'is_male_count_sum', 'is_male_sum_sum', 'age_sum_sum']
)
mean_table

Unnamed: 0,user_id,mean_age,mean_sex
0,173283,37.413469,0.506908
1,13315,38.263299,0.518044
2,209303,37.436255,0.510842
3,138828,38.525556,0.512506
4,86581,38.336417,0.513076
...,...,...,...
415312,369206,38.231853,0.514125
415313,61441,38.540286,0.513812
415314,271514,38.203446,0.512238
415315,188339,37.948593,0.514205


In [None]:
# Save the per-user weighted means as CSV (index not written).
# NOTE(review): the extension is `.scv`, which looks like a typo for `.csv`. It is
# left unchanged because other notebooks may read the file by this exact name.
mean_table.to_csv('/data/target_distribution_by_urls/mean_age_and_sex_weighted.scv', index=False)

## ranged
For each user, we compute the same ratio as in the weighted variant, but each host's target sum and labelled-user count are first multiplied by the user's `request_cnt` on that host. Hosts the user requests more often therefore contribute proportionally more to both the numerator and the denominator.

In [None]:
# Re-read the dataset to recover `request_cnt`: the in-memory `data` was reduced
# to unique (user_id, url_host) pairs earlier in the notebook.
# The path now matches the `/data/` location used for the first load of this file
# and for all outputs; it previously pointed at a Kaggle-only input directory.
# Only the three columns used by the following aggregation are read.
data = pd.read_feather(
    '/data/dataset_full.feather',
    columns=['user_id', 'url_host', 'request_cnt'],
)
data = pa.Table.from_pandas(data)

In [None]:
%%time
data = data.select(['user_id', 'url_host', 'request_cnt']).\
    group_by(['user_id', 'url_host']).aggregate([('request_cnt', 'sum')])

CPU times: user 22.8 s, sys: 1.94 s, total: 24.7 s
Wall time: 24.6 s


In [None]:
# Attach the per-host target sums and counts; the inner join keeps only pairs
# whose url_host appears in `data_agg`.
data = data.join(
    data_agg.select(['url_host', 'age_count', 'is_male_count', 'age_sum', 'is_male_sum']),
    'url_host',
    join_type='inner',
)

In [None]:
# Back to pandas for the column-wise multiplication in the next cell.
data = data.to_pandas()
data

Unnamed: 0,request_cnt_sum,user_id,url_host,age_sum,age_count,is_male_count,is_male_sum
0,3,173283,hoster1srv.povarenok.ru,38288,999,999,210
1,3,173283,login.mediafort.ru,122837,3253,3253,758
2,2,173283,optlist.ru,14663,420,420,219
3,1,173283,scripts.advmusic.com,788437,21604,21604,10617
4,23,173283,serieslife.online,162149,4878,4878,2439
...,...,...,...,...,...,...,...
32242723,2,185949,sun9-77.userapi.com,4578320,131320,131320,66317
32242724,2,185949,ad.adriver.ru,7019727,185627,185627,94717
32242725,1,185949,sun9-88.userapi.com,4346131,124795,124795,63497
32242726,4,25567,imasdk.googleapis.com,6226973,163402,163402,83642


In [None]:
# Scale every per-host sum and count by the number of requests this user made
# to that host. Re-running this cell applies the scaling again.
data = data.assign(**{
    stat_col: data[stat_col] * data['request_cnt_sum']
    for stat_col in ('is_male_sum', 'is_male_count', 'age_sum', 'age_count')
})

In [None]:
# For every user, add up the request-weighted per-host sums and counts.
mean_table = (
    pa.Table.from_pandas(data)
    .group_by(['user_id'])
    .aggregate([
        ('age_count', 'sum'),
        ('is_male_count', 'sum'),
        ('age_sum', 'sum'),
        ('is_male_sum', 'sum'),
    ])
)

In [None]:
# Back to pandas for the column arithmetic below.
mean_table = mean_table.to_pandas()

In [None]:
# Per-user estimates: request-weighted target sum divided by request-weighted count.
mean_table['mean_age'] = mean_table['age_sum_sum'].div(mean_table['age_count_sum'])
mean_table['mean_sex'] = mean_table['is_male_sum_sum'].div(mean_table['is_male_count_sum'])
mean_table

Unnamed: 0,age_count_sum,is_male_count_sum,age_sum_sum,is_male_sum_sum,user_id,mean_age,mean_sex
0,1070173717,1070173717,41077836337,548124829,173283,38.384269,0.512183
1,89430911,89430911,3454652225,45985071,13315,38.629286,0.514197
2,63665357,63665357,2335717162,32183733,209303,36.687412,0.505514
3,1949409,1949409,75138645,996701,138828,38.544320,0.511284
4,17127218,17127218,659942327,8773294,86581,38.531788,0.512243
...,...,...,...,...,...,...,...
415312,35088525,35088525,1349613741,18024606,317795,38.463108,0.513689
415313,47000432,47000432,1812958764,24063854,349601,38.573236,0.511992
415314,210757724,210757724,8125089492,108343846,98292,38.551799,0.514068
415315,19200949,19200949,703712551,9712261,179147,36.649884,0.505822


In [None]:
# Drop the intermediate sum/count columns now that the ratios are computed.
mean_table = mean_table.drop(
    columns=['age_count_sum', 'is_male_count_sum', 'is_male_sum_sum', 'age_sum_sum']
)
mean_table

Unnamed: 0,user_id,mean_age,mean_sex
0,173283,38.384269,0.512183
1,13315,38.629286,0.514197
2,209303,36.687412,0.505514
3,138828,38.544320,0.511284
4,86581,38.531788,0.512243
...,...,...,...
415312,317795,38.463108,0.513689
415313,349601,38.573236,0.511992
415314,98292,38.551799,0.514068
415315,179147,36.649884,0.505822


In [None]:
# Save the per-user request-weighted means as CSV (index not written).
# NOTE(review): the extension is `.scv`, which looks like a typo for `.csv`. It is
# left unchanged because other notebooks may read the file by this exact name.
mean_table.to_csv('/data/target_distribution_by_urls/mean_age_and_sex_ranged.scv', index=False)