In [1]:
import sys
import os
import warnings
os.environ['OPENBLAS_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import time
import pyarrow as pa
import pyarrow.parquet as pq
import scipy
!pip install implicit
import implicit
import bisect
import sklearn.metrics as m
!pip install catboost
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

[0m

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

%matplotlib inline
sns.set_style('darkgrid')

In [4]:
!pip install feather-format >> none
!pip install faiss-cpu --no-cache

[0mCollecting faiss-cpu
  Downloading faiss_cpu-1.7.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m173.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.3
[0m

In [5]:
def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

In [6]:
targets = pd.read_feather('/data/target_train.feather')
targets = targets.dropna()
targets['is_male'] = targets['is_male'].astype(np.int8)
targets['age'] = targets['age'].astype(np.int16)
targets.dtypes

age        int16
is_male     int8
user_id    int64
dtype: object

In [7]:
data = pd.read_feather('/data/dataset_full.feather')

In [8]:
data = data[['user_id', 'cpe_manufacturer_name', 'cpe_model_name']].drop_duplicates()
data

Unnamed: 0,user_id,cpe_manufacturer_name,cpe_model_name
0,45098,Apple,iPhone 7
1550,117132,Xiaomi,Redmi 5 Plus
2272,79395,Samsung,Galaxy S20+
3907,91294,Xiaomi,Poco X3 Pro
5477,161323,Xiaomi,Redmi Note 8 Pro
...,...,...,...
322896216,375972,Apple,iPhone X
322896906,160996,Apple,iPhone 7
322897366,5505,Huawei,P Smart Z
322897462,260127,Apple,iPhone SE 2020


In [9]:
data['cpe_model_name'] = data['cpe_manufacturer_name'].astype('string') + ' ' + data['cpe_model_name'].astype('string')
data = data.drop('cpe_manufacturer_name', axis=1)
data = data.drop_duplicates()
data

Unnamed: 0,user_id,cpe_model_name
0,45098,Apple iPhone 7
1550,117132,Xiaomi Redmi 5 Plus
2272,79395,Samsung Galaxy S20+
3907,91294,Xiaomi Poco X3 Pro
5477,161323,Xiaomi Redmi Note 8 Pro
...,...,...
322896216,375972,Apple iPhone X
322896906,160996,Apple iPhone 7
322897366,5505,Huawei P Smart Z
322897462,260127,Apple iPhone SE 2020


In [10]:
data_mearge = data.merge(targets, how = 'inner', on = ['user_id'])
data_mearge

Unnamed: 0,user_id,cpe_model_name,age,is_male
0,79395,Samsung Galaxy S20+,35,1
1,91294,Xiaomi Poco X3 Pro,35,1
2,161323,Xiaomi Redmi Note 8 Pro,39,0
3,304574,Huawei Honor 10,28,1
4,16874,Samsung Galaxy A40 Dual,32,0
...,...,...,...,...
264291,403319,Apple iPhone 12,28,0
264292,74976,Huawei Honor 10,39,1
264293,375972,Apple iPhone X,48,0
264294,5505,Huawei P Smart Z,26,1


In [11]:
data_mearge = pa.Table.from_pandas(data_mearge)

In [12]:
%%time
data_agg = data_mearge.select(['cpe_model_name', 'age', 'is_male']).\
    group_by(['cpe_model_name']).aggregate([('age', 'mean'), ('is_male', 'mean')])

CPU times: user 27.8 ms, sys: 5.72 ms, total: 33.5 ms
Wall time: 68.7 ms


In [13]:
data_agg

pyarrow.Table
age_mean: double
is_male_mean: double
cpe_model_name: string
----
age_mean: [[37.13590844062947,36.387291981845685,37.84362505553087,37.21165354330709,42.525154457193295,...,33,44,45.5,38,42]]
is_male_mean: [[0.6065808297567954,0.7201210287443268,0.5870724122612172,0.5650393700787402,0.501323918799647,...,1,0.5,0,1,0.4]]
cpe_model_name: [["Samsung Galaxy S20+","Xiaomi Poco X3 Pro","Xiaomi Redmi Note 8 Pro","Huawei Honor 10","Samsung Galaxy A40 Dual",...,"Nokia 3.1 Plus","Meizu 16","Sony Xperia XA","Xiaomi Redmi Note 5A Lite","Highscreen Power Five Max"]]

In [14]:
data_mearge = data.merge(data_agg.to_pandas(), how = 'left', on = ['cpe_model_name'])
data_mearge

Unnamed: 0,user_id,cpe_model_name,age_mean,is_male_mean
0,45098,Apple iPhone 7,36.055685,0.529805
1,117132,Xiaomi Redmi 5 Plus,43.219931,0.553265
2,79395,Samsung Galaxy S20+,37.135908,0.606581
3,91294,Xiaomi Poco X3 Pro,36.387292,0.720121
4,161323,Xiaomi Redmi Note 8 Pro,37.843625,0.587072
...,...,...,...,...
415312,375972,Apple iPhone X,34.749203,0.550114
415313,160996,Apple iPhone 7,36.055685,0.529805
415314,5505,Huawei P Smart Z,38.490972,0.544150
415315,260127,Apple iPhone SE 2020,35.982153,0.590590


In [15]:
data_mearge.to_csv('/data/target_distribution_by_features/model_agg.csv', index=False)

In [16]:
data = pd.read_feather('/data/dataset_full.feather')

In [17]:
data = data[['user_id', 'region_name', 'city_name']].drop_duplicates()
data

Unnamed: 0,user_id,region_name,city_name
0,45098,Краснодарский край,Краснодар
428,45098,Ставропольский край,Ставрополь
445,45098,Республика Адыгея,Адыгейск
1124,45098,Ставропольский край,Изобильный
1355,45098,Ставропольский край,Новоалександровск
...,...,...,...
322897462,260127,Курская область,Курск
322897463,260127,Белгородская область,Белгород
322897962,260127,Белгородская область,Строитель
322898322,300964,Калужская область,Обнинск


In [18]:
data['city_name'] = data['region_name'].astype('string') + ' ' + data['city_name'].astype('string')
data = data.drop('region_name', axis=1)
data = data.drop_duplicates()
data

Unnamed: 0,user_id,city_name
0,45098,Краснодарский край Краснодар
428,45098,Ставропольский край Ставрополь
445,45098,Республика Адыгея Адыгейск
1124,45098,Ставропольский край Изобильный
1355,45098,Ставропольский край Новоалександровск
...,...,...
322897462,260127,Курская область Курск
322897463,260127,Белгородская область Белгород
322897962,260127,Белгородская область Строитель
322898322,300964,Калужская область Обнинск


In [19]:
data_mearge = data.merge(targets, how = 'inner', on = ['user_id'])
data_mearge

Unnamed: 0,user_id,city_name,age,is_male
0,79395,Санкт-Петербург Санкт-Петербург,35,1
1,79395,Калининградская область Калининград,35,1
2,91294,Краснодарский край Новороссийск,35,1
3,161323,Тюменская область Тюмень,39,0
4,161323,Свердловская область Екатеринбург,39,0
...,...,...,...,...
823023,5505,Ярославская область Ярославль,26,1
823024,5505,Ярославская область Рыбинск,26,1
823025,5505,Ярославская область Тутаев,26,1
823026,300964,Калужская область Обнинск,57,0


In [20]:
data_mearge = pa.Table.from_pandas(data_mearge)

In [21]:
%%time
data_agg = data_mearge.select(['city_name', 'age', 'is_male']).\
    group_by(['city_name']).aggregate([('age', 'mean'), ('is_male', 'mean')])

CPU times: user 56.3 ms, sys: 1.15 ms, total: 57.4 ms
Wall time: 56.2 ms


In [22]:
data_mearge = data.merge(data_agg.to_pandas(), how = 'left', on = ['city_name'])
data_mearge

Unnamed: 0,user_id,city_name,age_mean,is_male_mean
0,45098,Краснодарский край Краснодар,38.329572,0.516624
1,45098,Ставропольский край Ставрополь,36.742826,0.567697
2,45098,Республика Адыгея Адыгейск,36.605013,0.561366
3,45098,Ставропольский край Изобильный,34.079545,0.588068
4,45098,Ставропольский край Новоалександровск,34.801120,0.649860
...,...,...,...,...
1288654,260127,Курская область Курск,37.574174,0.566967
1288655,260127,Белгородская область Белгород,37.637193,0.558471
1288656,260127,Белгородская область Строитель,36.448980,0.584184
1288657,300964,Калужская область Обнинск,38.408385,0.573758


In [23]:
data_mearge[['user_id', 'age_mean', 'is_male_mean']].groupby('user_id').mean().to_csv('city_agg.csv')

In [24]:
age_agg = pa.Table.from_pandas(data_mearge).select(['city_name', 'age_mean', 'user_id']).\
    group_by(['city_name', 'user_id']).aggregate([('age_mean', 'mean')])

In [25]:
city_set = set(age_agg.select(['city_name']).to_pandas()['city_name'])
print(f'{len(city_set)} cities')
city_dict = {url: idurl for url, idurl in zip(city_set, range(len(city_set)))}
usr_set = set(age_agg.select(['user_id']).to_pandas()['user_id'])
print(f'{len(usr_set)} users')
usr_dict = {usr: user_id for usr, user_id in zip(usr_set, range(len(usr_set)))}

1000 cities
415317 users


In [26]:
age_agg

pyarrow.Table
age_mean_mean: double
city_name: string
user_id: int32
----
age_mean_mean: [[38.32957172740284,36.74282560706402,36.60501296456353,34.07954545454545,34.80112044817927,...,37.574174174174175,37.63719338277239,36.44897959183673,38.4083850931677,39.61675575857712]]
city_name: [["Краснодарский край Краснодар","Ставропольский край Ставрополь","Республика Адыгея Адыгейск","Ставропольский край Изобильный","Ставропольский край Новоалександровск",...,"Курская область Курск","Белгородская область Белгород","Белгородская область Строитель","Калужская область Обнинск","Москва Москва"]]
user_id: [[45098,45098,45098,45098,45098,...,260127,260127,260127,300964,300964]]

In [27]:
%%time
values = np.array(age_agg.select(['age_mean_mean']).to_pandas()['age_mean_mean'])
rows = np.array(age_agg.select(['user_id']).to_pandas()['user_id'].map(usr_dict))
cols = np.array(age_agg.select(['city_name']).to_pandas()['city_name'].map(city_dict))
mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))
als = implicit.approximate_als.FaissAlternatingLeastSquares(factors = 50, iterations = 30, use_gpu = False, \
       calculate_training_loss = False, regularization = 0.1)

CPU times: user 645 ms, sys: 68.8 ms, total: 714 ms
Wall time: 693 ms


In [28]:
%%time
als.fit(mat)

  0%|          | 0/30 [00:00<?, ?it/s]

CPU times: user 3min 56s, sys: 47.9 s, total: 4min 43s
Wall time: 1min 14s


In [29]:
u_factors = als.user_factors 
d_factors = als.item_factors

In [30]:
%%time
inv_usr_map = {v: k for k, v in usr_dict.items()}
city_emb = pd.DataFrame(d_factors)
city_emb['user_id'] = city_emb.index.map(inv_usr_map)

CPU times: user 525 ms, sys: 32.7 ms, total: 558 ms
Wall time: 558 ms


In [31]:
city_emb.to_csv('/data/target_distribution_by_features/city_emb.csv', index=False)

In [32]:
data = pd.read_feather('/data/dataset_full.feather')

In [33]:
data = data[['user_id', 'price']].drop_duplicates()
data

Unnamed: 0,user_id,price
0,45098,20368.0
1550,117132,4990.0
2272,79395,74259.0
3907,91294,23876.0
5477,161323,20465.0
...,...,...
322896216,375972,56357.0
322896906,160996,26154.0
322897366,5505,15490.0
322897462,260127,40612.0


In [34]:
def price_bucket(x):
    return bisect.bisect_left(range(5000, 100001, 5000), x)

In [35]:
data['price'] = data['price'].map(price_bucket)
data = data.drop_duplicates()
data

Unnamed: 0,user_id,price
0,45098,4
1550,117132,0
2272,79395,14
3907,91294,4
5477,161323,4
...,...,...
322896216,375972,11
322896906,160996,5
322897366,5505,3
322897462,260127,8


In [36]:
data_mearge = data.merge(targets, how = 'inner', on = ['user_id'])
data_mearge

Unnamed: 0,user_id,price,age,is_male
0,79395,14,35,1
1,91294,4,35,1
2,161323,4,39,0
3,304574,5,28,1
4,16874,2,32,0
...,...,...,...,...
267035,403319,16,28,0
267036,74976,3,39,1
267037,375972,11,48,0
267038,5505,3,26,1


In [37]:
data_mearge = pa.Table.from_pandas(data_mearge)

In [38]:
%%time
data_agg = data_mearge.select(['price', 'age', 'is_male']).\
    group_by(['price']).aggregate([('age', 'mean'), ('is_male', 'mean')])

CPU times: user 6.23 ms, sys: 1.09 ms, total: 7.32 ms
Wall time: 6.35 ms


In [39]:
data_mearge = data.merge(data_agg.to_pandas(), how = 'left', on = ['price'])
data_mearge

Unnamed: 0,user_id,price,age_mean,is_male_mean
0,45098,4,38.931642,0.511604
1,117132,0,41.498401,0.576446
2,79395,14,35.358038,0.568894
3,91294,4,38.931642,0.511604
4,161323,4,38.931642,0.511604
...,...,...,...,...
419604,375972,11,33.055539,0.444242
419605,160996,5,37.433300,0.538750
419606,5505,3,39.771103,0.512968
419607,260127,8,35.822194,0.523869


In [40]:
data_mearge = data_mearge.groupby('user_id').mean()
data_mearge

Unnamed: 0_level_0,price,age_mean,is_male_mean
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.0,41.498401,0.576446
1,0.0,41.498401,0.576446
2,1.0,43.246011,0.513338
3,2.0,40.890760,0.519966
4,2.0,40.890760,0.519966
...,...,...,...
415312,1.0,43.246011,0.513338
415313,0.0,41.498401,0.576446
415314,3.0,39.771103,0.512968
415315,1.0,43.246011,0.513338


In [41]:
data_mearge.to_csv('/data/target_distribution_by_features/price_agg.csv', index=True)

In [42]:
data = pd.read_feather('/data/dataset_full.feather')

In [43]:
data = data[['user_id', 'date', 'request_cnt']]
data

Unnamed: 0,user_id,date,request_cnt
0,45098,2022-06-15,1
1,45098,2022-06-19,1
2,45098,2022-06-12,1
3,45098,2022-05-16,1
4,45098,2022-05-30,1
...,...,...,...
322899430,300964,2021-07-12,1
322899431,300964,2021-06-20,1
322899432,300964,2021-08-05,1
322899433,300964,2021-07-19,1


In [44]:
data = pa.Table.from_pandas(data)

In [45]:
%%time
data_agg = data.group_by(['user_id']).aggregate([('date', 'count'), ('request_cnt', 'sum')])

CPU times: user 11.6 s, sys: 2.71 s, total: 14.3 s
Wall time: 14.3 s


In [46]:
activity = data_agg.to_pandas()
activity['activity_per_day'] = activity['request_cnt_sum']/activity['date_count']
activity = activity.drop(['request_cnt_sum', 'date_count'], axis=1)
activity

Unnamed: 0,user_id,activity_per_day
0,45098,1.458710
1,117132,1.781163
2,79395,2.022018
3,91294,1.552229
4,161323,1.408946
...,...,...
415312,375972,1.211594
415313,160996,1.726087
415314,5505,1.291667
415315,260127,1.517442


In [47]:
def activity_bucket(x):
    return bisect.bisect_left(np.arange(1, 7, 0.25), x)

In [48]:
activity['activity'] = activity['activity_per_day'].map(activity_bucket)
activity

Unnamed: 0,user_id,activity_per_day,activity
0,45098,1.458710,2
1,117132,1.781163,4
2,79395,2.022018,5
3,91294,1.552229,3
4,161323,1.408946,2
...,...,...,...
415312,375972,1.211594,1
415313,160996,1.726087,3
415314,5505,1.291667,2
415315,260127,1.517442,3


In [49]:
data_mearge = targets.merge(activity, how = 'inner', on = ['user_id'])
data_mearge

Unnamed: 0,age,is_male,user_id,activity_per_day,activity
0,31,1,350459,1.348315,2
1,35,1,188276,1.414414,2
2,41,0,99002,1.339593,2
3,33,0,155506,1.727273,3
4,54,0,213873,1.000000,0
...,...,...,...,...,...
264291,49,1,225374,1.601331,3
264292,22,1,25776,1.266667,2
264293,28,0,148131,1.448276,2
264294,28,1,205570,1.698603,3


In [50]:
data_mearge = pa.Table.from_pandas(data_mearge)

In [51]:
%%time
activity_agg = data_mearge.select(['activity', 'age', 'is_male']).\
    group_by(['activity']).aggregate([('age', 'mean'), ('is_male', 'mean')])

CPU times: user 7.81 ms, sys: 0 ns, total: 7.81 ms
Wall time: 7.29 ms


In [52]:
activity_mearge = activity.merge(activity_agg.to_pandas(), how = 'inner', on = ['activity'])
activity_mearge

Unnamed: 0,user_id,activity_per_day,activity,age_mean,is_male_mean
0,45098,1.458710,2,39.38022,0.508531
1,161323,1.408946,2,39.38022,0.508531
2,69797,1.416466,2,39.38022,0.508531
3,401442,1.403941,2,39.38022,0.508531
4,205391,1.261538,2,39.38022,0.508531
...,...,...,...,...,...
415311,182543,5.500000,18,44.10000,0.400000
415312,72276,5.428571,18,44.10000,0.400000
415313,130544,5.333333,18,44.10000,0.400000
415314,34713,7.111111,24,39.00000,0.500000


In [53]:
activity_mearge = activity_mearge.drop('activity', axis=1)
activity_mearge

Unnamed: 0,user_id,activity_per_day,age_mean,is_male_mean
0,45098,1.458710,39.38022,0.508531
1,161323,1.408946,39.38022,0.508531
2,69797,1.416466,39.38022,0.508531
3,401442,1.403941,39.38022,0.508531
4,205391,1.261538,39.38022,0.508531
...,...,...,...,...
415311,182543,5.500000,44.10000,0.400000
415312,72276,5.428571,44.10000,0.400000
415313,130544,5.333333,44.10000,0.400000
415314,34713,7.111111,39.00000,0.500000


In [54]:
activity_mearge.to_csv('/data/target_distribution_by_features/activity_per_day.csv', index=True)