In [41]:
import pandas as pd
import numpy as np
import math
from datetime import datetime
from ptls.preprocessing import PandasDataPreprocessor


In [12]:
# Load the request shards part_0 .. part_29 and keep only the rows whose
# user_id is contained in `need_users`.
# NOTE(review): `need_users` is not defined in any visible cell; it has to be
# created before this cell for the notebook to survive Restart & Run All.
shards = []

for i in range(30):
    shard_path = f'mnt/requests/part_{i}.parquet'
    try:
        cur = pd.read_parquet(shard_path)
    except Exception as e:
        # Loading stays best-effort (an unreadable shard is skipped), but the
        # failure is reported instead of being swallowed silently.
        print(f'skipping {shard_path}: {e!r}')
        continue

    # The filter runs outside the try block so that real bugs (undefined
    # `need_users`, missing `user_id` column) surface instead of silently
    # producing an empty frame.
    shards.append(cur[cur.user_id.isin(need_users)])

# Concatenate once: growing the frame inside the loop re-copies every
# previously loaded row on each iteration.
df = pd.concat(shards) if shards else pd.DataFrame()

In [13]:
# Row and column count of the concatenated request shards.
df.shape

(9465920, 5)

In [17]:
# Reference tables: the training users and the geo lookup keyed by geo_id.
users = pd.read_csv('mnt/train_users.csv')
geo = pd.read_csv('mnt/geo_dataframe.csv')
# Alternative entry point kept for reference (pre-computed features):
# df = pd.read_csv('data/better_feats.csv')
# df.drop(['Unnamed: 0'], axis=1, inplace=True)

# Display both shapes as the cell output.
(geo.shape, users.shape)

((5533, 3), (5000000, 3))

In [18]:
# Attach the geo attributes to every request. The join is an inner join, so
# requests whose geo_id does not appear in `geo` are dropped.
df = pd.merge(df, geo, how='inner', on='geo_id')

In [19]:
# Hour of day (0-23) of each request, taken in UTC.
# `datetime.fromtimestamp` used the kernel's local timezone, so the result
# depended on the machine running the notebook; the hours displayed further
# down match UTC, which is now made explicit. The conversion is vectorised
# instead of calling Python once per row.
df['hour'] = pd.to_datetime(df['timestamp'], unit='s', utc=True).dt.hour
median_online_time = df.groupby('region_id')['hour'].median()

# Per-region offset heuristic: 14 minus the rounded-up median activity hour.
# It is computed once per region and mapped onto the rows, replacing a
# row-wise `DataFrame.apply` over the whole frame.
msk_offset_by_region = 14 - np.ceil(median_online_time).astype(int)
df['MSK+'] = df['region_id'].map(msk_offset_by_region)

In [20]:
# Age buckets: label -> explicit list of the integer ages it covers
# (both bounds inclusive).
age_groups = {
    label: list(range(first_age, last_age + 1))
    for label, first_age, last_age in (
        ('age_14_18', 14, 18),
        ('age_19_25', 19, 25),
        ('age_26_30', 26, 30),
        ('age_31_40', 31, 40),
        ('age_41_55', 41, 55),
        ('age_55+', 56, 75),
    )
}

In [21]:
def get_age_group(age):
    """Return the label of the `age_groups` bucket that contains `age`.

    Buckets are checked in the dictionary's order; `None` is returned when
    the age is not listed in any bucket.
    """
    matching_labels = (label for label, ages in age_groups.items() if age in ages)
    return next(matching_labels, None)

# Label every user with the age bucket their `age` falls into.
users['group'] = users['age'].map(get_age_group)

In [7]:
# Attach the user attributes to every request. The join is an inner join, so
# requests from users missing in `users` are dropped.
df = pd.merge(df, users, how='inner', on='user_id')

In [25]:
# Drop the requests that carry an empty referer string.
df = df.loc[df['referer'].ne('')]

In [None]:
def get_domain(string):
    """Return the host of a referer URL without its 'www.' / 'domain_' prefix.

    Args:
        string: URL of the form '<scheme>://<host>/<path>...'.

    Returns:
        The host with a leading 'www.' and then a leading 'domain_' removed,
        e.g. 'https://domain_2301/path_1' -> '2301'. An empty string is
        returned when the URL has no host segment.
    """
    parts = string.split('/')
    if len(parts) < 3:
        # No '<scheme>://' separator, hence no host segment to extract.
        return ''
    domain = parts[2]
    # str.lstrip takes a *set of characters*, not a prefix, so it would also
    # eat leading letters of the real name ('www.wikipedia.org' -> 'ikipedia.org').
    # Remove the exact prefixes instead.
    for prefix in ('www.', 'domain_'):
        if domain.startswith(prefix):
            domain = domain[len(prefix):]
    return domain

def get_path(string):
    """Return the first path segment of a referer URL without its 'path_' prefix.

    Args:
        string: URL of the form '<scheme>://<host>/<path>...'.

    Returns:
        The first path segment with a leading 'path_' removed, e.g.
        'https://domain_1/path_124968' -> '124968'. An empty string is
        returned when the URL has no path segment.
    """
    parts = string.split('/')
    if len(parts) < 4:
        # The URL ends right after the host, so there is no path segment.
        return ''
    path = parts[3]
    # str.lstrip('path_') strips any leading run of the characters
    # p/a/t/h/_ rather than the literal prefix; remove the exact prefix.
    if path.startswith('path_'):
        path = path[len('path_'):]
    return path

# Derive the domain and path identifiers of every referer as new columns.
df['domain'] = df['referer'].map(get_domain)
df['path'] = df['referer'].map(get_path)

In [27]:
# Preview the frame, including the derived `domain` and `path` columns.
df.head()

Unnamed: 0,timestamp,geo_id,referer,user_id,user_agent,region_id,country_id,hour,MSK+,domain,path
0,1712066848,3037,https://domain_2301/path_124968,16779507,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,53,40,14,2,2301,124968
1,1712104836,1888,https://domain_3191/path_9105,9663358,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,214,40,0,5,3191,9105
2,1712107207,1330,https://domain_3191/path_9105,14200832,Mozilla/5.0 (Linux; Android 10; K) AppleWebKit...,-1,145,1,4,3191,9105
3,1712172538,3065,https://domain_551/path_175610,3124559,Mozilla/5.0 (Linux; Android 10; K) AppleWebKit...,184,40,19,2,551,175610
4,1712045212,3640,https://domain_1353/path_90872,10567547,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,87,40,8,5,1353,90872


In [28]:
# Number of requests per domain, then the set of domains that occur in more
# than 250 requests.
domains_frequency = df['domain'].value_counts()
useful_domains = set(domains_frequency.loc[lambda counts: counts > 250].index)
# Cell output: how many domains pass the frequency cut.
len(useful_domains)

632

In [None]:
# Bucket every domain outside `useful_domains` into the placeholder value 0;
# frequent domains keep their identifier.
df['domain'] = df['domain'].where(df['domain'].isin(useful_domains), 0)

In [30]:
# Current row and column count of the request frame.
df.shape

(9423592, 11)

In [42]:
# ptls preprocessor configuration: `user_id` is passed as the id column,
# `timestamp` as the event-time column, and `domain` as the only categorical
# column.
# NOTE(review): how PandasDataPreprocessor treats the columns that are not
# listed here is library behaviour; confirm against the ptls documentation.
preprocessor = PandasDataPreprocessor(
    col_id='user_id',
    col_event_time='timestamp',
    cols_category=['domain']
)

In [11]:
# Per-domain audience profile computed over the request rows:
#   * mean of `gender` per domain;
#   * share of each age bucket per domain, one column per bucket.
# NOTE(review): both statistics are derived from the users' gender/age columns
# and later merged back onto the same rows; check for target leakage if those
# columns are the prediction targets.
domain_male_percentage = df.groupby('domain')['gender'].agg('mean')
domain_age_groups = (
    df.groupby('domain')['group']
    .value_counts(normalize=True)
    .unstack()
)

In [12]:
# Broadcast the per-domain statistics back onto every request row.
# `domain_male_percentage` is a Series named 'gender', which collides with the
# existing `gender` column, so pandas suffixes them: gender_x (row value) and
# gender_y (per-domain mean).
df = (
    df.merge(domain_male_percentage, on='domain')
      .merge(domain_age_groups, on='domain')
)
df.head()

Unnamed: 0,timestamp,geo_id,user_id,user_agent,domain,path,browser,os,device,mobile,...,gender_x,age,group,gender_y,age_14_18,age_19_25,age_26_30,age_31_40,age_41_55,age_55+
0,1711945301,2540,1013613,Mozilla/5.0 (Linux; Android 8.1.0; Redmi 5 Plu...,0,175610,Opera Mobile,Android,XiaoMi Redmi 5 Plus,True,...,1,75,age_55+,0.502176,0.054901,0.081002,0.042783,0.194959,0.317166,0.309189
1,1711941903,3833,17082498,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,2042,144480,Chrome,Windows,Other,False,...,0,72,age_55+,0.677838,0.005287,0.015663,0.015437,0.144239,0.406593,0.412782
2,1712088484,3866,12787875,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,3191,9105,Chrome,Windows,Other,False,...,1,73,age_55+,0.700984,0.005614,0.015964,0.014239,0.157541,0.409511,0.397132
3,1711918808,1840,8871292,Mozilla/5.0 (Linux; Android 10; K) AppleWebKit...,551,175610,Chrome Mobile,Android,K,True,...,0,18,age_14_18,0.173001,0.206071,0.283564,0.073914,0.117349,0.164116,0.154985
4,1712117806,2293,11372673,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,662,120157,Chrome,Windows,Other,False,...,1,41,age_41_55,0.62298,0.007854,0.019559,0.016954,0.14269,0.385516,0.427428


In [13]:
# Row and column count after merging the per-domain statistics.
df.shape

(4639029, 24)

In [14]:
# Number of rows per device value, most frequent first.
popular_devices = df.device.value_counts()

# `isin` matches against the *values* of a Series. Passing the counts Series
# itself compared device names with integer counts, so the filter could not
# match the device names and emptied the frame. The names live in the index.
# NOTE(review): no popularity cutoff is applied, so this keeps every row with
# a non-missing device; add a threshold or top-N here if that was the intent.
df = df[df['device'].isin(popular_devices.index)]

In [37]:
# Ids of the users that appear in more than 5 rows.
multiple_watches = (
    df['user_id']
    .value_counts()
    .loc[lambda rows_per_user: rows_per_user > 5]
    .index
)

In [38]:
# Keep only the rows of users listed in `multiple_watches`.
df = df.loc[df['user_id'].isin(multiple_watches)]

In [43]:
%%time

# Fit the preprocessor on the request rows (without geo_id, path and
# user_agent) and wrap its output in a DataFrame.
df_embeded = pd.DataFrame(
    preprocessor.fit_transform(
        df.drop(columns=['geo_id', 'path', 'user_agent'])
    )
)

CPU times: user 34.7 s, sys: 567 ms, total: 35.3 s
Wall time: 35.2 s


In [None]:
# Collapse the list-like cells of `df_embeded` into one scalar per row.
# The column groups below replace a run of copy-pasted per-column statements;
# the transformation applied to each column is unchanged.
# NOTE(review): presumably every cell holds one user's sequence of event
# values and the "first element" columns are constant within a user; confirm.
# NOTE: the cell is not idempotent. Re-running it would index into the already
# collapsed scalars (e.g. take the first character of a string).

# First element, kept as-is.
for col in ['browser', 'os', 'device', 'group']:
    df_embeded[col] = df_embeded[col].apply(lambda x: x[0])

# First element, cast to bool.
df_embeded['mobile'] = df_embeded['mobile'].apply(lambda x: bool(x[0]))

# First element, cast to int.
for col in ['hour', 'MSK+', 'gender_x', 'age', 'region_id', 'country_id']:
    df_embeded[col] = df_embeded[col].apply(lambda x: int(x[0]))

# Mean over all elements of the sequence; each element is unwrapped with
# `.item()` before averaging.
for col in ['gender_y', 'age_14_18', 'age_19_25', 'age_26_30',
            'age_31_40', 'age_41_55', 'age_55+']:
    df_embeded[col] = df_embeded[col].apply(lambda x: np.mean([el.item() for el in x]))

In [20]:
# Preview the frame after the sequence columns were collapsed to scalars.
df_embeded.head()

Unnamed: 0,user_id,browser,os,device,mobile,region_id,country_id,hour,MSK+,gender_x,...,group,gender_y,age_14_18,age_19_25,age_26_30,age_31_40,age_41_55,age_55+,event_time,domain
0,115,Chrome Mobile,Android,K,True,89,40,11,5,0,...,age_55+,0.275207,0.019739,0.032906,0.018528,0.115901,0.319082,0.493843,[tensor(0)],[tensor(11)]
1,202,Yandex Browser,Windows,Other,False,103,40,4,3,0,...,age_41_55,0.600375,0.013225,0.027734,0.019246,0.126135,0.300039,0.51362,[tensor(0)],[tensor(50)]
2,288,Yandex Browser,Windows,Other,False,39,40,6,6,0,...,age_55+,0.323367,0.003289,0.009461,0.006683,0.040837,0.223081,0.71665,[tensor(0)],[tensor(2)]
3,520,Chrome Mobile,Android,K,True,189,40,17,2,0,...,age_41_55,0.221325,0.020125,0.028653,0.027073,0.226514,0.362919,0.334717,[tensor(0)],[tensor(25)]
4,976,Chrome Mobile,Android,K,True,53,40,21,2,1,...,age_55+,0.677838,0.005287,0.015663,0.015437,0.144239,0.406593,0.412782,[tensor(0)],[tensor(4)]


In [None]:
# Row and column count of the preprocessed frame.
df_embeded.shape

In [None]:
# Persist the preprocessed frame as CSV.
# NOTE(review): `to_csv` writes the index by default, which comes back as an
# extra unnamed column on reload. Also, the following cell reads
# 'data/full_data_2millions.csv', not this file; confirm which artefact is
# the intended input.
df_embeded.to_csv('data/full_data.csv')
# df_embeded.to_csv('data/vectors_featured_data.csv')

In [3]:
# Load a saved feature table for the cleaning steps below and show its shape.
# NOTE(review): no visible cell writes 'data/full_data_2millions.csv';
# presumably it comes from an earlier run. Confirm its provenance.
df_clean = pd.read_csv('data/full_data_2millions.csv')
df_clean.shape

(995487, 22)

In [8]:
# Ten most frequent device values with their row counts.
df_clean.device.value_counts().head(10)

device
Other                593027
K                    154544
iPhone                 9249
Mac                    7434
Samsung SM-A515F       5015
Samsung SM-A125F       3849
Samsung SM-A505FN      3575
Samsung SM-A325F       3508
M2006C3MNG             3443
Samsung SM-A105F       3170
Name: count, dtype: int64

In [15]:
# Keep the 10 most frequent device and browser values; every other value is
# folded into the single bucket 'Other'.
popular_devices = set(df_clean['device'].value_counts().index[:10])
popular_browsers = set(df_clean['browser'].value_counts().index[:10])

df_clean['device'] = df_clean['device'].where(
    df_clean['device'].isin(popular_devices), 'Other'
)
df_clean['browser'] = df_clean['browser'].where(
    df_clean['browser'].isin(popular_browsers), 'Other'
)

In [16]:
# Browser distribution after rare browsers were folded into 'Other'.
df_clean['browser'].value_counts()

browser
Chrome                   378730
Yandex Browser           291268
Chrome Mobile            161010
Edge                      49559
Opera                     32914
YandexSearch              26541
Firefox                   20644
Other                     13358
Chrome Mobile WebView      9655
Mobile Safari              7117
Samsung Internet           4691
Name: count, dtype: int64

In [18]:
# Save the cleaned table as CSV (the index is written too, pandas' default).
df_clean.to_csv('data/full_data_cleaned.csv')

In [None]:
# Load the test-set users and show the table's dimensions.
test_users = pd.read_csv('data/test_users.csv')
test_users.shape