In [101]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from collections import Counter
import plotly.express as px

In [102]:
path = '' # Path to the data directory, e.g. /path/to/data/ (file names are appended to it with plain `+`, so keep the trailing separator)

In [103]:
# Load the test events and remember which viewer ids occur in them.
data_test = pd.read_csv(path + 'test_events.csv')
TEST_IDS = data_test['viewer_uid'].unique()

In [104]:
# Load the training events, the video metadata and the viewer targets.
data, video, targets = (
    pd.read_csv(path + file_name)
    for file_name in ('train_events.csv', 'video_info_v2.csv', 'train_targets.csv')
)
# Turn every ua_os value (missing ones included) into a plain string.
data['ua_os'] = data['ua_os'].apply(str)

In [105]:
def get_mode(row):
    """Return the most frequent value of a categorical feature.

    Args:
        row: any iterable of hashable values (e.g. a pandas Series).

    Returns:
        The value with the highest count; among equally frequent values the
        one encountered first wins. ``None`` when ``row`` is empty.
    """
    counts = Counter(row)
    if not counts:
        # most_common(1) would be an empty list here and indexing it would raise IndexError.
        return None
    most_common, _ = counts.most_common(1)[0]
    return most_common

## Разделение на обучающую и валидационную выборки

In [106]:
# Split viewer ids (not individual events) 80/20, so every viewer ends up on exactly one side.
viewer_ids = data['viewer_uid'].unique()
TRAIN_IDS, VAL_IDS = train_test_split(viewer_ids, train_size=0.8, shuffle=True, random_state=11)

# Events and targets of each side, ordered by viewer id.
train_events, val_events = (
    data[data['viewer_uid'].isin(ids)].sort_values(by='viewer_uid')
    for ids in (TRAIN_IDS, VAL_IDS)
)
train_targets, val_targets = (
    targets[targets['viewer_uid'].isin(ids)].sort_values(by='viewer_uid')
    for ids in (TRAIN_IDS, VAL_IDS)
)

In [107]:
# Append the test events to the event table.
# NOTE: this rebinds `data` from itself, so running the cell again appends the test rows once more.
data = pd.concat([data, data_test], ignore_index=True)
# Turn every ua_os value (including the newly appended test rows) into a plain string.
data['ua_os'] = data['ua_os'].apply(str)

In [108]:
data['viewer_uid'].value_counts()

viewer_uid
10940057    401
10012635    395
10013772    264
10061854    257
10013868    237
           ... 
10732748      1
10748601      1
10124770      1
35351         1
18598         1
Name: count, Length: 240016, dtype: int64

In [109]:
def get_user_caterories(video, data):
    """Compute, per viewer, how the viewed duration is distributed over categories.

    Every event is joined to its video on ``rutube_video_id``; the video's
    category indicator is weighted by the ``duration`` column coming from
    ``video`` and summed per viewer. Each viewer's row is then divided by its
    total, so the category columns hold shares.

    Args:
        video: frame with ``rutube_video_id``, ``category`` and ``duration`` columns.
        data: event frame with ``viewer_uid`` and ``rutube_video_id`` columns.

    Returns:
        (user_categories, category_list): a frame with ``viewer_uid`` followed
        by one share column per category, and the list of category column names.
        A viewer whose total is 0 gets NaN shares (0 / 0).
    """
    # One indicator column per category, collapsed back to one row per video.
    # explode() only changes anything when a cell holds a list of categories.
    dummies = pd.get_dummies(video['category'].explode()).groupby(level=0).sum()
    category_list = [str(column) for column in dummies.columns]
    dummies.columns = category_list

    video_flags = video.drop(columns='category').join(dummies)
    events = data.merge(video_flags, how='left', on='rutube_video_id')

    # Weight every category indicator by the video duration, then total per viewer.
    weighted = events[category_list].mul(events['duration'], axis=0)
    weighted['viewer_uid'] = events['viewer_uid']
    user_categories = weighted.groupby('viewer_uid').sum().reset_index()

    # Normalise each viewer's row so the category columns become shares.
    totals = user_categories[category_list].sum(axis=1)
    shares = user_categories[category_list].div(totals, axis=0)
    user_categories = pd.concat([user_categories[['viewer_uid']], shares], axis=1)

    return user_categories, category_list

In [110]:
def get_user_info(data):
    """Pick, for every viewer, the most frequent device/region/client/OS value.

    Args:
        data: event frame with ``viewer_uid`` and the five profile columns below.

    Returns:
        Frame with one row per ``viewer_uid`` and the mode (via ``get_mode``)
        of each profile column.
    """
    profile_columns = ['ua_device_type', 'region', 'ua_client_type', 'ua_client_name', 'ua_os']
    per_viewer_modes = data.groupby('viewer_uid')[profile_columns].agg(get_mode)
    user_df = per_viewer_modes.reset_index()
    print('Юзер фичи взяты')
    return user_df

In [111]:
def get_user_features(video, data):
    """Combine per-viewer category shares with per-viewer profile modes.

    Args:
        video: video metadata frame passed on to ``get_user_caterories``.
        data: event frame passed on to both helper functions.

    Returns:
        (features, category_list): the category-share frame left-joined with
        the profile frame on ``viewer_uid``, and the list of category columns.
    """
    category_shares, category_list = get_user_caterories(video, data)
    profile = get_user_info(data)
    features = category_shares.merge(profile, on='viewer_uid', how='left')
    return features, category_list

In [112]:
user_features, category_list = get_user_features(video, data)

Юзер фичи взяты


In [113]:
user_features

Unnamed: 0,viewer_uid,Авто-мото,Аниме,Аудио,Аудиокниги,Бизнес и предпринимательство,Видеоигры,Детям,Дизайн,Еда,...,Технологии и интернет,Фильмы,Хобби,Эзотерика,Юмор,ua_device_type,region,ua_client_type,ua_client_name,ua_os
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,smartphone,Oryol oblast,mobile app,Rutube,
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,desktop,Mariy-El Republic,browser,Yandex Browser,Windows
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,desktop,Kuzbass,browser,Safari,Mac
3,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,desktop,Krasnodar Krai,browser,Chrome,Windows
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.094453,0.0,0.0,0.000000,desktop,Kuzbass,browser,Chrome,Windows
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240011,11140828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,tablet,St.-Petersburg,browser,Firefox Mobile,Android
240012,11140869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,tablet,St.-Petersburg,browser,Firefox Mobile,Android
240013,11140872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.483839,desktop,Kaliningrad Oblast,browser,Microsoft Edge,Windows
240014,11140875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,tablet,Moscow,browser,Firefox Mobile,Android


## Займемся авторами

In [114]:
def get_author_info():
    """Describe the audience of every video author using training viewers only.

    Reads the notebook-level ``data``, ``video`` and ``targets`` frames and the
    ``TRAIN_IDS`` array. Events of training viewers are joined to their video's
    ``author_id`` and to the viewer's ``sex`` / ``age`` targets, then
    aggregated per author.

    Returns:
        Frame with one row per ``author_id`` and the columns
        ``sex_mode``, ``median_age``, ``male_duration``, ``female_duration``,
        ``9_20_duration`` .. ``40_60_duration`` (watch time summed per group),
        ``full_author_duration`` (male + female watch time), the matching
        ``*_part`` shares of that total, and ``author_new_sex_by_whatch_time``
        ('male' when the male share is strictly larger, otherwise 'female').
    """
    # Integer ages that fall into each bucket. Despite its label, the first bucket
    # starts at 1; ages outside 1..60 contribute to no bucket.
    age_buckets = {
        '9_20': range(1, 21),
        '20_30': range(21, 31),
        '30_40': range(31, 41),
        '40_60': range(41, 61),
    }

    # Only the columns that are actually aggregated below are carried through the joins.
    train_rows = data.loc[data['viewer_uid'].isin(TRAIN_IDS),
                          ['viewer_uid', 'rutube_video_id', 'total_watchtime']]
    events = (
        train_rows
        .merge(video[['author_id', 'rutube_video_id']], how='left', on='rutube_video_id')
        .merge(targets, on='viewer_uid', how='inner')
    )

    # Watch time attributed to each demographic group (0 for events outside the group).
    watchtime = events['total_watchtime']
    events['is_male'] = events['sex'].eq('male') * watchtime
    events['is_female'] = events['sex'].eq('female') * watchtime
    for label, ages in age_buckets.items():
        events[f'is_{label}'] = events['age'].isin(list(ages)) * watchtime

    by_author = events.groupby('author_id')
    aggregates = [
        by_author['sex'].apply(get_mode).rename('sex_mode'),
        by_author['age'].median().rename('median_age'),
        by_author['is_male'].sum().rename('male_duration'),
        by_author['is_female'].sum().rename('female_duration'),
    ]
    aggregates += [
        by_author[f'is_{label}'].sum().rename(f'{label}_duration')
        for label in age_buckets
    ]
    author_info = pd.concat(aggregates, axis=1).reset_index()

    # Shares of the author's total watch time; a zero total yields NaN shares.
    total = author_info['male_duration'] + author_info['female_duration']
    author_info['full_author_duration'] = total
    for label in ('male', 'female', *age_buckets):
        author_info[f'{label}_part'] = author_info[f'{label}_duration'] / total

    male_dominates = author_info['male_part'] > author_info['female_part']
    author_info['author_new_sex_by_whatch_time'] = male_dominates.map({True: 'male', False: 'female'})

    return author_info

In [115]:
author_info = get_author_info()

In [116]:
author_info

Unnamed: 0,author_id,sex_mode,median_age,male_duration,female_duration,9_20_duration,20_30_duration,30_40_duration,40_60_duration,full_author_duration,male_part,female_part,9_20_part,20_30_part,30_40_part,40_60_part,author_new_sex_by_whatch_time
0,1000003,male,46.0,31,0,0,0,0,31,31,1.000000,0.000000,0.000000,0.000000,0.000000,1.000000,male
1,1000004,male,41.0,2554877,946857,51006,428252,1096190,1926286,3501734,0.729603,0.270397,0.014566,0.122297,0.313042,0.550095,male
2,1000005,female,34.0,47018,71670,2219,40935,46112,29422,118688,0.396148,0.603852,0.018696,0.344896,0.388514,0.247894,female
3,1000007,male,32.0,2092,0,0,1044,1048,0,2092,1.000000,0.000000,0.000000,0.499044,0.500956,0.000000,male
4,1000008,male,36.0,1365421,487874,30425,447913,717265,657692,1853295,0.736753,0.263247,0.016417,0.241685,0.387021,0.354877,male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21942,1125378,female,35.0,0,52,0,0,52,0,52,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000,female
21943,1125386,male,24.0,34,0,0,34,0,0,34,1.000000,0.000000,0.000000,1.000000,0.000000,0.000000,male
21944,1125388,male,43.0,52,0,0,0,0,52,52,1.000000,0.000000,0.000000,0.000000,0.000000,1.000000,male
21945,1125398,female,25.0,0,80,0,80,0,0,80,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000,female


## Получим фичи из таблицы авторов

In [117]:
def get_user_info_by_watch_time():
    """Build per-viewer features from the audience profile of the authors they watched.

    Reads the notebook-level ``data``, ``video`` and ``author_info`` frames.
    Every event is joined to its author's row of ``author_info``; the event's
    ``total_watchtime`` is split according to the author's ``*_part`` shares
    and summed per viewer.

    NOTE(review): ``author_info`` is derived from the targets of training
    viewers, so for those same viewers these features include information from
    their own labels — confirm this is intended.

    Returns:
        Frame with one row per ``viewer_uid``: mean/median of the authors'
        ``median_age``, median/mean ``total_watchtime``, the per-group watch
        time sums, the mode of ``author_new_sex_by_whatch_time``,
        ``full_whatch_duration`` (male + female sums), the ``*_part_whatch_time``
        shares of that total, and ``sex_by_whatch_time`` ('male' when the male
        share is strictly larger, otherwise 'female').
    """
    age_groups = ['9_20', '20_30', '30_40', '40_60']
    all_groups = ['male', 'female'] + age_groups

    video_authors = video[['author_id', 'rutube_video_id', 'duration']]
    events = data.merge(video_authors, how='left', on='rutube_video_id')
    events = events.merge(author_info, how='left', on='author_id')

    # Split each event's watch time by the author's audience shares.
    for group in all_groups:
        events[f'{group}_whatch_time'] = events['total_watchtime'] * events[f'{group}_part']

    per_viewer = events.groupby('viewer_uid')
    pieces = [
        per_viewer['median_age'].mean().rename('mean_median_age'),
        per_viewer['median_age'].median().rename('median_median_age'),
        per_viewer['total_watchtime'].median().rename('median_whatch_time'),
        per_viewer['total_watchtime'].mean().rename('mean_whatch_time'),
        per_viewer['male_whatch_time'].sum().rename('male_whatch_time'),
        per_viewer['female_whatch_time'].sum().rename('female_whatch_time'),
        per_viewer['author_new_sex_by_whatch_time'].apply(get_mode)
                                                   .rename('author_new_sex_by_whatch_time_mode'),
    ]
    pieces.extend(
        per_viewer[f'{group}_whatch_time'].sum().rename(f'{group}_whatch_time')
        for group in age_groups
    )
    user_info_by_watch_time = pd.concat(pieces, axis=1).reset_index()

    # Normalise the per-group sums by the viewer's male + female total.
    total = user_info_by_watch_time['male_whatch_time'] + user_info_by_watch_time['female_whatch_time']
    user_info_by_watch_time['full_whatch_duration'] = total
    for group in all_groups:
        user_info_by_watch_time[f'{group}_part_whatch_time'] = (
            user_info_by_watch_time[f'{group}_whatch_time'] / total
        )

    male_dominates = (
        user_info_by_watch_time['male_part_whatch_time']
        > user_info_by_watch_time['female_part_whatch_time']
    )
    user_info_by_watch_time['sex_by_whatch_time'] = male_dominates.map({True: 'male', False: 'female'})

    return user_info_by_watch_time

In [118]:
user_info_by_watch_time = get_user_info_by_watch_time()

In [119]:
user_info_by_watch_time

Unnamed: 0,viewer_uid,mean_median_age,median_median_age,median_whatch_time,mean_whatch_time,male_whatch_time,female_whatch_time,author_new_sex_by_whatch_time_mode,9_20_whatch_time,20_30_whatch_time,30_40_whatch_time,40_60_whatch_time,full_whatch_duration,male_part_whatch_time,female_part_whatch_time,9_20_part_whatch_time,20_30_part_whatch_time,30_40_part_whatch_time,40_60_part_whatch_time,sex_by_whatch_time
0,0,41.500000,42.0,1542.0,1602.111111,11634.799608,2784.200392,male,32.545378,1052.954486,5436.358443,7897.141693,14419.0,0.806908,0.193092,0.002257,0.073025,0.377027,0.547690,male
1,1,42.000000,42.0,92.0,92.000000,81.551146,10.448854,male,0.011604,7.375414,26.036225,58.576758,92.0,0.886425,0.113575,0.000126,0.080168,0.283002,0.636704,male
2,2,34.000000,34.0,1735.0,1735.000000,2678.988812,791.011188,male,103.331066,1372.873045,1200.020316,793.775573,3470.0,0.772043,0.227957,0.029778,0.395641,0.345827,0.228754,male
3,3,33.000000,33.0,1097.0,1119.500000,1979.827560,2498.172440,female,434.391780,1624.553975,1537.294755,881.759490,4478.0,0.442123,0.557877,0.097006,0.362786,0.343299,0.196909,female
4,4,28.115385,28.0,1402.5,1411.961538,26392.800859,47029.199141,female,3563.529611,40679.722419,23137.453675,6041.294295,73422.0,0.359467,0.640533,0.048535,0.554054,0.315130,0.082282,female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240011,11140828,38.000000,38.0,1475.0,1475.000000,692.749041,782.250959,female,53.902183,370.064804,558.310376,492.722638,1475.0,0.469660,0.530340,0.036544,0.250891,0.378516,0.334049,female
240012,11140869,38.000000,38.0,1305.0,1305.000000,612.906778,692.093222,female,47.689728,327.413267,493.962739,435.934266,1305.0,0.469660,0.530340,0.036544,0.250891,0.378516,0.334049,female
240013,11140872,40.500000,40.5,62.0,62.500000,173.073507,76.926493,male,2.381379,38.191885,86.092352,123.334384,250.0,0.692294,0.307706,0.009526,0.152768,0.344369,0.493338,male
240014,11140875,38.000000,38.0,1115.0,1115.000000,523.671309,591.328691,female,40.746396,279.743902,422.044792,372.464910,1115.0,0.469660,0.530340,0.036544,0.250891,0.378516,0.334049,female


## Займемся категориями

In [120]:
def get_cat_info():
    """Describe the audience of every video category using training viewers only.

    Reads the notebook-level ``data``, ``video`` and ``targets`` frames and the
    ``TRAIN_IDS`` array. Events of training viewers are joined to their video's
    ``category`` and to the viewer's ``sex`` / ``age`` targets, then
    aggregated per category.

    Returns:
        Frame with one row per ``category`` and the columns
        ``cat_sex_mode``, ``cat_median_age``, ``cat_male_duration``,
        ``cat_female_duration``, ``cat_9_20_duration`` .. ``cat_40_60_duration``
        (watch time summed per group), ``full_cat_duration`` (male + female
        watch time), the matching ``cat_*_part`` shares of that total, and
        ``cat_new_sex_by_whatch_time`` ('male' when the male share is strictly
        larger, otherwise 'female').
    """
    # Integer ages that fall into each bucket. Despite its label, the first bucket
    # starts at 1; ages outside 1..60 contribute to no bucket.
    age_buckets = {
        '9_20': range(1, 21),
        '20_30': range(21, 31),
        '30_40': range(31, 41),
        '40_60': range(41, 61),
    }

    # Only the columns that are actually aggregated below are carried through the joins.
    train_rows = data.loc[data['viewer_uid'].isin(TRAIN_IDS),
                          ['viewer_uid', 'rutube_video_id', 'total_watchtime']]
    events = (
        train_rows
        .merge(video[['rutube_video_id', 'category']], how='left', on='rutube_video_id')
        .merge(targets, on='viewer_uid', how='inner')
    )

    # Watch time attributed to each demographic group (0 for events outside the group).
    watchtime = events['total_watchtime']
    events['is_male'] = events['sex'].eq('male') * watchtime
    events['is_female'] = events['sex'].eq('female') * watchtime
    for label, ages in age_buckets.items():
        events[f'is_{label}'] = events['age'].isin(list(ages)) * watchtime

    by_category = events.groupby('category')
    aggregates = [
        by_category['sex'].apply(get_mode).rename('cat_sex_mode'),
        by_category['age'].median().rename('cat_median_age'),
        by_category['is_male'].sum().rename('cat_male_duration'),
        by_category['is_female'].sum().rename('cat_female_duration'),
    ]
    aggregates += [
        by_category[f'is_{label}'].sum().rename(f'cat_{label}_duration')
        for label in age_buckets
    ]
    cat_info = pd.concat(aggregates, axis=1).reset_index()

    # Shares of the category's total watch time; a zero total yields NaN shares.
    total = cat_info['cat_male_duration'] + cat_info['cat_female_duration']
    cat_info['full_cat_duration'] = total
    for label in ('male', 'female', *age_buckets):
        cat_info[f'cat_{label}_part'] = cat_info[f'cat_{label}_duration'] / total

    male_dominates = cat_info['cat_male_part'] > cat_info['cat_female_part']
    cat_info['cat_new_sex_by_whatch_time'] = male_dominates.map({True: 'male', False: 'female'})

    return cat_info

In [121]:
cat_info = get_cat_info()

In [122]:
cat_info

Unnamed: 0,category,cat_sex_mode,cat_median_age,cat_male_duration,cat_female_duration,cat_9_20_duration,cat_20_30_duration,cat_30_40_duration,cat_40_60_duration,full_cat_duration,cat_male_part,cat_female_part,cat_9_20_part,cat_20_30_part,cat_30_40_part,cat_40_60_part,cat_new_sex_by_whatch_time
0,Авто-мото,male,40.0,4347008,310541,54880,484644,2032231,2085794,4657549,0.933325,0.066675,0.011783,0.104056,0.436331,0.447831,male
1,Аниме,male,31.0,6251686,2608678,828293,3444727,3222859,1364485,8860364,0.705579,0.294421,0.093483,0.388779,0.363739,0.153999,male
2,Аудио,male,37.0,907229,1925848,9964,246083,2086813,490217,2833077,0.320227,0.679773,0.003517,0.086861,0.736589,0.173033,female
3,Аудиокниги,male,41.0,2144321,781980,40985,315665,750310,1819341,2926301,0.732775,0.267225,0.014006,0.107872,0.256402,0.62172,male
4,Бизнес и предпринимательство,male,37.0,1142961,336976,11612,158865,644238,665222,1479937,0.772304,0.227696,0.007846,0.107346,0.435314,0.449493,male
5,Видеоигры,male,33.0,9317765,4501432,881310,3044355,6281138,3612394,13819197,0.674262,0.325738,0.063774,0.220299,0.454523,0.261404,male
6,Детям,male,31.0,1509435,1673505,233774,1233430,1176110,539626,3182940,0.474227,0.525773,0.073446,0.387513,0.369504,0.169537,female
7,Дизайн,female,41.0,28583,41678,484,8145,13337,48295,70261,0.406812,0.593188,0.006889,0.115925,0.189821,0.687366,female
8,Еда,male,32.0,1449730,743505,34930,663258,912743,582304,2193235,0.661001,0.338999,0.015926,0.302411,0.416163,0.2655,male
9,Животные,male,39.0,97429,96964,9908,38662,75696,70127,194393,0.501196,0.498804,0.050969,0.198886,0.389397,0.360749,male


## Получим фичи из таблицы категорий

In [123]:
def get_cat_info_by_watch_time():
    """Build per-viewer features from the audience profile of the categories they watched.

    Reads the notebook-level ``data``, ``video`` and ``cat_info`` frames. Every
    event is joined to its category's row of ``cat_info``; the event's
    ``total_watchtime`` is split according to the category's ``cat_*_part``
    shares and summed per viewer.

    Returns:
        Frame with one row per ``viewer_uid``: the per-group watch time sums
        (``cat_*_whatch_time``), the mode of ``cat_new_sex_by_whatch_time``,
        ``cat_full_whatch_duration`` (male + female sums), the
        ``cat_*_part_whatch_time`` shares of that total, and
        ``cat_sex_by_whatch_time`` ('male' when the male share is strictly
        larger, otherwise 'female').
    """
    age_groups = ['9_20', '20_30', '30_40', '40_60']
    sex_groups = ['male', 'female']

    video_categories = video[['rutube_video_id', 'duration', 'category']]
    events = data.merge(video_categories, how='left', on='rutube_video_id')
    events = events.merge(cat_info, how='left', on='category')

    # Split each event's watch time by the category's audience shares.
    for group in sex_groups + age_groups:
        events[f'{group}_whatch_time'] = events['total_watchtime'] * events[f'cat_{group}_part']

    per_viewer = events.groupby('viewer_uid')
    pieces = [
        per_viewer['male_whatch_time'].sum().rename('cat_male_whatch_time'),
        per_viewer['female_whatch_time'].sum().rename('cat_female_whatch_time'),
        per_viewer['cat_new_sex_by_whatch_time'].apply(get_mode)
                                                .rename('cat_new_sex_by_whatch_time_mode'),
    ]
    pieces.extend(
        per_viewer[f'{group}_whatch_time'].sum().rename(f'cat_{group}_whatch_time')
        for group in age_groups
    )
    user_info_by_watch_time = pd.concat(pieces, axis=1).reset_index()

    # Normalise the per-group sums by the viewer's male + female total
    # (age shares first, then sex shares, which fixes the column order).
    total = (
        user_info_by_watch_time['cat_male_whatch_time']
        + user_info_by_watch_time['cat_female_whatch_time']
    )
    user_info_by_watch_time['cat_full_whatch_duration'] = total
    for group in age_groups + sex_groups:
        user_info_by_watch_time[f'cat_{group}_part_whatch_time'] = (
            user_info_by_watch_time[f'cat_{group}_whatch_time'] / total
        )

    male_dominates = (
        user_info_by_watch_time['cat_male_part_whatch_time']
        > user_info_by_watch_time['cat_female_part_whatch_time']
    )
    user_info_by_watch_time['cat_sex_by_whatch_time'] = male_dominates.map({True: 'male', False: 'female'})

    return user_info_by_watch_time

In [124]:
cat_info_by_watch_time = get_cat_info_by_watch_time()

In [125]:
cat_info_by_watch_time

Unnamed: 0,viewer_uid,cat_male_whatch_time,cat_female_whatch_time,cat_new_sex_by_whatch_time_mode,cat_9_20_whatch_time,cat_20_30_whatch_time,cat_30_40_whatch_time,cat_40_60_whatch_time,cat_full_whatch_duration,cat_9_20_part_whatch_time,cat_20_30_part_whatch_time,cat_30_40_part_whatch_time,cat_40_60_part_whatch_time,cat_male_part_whatch_time,cat_female_part_whatch_time,cat_sex_by_whatch_time
0,0,9944.401130,4474.598870,male,139.838608,2243.640596,5426.971887,6608.548909,14419.0,0.009698,0.155603,0.376376,0.458322,0.689673,0.310327,male
1,1,70.182646,21.817354,male,0.639686,10.296609,31.647355,49.416349,92.0,0.006953,0.111920,0.343993,0.537134,0.762855,0.237145,male
2,2,2647.106313,822.893687,male,24.127304,388.361242,1193.655677,1863.855777,3470.0,0.006953,0.111920,0.343993,0.537134,0.762855,0.237145,male
3,3,3159.582372,1318.417628,male,418.616668,1740.954153,1628.822766,689.606412,4478.0,0.093483,0.388779,0.363739,0.153999,0.705579,0.294421,male
4,4,32188.656091,41233.343909,female,2993.651702,30673.224205,26514.581785,13240.542308,73422.0,0.040773,0.417766,0.361126,0.180335,0.438406,0.561594,female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240011,11140828,1306.847328,168.152672,male,6.966518,135.843187,594.695209,737.495086,1475.0,0.004723,0.092097,0.403183,0.499997,0.885998,0.114002,male
240012,11140869,1156.227636,148.772364,male,6.163597,120.186684,526.154066,652.495653,1305.0,0.004723,0.092097,0.403183,0.499997,0.885998,0.114002,male
240013,11140872,137.912796,112.087204,male,7.844637,64.896804,99.641512,77.617047,250.0,0.031379,0.259587,0.398566,0.310468,0.551651,0.448349,male
240014,11140875,987.887980,127.112020,male,5.266215,102.688240,449.549259,557.496286,1115.0,0.004723,0.092097,0.403183,0.499997,0.885998,0.114002,male


## Объединим все таблицы в одну

In [126]:
# Join the author-based, profile/category-share and category-based feature tables per viewer.
user_wt_features = (
    user_info_by_watch_time
    .merge(user_features, how='left', on='viewer_uid')
    .merge(cat_info_by_watch_time, how='left', on='viewer_uid')
)
# Make the author-based sex mode a plain string column.
user_wt_features['author_new_sex_by_whatch_time_mode'] = (
    user_wt_features['author_new_sex_by_whatch_time_mode'].apply(str)
)
# Blend the category-based shares (weighted by coeff) with the author-based shares.
coeff = 0.8
user_wt_features = user_wt_features.assign(
    sum_male_parts=coeff * user_wt_features['cat_male_part_whatch_time']
    + user_wt_features['male_part_whatch_time'],
    sum_female_parts=coeff * user_wt_features['cat_female_part_whatch_time']
    + user_wt_features['female_part_whatch_time'],
)
user_wt_features

Unnamed: 0,viewer_uid,mean_median_age,median_median_age,median_whatch_time,mean_whatch_time,male_whatch_time,female_whatch_time,author_new_sex_by_whatch_time_mode,9_20_whatch_time,20_30_whatch_time,...,cat_full_whatch_duration,cat_9_20_part_whatch_time,cat_20_30_part_whatch_time,cat_30_40_part_whatch_time,cat_40_60_part_whatch_time,cat_male_part_whatch_time,cat_female_part_whatch_time,cat_sex_by_whatch_time,sum_male_parts,sum_female_parts
0,0,41.500000,42.0,1542.0,1602.111111,11634.799608,2784.200392,male,32.545378,1052.954486,...,14419.0,0.009698,0.155603,0.376376,0.458322,0.689673,0.310327,male,1.358646,0.441354
1,1,42.000000,42.0,92.0,92.000000,81.551146,10.448854,male,0.011604,7.375414,...,92.0,0.006953,0.111920,0.343993,0.537134,0.762855,0.237145,male,1.496709,0.303291
2,2,34.000000,34.0,1735.0,1735.000000,2678.988812,791.011188,male,103.331066,1372.873045,...,3470.0,0.006953,0.111920,0.343993,0.537134,0.762855,0.237145,male,1.382327,0.417673
3,3,33.000000,33.0,1097.0,1119.500000,1979.827560,2498.172440,female,434.391780,1624.553975,...,4478.0,0.093483,0.388779,0.363739,0.153999,0.705579,0.294421,male,1.006586,0.793414
4,4,28.115385,28.0,1402.5,1411.961538,26392.800859,47029.199141,female,3563.529611,40679.722419,...,73422.0,0.040773,0.417766,0.361126,0.180335,0.438406,0.561594,female,0.710192,1.089808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240011,11140828,38.000000,38.0,1475.0,1475.000000,692.749041,782.250959,female,53.902183,370.064804,...,1475.0,0.004723,0.092097,0.403183,0.499997,0.885998,0.114002,male,1.178459,0.621541
240012,11140869,38.000000,38.0,1305.0,1305.000000,612.906778,692.093222,female,47.689728,327.413267,...,1305.0,0.004723,0.092097,0.403183,0.499997,0.885998,0.114002,male,1.178459,0.621541
240013,11140872,40.500000,40.5,62.0,62.500000,173.073507,76.926493,male,2.381379,38.191885,...,250.0,0.031379,0.259587,0.398566,0.310468,0.551651,0.448349,male,1.133615,0.666385
240014,11140875,38.000000,38.0,1115.0,1115.000000,523.671309,591.328691,female,40.746396,279.743902,...,1115.0,0.004723,0.092097,0.403183,0.499997,0.885998,0.114002,male,1.178459,0.621541


In [127]:
user_wt_features.columns

Index(['viewer_uid', 'mean_median_age', 'median_median_age',
       'median_whatch_time', 'mean_whatch_time', 'male_whatch_time',
       'female_whatch_time', 'author_new_sex_by_whatch_time_mode',
       '9_20_whatch_time', '20_30_whatch_time', '30_40_whatch_time',
       '40_60_whatch_time', 'full_whatch_duration', 'male_part_whatch_time',
       'female_part_whatch_time', '9_20_part_whatch_time',
       '20_30_part_whatch_time', '30_40_part_whatch_time',
       '40_60_part_whatch_time', 'sex_by_whatch_time', 'Авто-мото', 'Аниме',
       'Аудио', 'Аудиокниги', 'Бизнес и предпринимательство', 'Видеоигры',
       'Детям', 'Дизайн', 'Еда', 'Животные', 'Здоровье', 'Интервью', 'Красота',
       'Культура', 'Лайфстайл', 'Лайфхаки', 'Люди и блоги', 'Музыка',
       'Мультфильмы', 'Наука', 'Недвижимость', 'Обзоры и распаковки товаров',
       'Обучение', 'Охота и рыбалка', 'Природа', 'Психология', 'Путешествия',
       'Развлечения', 'Разное', 'Сад и огород', 'Сериалы', 'Спорт',
       'Строи

## Делим выборку по пользователям так же, как и раньше

In [128]:
# Feature rows of each split, ordered by viewer id.
viewer_col = user_wt_features['viewer_uid']
X_train = user_wt_features[viewer_col.isin(TRAIN_IDS)].sort_values(by='viewer_uid')
X_val = user_wt_features[viewer_col.isin(VAL_IDS)].sort_values(by='viewer_uid')

X_test = user_wt_features[viewer_col.isin(TEST_IDS)].sort_values(by='viewer_uid')

# Target rows of the labelled splits, ordered the same way.
y_train = targets[targets['viewer_uid'].isin(TRAIN_IDS)].sort_values(by='viewer_uid')
y_val = targets[targets['viewer_uid'].isin(VAL_IDS)].sort_values(by='viewer_uid')

# Features and targets are paired purely by position after two independent sorts,
# so verify that both sides list exactly the same viewers in the same order.
assert pd.Index(X_train['viewer_uid']).equals(pd.Index(y_train['viewer_uid'])), \
    'X_train and y_train rows are not aligned by viewer_uid'
assert pd.Index(X_val['viewer_uid']).equals(pd.Index(y_val['viewer_uid'])), \
    'X_val and y_val rows are not aligned by viewer_uid'

In [129]:
# Columns handed to the model as categorical features.
cat_features = [
    'ua_device_type',
    'region',
    'ua_client_type',
    'ua_client_name',
    'ua_os',
]

# Model input columns; the order is kept exactly as it is passed to the classifier.
filtred_features = [
    # author-based median age and selected category shares
    'median_median_age',
    'Телепередачи',
    'Обучение',
    'Сериалы',
    'Аниме',
    'Видеоигры',
    'Развлечения',
    'Животные',
    'Разное',
    # watch time statistics and author-based sex shares
    'median_whatch_time',
    'mean_whatch_time',
    'full_whatch_duration',
    'male_part_whatch_time',
    'female_part_whatch_time',
    'ua_device_type',
    # category-based sex shares and sums
    'cat_male_part_whatch_time',
    'cat_female_part_whatch_time',
    'cat_male_whatch_time',
    'cat_female_whatch_time',
    # remaining profile columns and blended sex shares
    'region',
    'ua_client_type',
    'ua_client_name',
    'ua_os',
    'sum_male_parts',
    'sum_female_parts',
    # author-based age shares
    '9_20_part_whatch_time',
    '20_30_part_whatch_time',
    '30_40_part_whatch_time',
    '40_60_part_whatch_time',
    # category-based age shares
    'cat_9_20_part_whatch_time',
    'cat_20_30_part_whatch_time',
    'cat_30_40_part_whatch_time',
    'cat_40_60_part_whatch_time',
]

In [130]:
from catboost import CatBoostClassifier

# Sex classifier over the selected per-user features. The categorical columns
# are declared once, on the estimator, so fit() does not repeat them.
clf_sex = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.01,
    depth=8,
    eval_metric='Accuracy',
    cat_features=cat_features,
)

# The validation split serves as the evaluation set: training stops early once
# its accuracy has not improved for 60 consecutive iterations.
clf_sex.fit(
    X_train[filtred_features],
    y_train['sex'],
    eval_set=(X_val[filtred_features], y_val['sex']),
    early_stopping_rounds=60,
    verbose=20,
)

# The score below is computed on X_val / y_val, i.e. on the validation split
# (the same split used for early stopping), so it is labelled as such.
y_pred = clf_sex.predict(X_val[filtred_features])
accuracy = accuracy_score(y_val['sex'], y_pred)
print(f'Точность на валидационных данных: {accuracy:.4f}')

0:	learn: 0.7493143	test: 0.7286059	best: 0.7286059 (0)	total: 278ms	remaining: 4m 38s
20:	learn: 0.7505295	test: 0.7409105	best: 0.7409938 (8)	total: 4.64s	remaining: 3m 36s
40:	learn: 0.7510850	test: 0.7412716	best: 0.7414104 (38)	total: 8.35s	remaining: 3m 15s
60:	learn: 0.7520433	test: 0.7417437	best: 0.7418548 (55)	total: 12s	remaining: 3m 4s
80:	learn: 0.7526960	test: 0.7424381	best: 0.7424381 (80)	total: 15.7s	remaining: 2m 57s
100:	learn: 0.7537932	test: 0.7423270	best: 0.7428270 (91)	total: 19.4s	remaining: 2m 52s
120:	learn: 0.7545223	test: 0.7423826	best: 0.7428270 (91)	total: 23.1s	remaining: 2m 48s
140:	learn: 0.7556056	test: 0.7431047	best: 0.7434658 (138)	total: 26.8s	remaining: 2m 43s
160:	learn: 0.7563138	test: 0.7433269	best: 0.7434658 (138)	total: 30.4s	remaining: 2m 38s
180:	learn: 0.7568277	test: 0.7443824	best: 0.7443824 (180)	total: 34.1s	remaining: 2m 34s
200:	learn: 0.7571749	test: 0.7439936	best: 0.7443824 (180)	total: 37.7s	remaining: 2m 30s
220:	learn: 0.757

In [131]:
# Preview the validation rows restricted to the model's input columns.
X_val.loc[:, filtred_features]

Unnamed: 0,median_median_age,Телепередачи,Обучение,Сериалы,Аниме,Видеоигры,Развлечения,Животные,Разное,median_whatch_time,...,sum_male_parts,sum_female_parts,9_20_part_whatch_time,20_30_part_whatch_time,30_40_part_whatch_time,40_60_part_whatch_time,cat_9_20_part_whatch_time,cat_20_30_part_whatch_time,cat_30_40_part_whatch_time,cat_40_60_part_whatch_time
60004,31.0,1.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,4003.0,...,0.337032,1.462968,0.038532,0.415569,0.401297,0.144602,0.033980,0.399317,0.407358,0.159345
60009,38.0,0.944218,0.0,0.000000,0.0,0.0,0.0,0.0,0.039997,2260.0,...,0.986669,0.813331,0.010748,0.162594,0.398761,0.427898,0.033775,0.392196,0.405921,0.168108
60012,,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,42.0,...,,,,,,,0.013600,0.171676,0.374011,0.440714
60014,34.0,0.000000,0.0,0.603672,0.0,0.0,0.0,0.0,0.000000,3021.0,...,0.960364,0.839636,0.038611,0.368803,0.359496,0.233090,0.041723,0.397334,0.360006,0.200937
60015,37.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.0,0.000000,63.0,...,1.009127,0.790873,0.016312,0.191150,0.443634,0.348903,0.040388,0.426056,0.361580,0.171976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239988,38.0,0.000000,0.0,0.000000,0.0,0.0,1.0,0.0,0.000000,332.0,...,0.898245,0.901755,0.028120,0.210452,0.334584,0.426843,0.040781,0.272027,0.382907,0.304285
239991,34.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.0,0.000000,2787.5,...,0.858812,0.941188,0.026918,0.350631,0.388387,0.234064,0.040388,0.426056,0.361580,0.171976
239992,38.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,778.0,...,1.242595,0.557405,0.034299,0.210868,0.355236,0.399596,0.009088,0.123575,0.451155,0.416182
239996,32.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.0,0.000000,1169.0,...,0.744695,1.055305,0.026839,0.424513,0.343598,0.205050,0.040388,0.426056,0.361580,0.171976


In [132]:
# Preview the test rows restricted to the model's input columns.
X_test.loc[:, filtred_features]

Unnamed: 0,median_median_age,Телепередачи,Обучение,Сериалы,Аниме,Видеоигры,Развлечения,Животные,Разное,median_whatch_time,...,sum_male_parts,sum_female_parts,9_20_part_whatch_time,20_30_part_whatch_time,30_40_part_whatch_time,40_60_part_whatch_time,cat_9_20_part_whatch_time,cat_20_30_part_whatch_time,cat_30_40_part_whatch_time,cat_40_60_part_whatch_time
0,42.0,0.000000,0.185576,0.000000,0.0,0.0,0.0,0.0,0.0,1542.0,...,1.358646,0.441354,0.002257,0.073025,0.377027,0.547690,0.009698,0.155603,0.376376,0.458322
1,42.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,92.0,...,1.496709,0.303291,0.000126,0.080168,0.283002,0.636704,0.006953,0.111920,0.343993,0.537134
2,34.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,1735.0,...,1.382327,0.417673,0.029778,0.395641,0.345827,0.228754,0.006953,0.111920,0.343993,0.537134
3,33.0,0.000000,0.000000,0.000000,1.0,0.0,0.0,0.0,0.0,1097.0,...,1.006586,0.793414,0.097006,0.362786,0.343299,0.196909,0.093483,0.388779,0.363739,0.153999
4,28.0,0.000000,0.000000,0.905547,0.0,0.0,0.0,0.0,0.0,1402.5,...,0.710192,1.089808,0.048535,0.554054,0.315130,0.082282,0.040773,0.417766,0.361126,0.180335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59999,29.0,0.749605,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,367.0,...,0.389783,1.410217,0.054885,0.447395,0.383179,0.114540,0.036525,0.387041,0.394494,0.181940
60000,33.5,0.673792,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,403.5,...,0.626324,1.173676,0.017757,0.284683,0.390134,0.307425,0.038002,0.385089,0.297270,0.279639
60001,38.0,0.000000,1.000000,0.000000,0.0,0.0,0.0,0.0,0.0,1780.0,...,1.178459,0.621541,0.036544,0.250891,0.378516,0.334049,0.004723,0.092097,0.403183,0.499997
60002,31.0,1.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,2566.5,...,0.337032,1.462968,0.038532,0.415569,0.401297,0.144602,0.033980,0.399317,0.407358,0.159345


In [133]:
# Start the test-set prediction frame with the sex model's output.
predicts_test = pd.DataFrame().assign(
    sex=clf_sex.predict(X_test[filtred_features]),
)

In [134]:
# Validation-set predictions, one column per target.
# The sex column is predicted from clf_sex directly rather than copied from
# `y_pred`: that name is reassigned to the age predictions further down, so
# re-running this cell out of order would otherwise store age classes here.
predicts = pd.DataFrame()
predicts['sex'] = clf_sex.predict(X_val[filtred_features])

In [135]:
# Rank the sex model's inputs by CatBoost feature importance, highest first.
importances = clf_sex.get_feature_importance()
importance_df = (
    pd.DataFrame({
        'Feature': X_train[filtred_features].columns,
        'Importance': importances,
    })
    .sort_values(by='Importance', ascending=False)
)
# 60 exceeds the 33 entries of filtred_features, so head() keeps every row.
importance_df.head(60)

Unnamed: 0,Feature,Importance
13,female_part_whatch_time,26.513558
12,male_part_whatch_time,26.072457
14,ua_device_type,4.059385
22,ua_os,3.893147
20,ua_client_type,3.769693
17,cat_male_whatch_time,3.508808
23,sum_male_parts,3.104479
27,30_40_part_whatch_time,2.927065
9,median_whatch_time,2.331787
24,sum_female_parts,2.273773


## Предсказываем возрастной класс (age_class)

In [136]:
# Re-derive the train/validation features and targets for the age model.
# Each frame is filtered to one id set and ordered by viewer_uid.
# NOTE(review): X and y are matched only positionally through this ordering —
# assumes one row per user in both frames; confirm.
X_train, X_val = (
    user_wt_features[user_wt_features['viewer_uid'].isin(uids)].sort_values(by='viewer_uid')
    for uids in (TRAIN_IDS, VAL_IDS)
)
y_train, y_val = (
    targets[targets['viewer_uid'].isin(uids)].sort_values(by='viewer_uid')
    for uids in (TRAIN_IDS, VAL_IDS)
)

In [137]:
# Categorical inputs for the age model.
cat_features = ['ua_device_type', 'region', 'ua_client_type',
                'ua_client_name', 'ua_os']

# Input columns for the age model; list position is what the
# feature-importance table reports as the row label.
filtred_features = [
    'median_median_age',
    'Телепередачи', 'Обучение', 'Сериалы', 'Аниме',
    'Видеоигры', 'Развлечения', 'Животные', 'Разное',
    'median_whatch_time', 'mean_whatch_time', 'full_whatch_duration',
    'male_part_whatch_time', 'female_part_whatch_time',
    'ua_device_type',
    'cat_male_part_whatch_time', 'cat_female_part_whatch_time',
    'cat_male_whatch_time', 'cat_female_whatch_time',
    'region', 'ua_client_type', 'ua_client_name', 'ua_os',
    'sum_male_parts', 'sum_female_parts',
    '9_20_part_whatch_time', '20_30_part_whatch_time',
    '30_40_part_whatch_time', '40_60_part_whatch_time',
    'cat_9_20_part_whatch_time', 'cat_20_30_part_whatch_time',
    'cat_30_40_part_whatch_time', 'cat_40_60_part_whatch_time',
]

In [138]:
from catboost import CatBoostClassifier

# Age-class classifier over the same selected features. The categorical
# columns are declared once, on the estimator, so fit() does not repeat them.
# NOTE(review): in the recorded run the validation accuracy is still improving
# at the final iteration, so the 200-iteration budget (not early stopping)
# ends training — consider raising it.
clf_age = CatBoostClassifier(
    iterations=200,
    learning_rate=0.01,
    depth=8,
    eval_metric='Accuracy',
    cat_features=cat_features,
)

# The validation split serves as the evaluation set: training stops early once
# its accuracy has not improved for 40 consecutive iterations.
clf_age.fit(
    X_train[filtred_features],
    y_train['age_class'],
    eval_set=(X_val[filtred_features], y_val['age_class']),
    early_stopping_rounds=40,
    verbose=20,
)

# The score below is computed on X_val / y_val, i.e. on the validation split
# (the same split used for early stopping), so it is labelled as such.
y_pred = clf_age.predict(X_val[filtred_features])
accuracy = accuracy_score(y_val['age_class'], y_pred)
print(f'Точность на валидационных данных: {accuracy:.4f}')

0:	learn: 0.4850114	test: 0.4441297	best: 0.4441297 (0)	total: 553ms	remaining: 1m 50s
20:	learn: 0.5002257	test: 0.4581007	best: 0.4581563 (17)	total: 8.67s	remaining: 1m 13s
40:	learn: 0.5007256	test: 0.4572674	best: 0.4581563 (17)	total: 16.3s	remaining: 1m 3s
60:	learn: 0.5013228	test: 0.4581563	best: 0.4583785 (54)	total: 23.7s	remaining: 54.1s
80:	learn: 0.5022186	test: 0.4588229	best: 0.4591840 (72)	total: 31.4s	remaining: 46.1s
100:	learn: 0.5026422	test: 0.4597117	best: 0.4597117 (100)	total: 38.9s	remaining: 38.2s
120:	learn: 0.5032741	test: 0.4602672	best: 0.4602672 (120)	total: 46.6s	remaining: 30.5s
140:	learn: 0.5039060	test: 0.4611005	best: 0.4611560 (134)	total: 54.4s	remaining: 22.8s
160:	learn: 0.5047810	test: 0.4609894	best: 0.4611560 (134)	total: 1m 2s	remaining: 15s
180:	learn: 0.5053087	test: 0.4614338	best: 0.4615449 (164)	total: 1m 9s	remaining: 7.34s
199:	learn: 0.5058087	test: 0.4622670	best: 0.4622948 (192)	total: 1m 17s	remaining: 0us

bestTest = 0.462294808

In [139]:
# Add the age model's class predictions for the test users.
age_class_test = clf_age.predict(X_test[filtred_features])
predicts_test['age_class'] = age_class_test

In [140]:
# Rank the age model's inputs by CatBoost feature importance, highest first.
importances = clf_age.get_feature_importance()
feature_names = X_train[filtred_features].columns
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)
# 60 exceeds the 33 entries of filtred_features, so head() keeps every row.
importance_df.head(60)

Unnamed: 0,Feature,Importance
28,40_60_part_whatch_time,24.954375
27,30_40_part_whatch_time,22.410678
26,20_30_part_whatch_time,21.924745
25,9_20_part_whatch_time,6.933751
0,median_median_age,4.509799
22,ua_os,2.799587
8,Разное,2.551214
14,ua_device_type,1.612264
21,ua_client_name,1.603575
30,cat_20_30_part_whatch_time,1.330638


In [141]:
# Store the validation age-class predictions next to the sex predictions.
# Predicted from clf_age directly instead of copied from `y_pred`: that name is
# shared with the sex model's cell, so its content depends on which training
# cell ran last.
predicts['age_class'] = clf_age.predict(X_val[filtred_features])

In [142]:
# Display the validation predictions collected so far ('sex' and 'age_class').
predicts

Unnamed: 0,sex,age_class
0,female,1
1,male,3
2,male,0
3,male,1
4,male,2
...,...,...
35998,female,3
35999,male,1
36000,male,3
36001,female,1


In [143]:
# Validation metrics: accuracy for sex, weighted F1 for age_class.
accuracy = accuracy_score(y_val['sex'], predicts['sex'])
f1_weighted = f1_score(
    y_val['age_class'],
    predicts['age_class'],
    average='weighted',
)

# Blend the two metrics: 70% weighted F1 plus 30% accuracy.
final_score = 0.3 * accuracy + 0.7 * f1_weighted
print(f'Weighted F1 = {f1_weighted:.4f} \nAccuracy = {accuracy:.4f} \nFinal Score = {final_score:.4f}')

Weighted F1 = 0.4484 
Accuracy = 0.7444 
Final Score = 0.5372


In [144]:
# Rebuild the test-set prediction frame from scratch with both models' output.
predicts_test = pd.DataFrame().assign(
    sex=clf_sex.predict(X_test[filtred_features]),
    age_class=clf_age.predict(X_test[filtred_features]),
)

In [146]:
# 'age' is a constant placeholder: 0 for every row. Assign the scalar directly
# instead of multiplying the predicted labels by zero, which only produces 0
# when the labels happen to be numeric.
predicts_test['age'] = 0

In [149]:
# Inspect the test users' ids in the row order of X_test.
X_test.loc[:, 'viewer_uid']

0            0
1            1
2            2
3            3
4            4
         ...  
59999    59999
60000    60000
60001    60001
60002    60002
60003    60003
Name: viewer_uid, Length: 60004, dtype: int64

In [147]:
# Display the test-set predictions ('sex', 'age_class' and the constant 'age').
predicts_test

Unnamed: 0,sex,age_class,age
0,male,3,0
1,male,3,0
2,male,1,0
3,male,2,0
4,female,1,0
...,...,...,...
59999,female,1,0
60000,female,3,0
60001,female,3,0
60002,female,1,0


In [152]:
# Assemble the submission table in one step. Raw arrays (.values) are used so
# rows are matched by position: predicts_test was produced from X_test in the
# same row order.
submission = pd.DataFrame({
    'viewer_uid': X_test['viewer_uid'].values,
    'age': predicts_test['age'].values,
    'sex': predicts_test['sex'].values,
    'age_class': predicts_test['age_class'].values,
})

In [153]:
# Sanity-check the first rows of the submission table before writing it out.
submission.head()

Unnamed: 0,viewer_uid,age,sex,age_class
0,0,0,male,3
1,1,0,male,3
2,2,0,male,1
3,3,0,male,2
4,4,0,female,1


In [154]:
# Write the submission to 'submission.csv' without the positional index column.
submission.to_csv('submission.csv', index=False)