# Предобработка

In [1]:
import pandas as pd
import numpy as np

In [2]:
events = pd.read_csv("data/train_events.csv")
targets = pd.read_csv("data/train_targets.csv")
video = pd.read_csv("data/video_info_v2.csv")

In [3]:
train_events = pd.merge(events, video, on='rutube_video_id', how='left')
dataset = pd.merge(train_events, targets, on='viewer_uid', how='left')

dataset.head(5)

Unnamed: 0,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,total_watchtime,rutube_video_id,viewer_uid,title,category,duration,author_id,age,sex,age_class
0,2024-06-01 06:40:58+03:00,Chelyabinsk,desktop,browser,Windows,Yandex Browser,1883,video_133074,10067243,Папа с особенностями. Мужское / Женское. Выпус...,Телепередачи,2456534,1009219,20,female,0
1,2024-06-01 19:33:24+03:00,Bashkortostan Republic,smartphone,mobile app,Android,Rutube,512,video_362960,10245341,Comedy Club: Мальдивы | Андрей Бебуришвили,Юмор,519211,1006760,40,female,2
2,2024-06-01 21:30:43+03:00,St.-Petersburg,desktop,browser,Windows,Chrome,5647,video_96775,10894333,"Новая Битва экстрасенсов, 24 сезон, 11 выпуск",Телепередачи,5518280,1009257,23,male,1
3,2024-06-01 23:03:42+03:00,Moscow,smartphone,mobile app,Android,Rutube,1521,video_161610,10029092,Сергей Орлов-снял дом!!!,Разное,1522069,1058671,41,male,3
4,2024-06-01 22:48:09+03:00,Moscow,smartphone,mobile app,Android,Rutube,71,video_116245,10452976,Ищем сокровища в Полевском | Уральская Флоренц...,Путешествия,1249920,1020020,38,female,2


In [4]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

dataset['rutube_video_id'] = dataset['rutube_video_id'].str.replace('video_', '')
dataset['rutube_video_id'] = dataset['rutube_video_id'].astype(int)

dataset['event_timestamp'] = pd.to_datetime(dataset['event_timestamp'])

dataset.drop(['sex'], inplace=True, axis=1)

dataset.head(5)

Unnamed: 0,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,total_watchtime,rutube_video_id,viewer_uid,title,category,duration,author_id,age,age_class
0,2024-06-01 06:40:58+03:00,Chelyabinsk,desktop,browser,Windows,Yandex Browser,1883,133074,10067243,Папа с особенностями. Мужское / Женское. Выпус...,Телепередачи,2456534,1009219,20,0
1,2024-06-01 19:33:24+03:00,Bashkortostan Republic,smartphone,mobile app,Android,Rutube,512,362960,10245341,Comedy Club: Мальдивы | Андрей Бебуришвили,Юмор,519211,1006760,40,2
2,2024-06-01 21:30:43+03:00,St.-Petersburg,desktop,browser,Windows,Chrome,5647,96775,10894333,"Новая Битва экстрасенсов, 24 сезон, 11 выпуск",Телепередачи,5518280,1009257,23,1
3,2024-06-01 23:03:42+03:00,Moscow,smartphone,mobile app,Android,Rutube,1521,161610,10029092,Сергей Орлов-снял дом!!!,Разное,1522069,1058671,41,3
4,2024-06-01 22:48:09+03:00,Moscow,smartphone,mobile app,Android,Rutube,71,116245,10452976,Ищем сокровища в Полевском | Уральская Флоренц...,Путешествия,1249920,1020020,38,2


In [5]:
dataset['duration'] = dataset['duration'] // 1000

In [6]:
dataset.head(5)

Unnamed: 0,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,total_watchtime,rutube_video_id,viewer_uid,title,category,duration,author_id,age,age_class
0,2024-06-01 06:40:58+03:00,Chelyabinsk,desktop,browser,Windows,Yandex Browser,1883,133074,10067243,Папа с особенностями. Мужское / Женское. Выпус...,Телепередачи,2456,1009219,20,0
1,2024-06-01 19:33:24+03:00,Bashkortostan Republic,smartphone,mobile app,Android,Rutube,512,362960,10245341,Comedy Club: Мальдивы | Андрей Бебуришвили,Юмор,519,1006760,40,2
2,2024-06-01 21:30:43+03:00,St.-Petersburg,desktop,browser,Windows,Chrome,5647,96775,10894333,"Новая Битва экстрасенсов, 24 сезон, 11 выпуск",Телепередачи,5518,1009257,23,1
3,2024-06-01 23:03:42+03:00,Moscow,smartphone,mobile app,Android,Rutube,1521,161610,10029092,Сергей Орлов-снял дом!!!,Разное,1522,1058671,41,3
4,2024-06-01 22:48:09+03:00,Moscow,smartphone,mobile app,Android,Rutube,71,116245,10452976,Ищем сокровища в Полевском | Уральская Флоренц...,Путешествия,1249,1020020,38,2


In [7]:
columns_to_sum = ['total_watchtime']
other_columns = [col for col in dataset.columns if col not in columns_to_sum + ['rutube_video_id', 'viewer_uid']]

agg_dict = {col: 'sum' for col in columns_to_sum}
agg_dict.update({col: 'first' for col in other_columns})

dataset = dataset.groupby(['rutube_video_id', 'viewer_uid'], as_index=False).agg(agg_dict)

In [8]:
dataset.head(5)

Unnamed: 0,rutube_video_id,viewer_uid,total_watchtime,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,title,category,duration,author_id,age,age_class
0,5,10146165,4018,2024-06-04 17:18:43+03:00,Kemerovo Oblast,smartphone,mobile app,Android,Rutube,Безумная любовь 1 сезон 2 серия,Сериалы,4096,1095392,45,3
1,6,10003283,360,2024-06-16 16:50:40+03:00,Irkutsk Oblast,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,37,2
2,6,10080455,360,2024-06-20 07:44:29+03:00,Moscow,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,44,3
3,6,10128191,282,2024-06-08 00:23:46+03:00,Moscow,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,37,2
4,6,10207004,150,2024-06-27 06:33:34+03:00,Krasnodar Krai,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,38,2


In [9]:
dataset.loc[dataset['viewer_uid'] == 10075717]

Unnamed: 0,rutube_video_id,viewer_uid,total_watchtime,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,title,category,duration,author_id,age,age_class
27,11,10075717,1333,2024-06-02 19:26:55+03:00,Moscow,desktop,browser,Windows,Chrome,Рик и Морти / Rick and Morty – 2 сезон 8 серия,Сериалы,1352,1091887,33,2
218020,66339,10075717,762,2024-06-08 02:38:54+03:00,Moscow,desktop,browser,Windows,Chrome,Рик и Морти / Rick and Morty – 4 сезон 9 серия,Сериалы,1355,1091887,33,2
257322,79438,10075717,32,2024-06-02 19:25:40+03:00,Moscow,desktop,browser,Windows,Chrome,Рик и Морти / Rick and Morty – 2 сезон 7 серия,Сериалы,1376,1091887,33,2
415515,118469,10075717,1432,2024-06-03 14:18:45+03:00,Moscow,desktop,browser,Windows,Chrome,Рик и Морти / Rick and Morty – 3 сезон 2 серия,Сериалы,1358,1091887,33,2
415834,118615,10075717,1484,2024-06-07 02:23:59+03:00,Moscow,desktop,browser,Windows,Chrome,Рик и Морти / Rick and Morty – 4 сезон 5 серия,Сериалы,1303,1091887,33,2
433093,125797,10075717,1539,2024-06-07 21:52:31+03:00,Moscow,desktop,browser,Windows,Chrome,Рик и Морти / Rick and Morty – 4 сезон 7 серия,Сериалы,1276,1091887,33,2
450901,131146,10075717,1467,2024-06-04 16:38:49+03:00,Moscow,desktop,browser,Windows,Chrome,Рик и Морти / Rick and Morty – 3 сезон 6 серия,Сериалы,1360,1091887,33,2
545039,153425,10075717,1333,2024-06-05 03:25:20+03:00,Moscow,desktop,browser,Windows,Chrome,Рик и Морти / Rick and Morty – 3 сезон 8 серия,Сериалы,1326,1091887,33,2
558935,156346,10075717,1383,2024-06-05 02:41:11+03:00,Moscow,desktop,browser,Windows,Chrome,Рик и Морти / Rick and Morty – 3 сезон 7 серия,Сериалы,1335,1091887,33,2
633824,183875,10075717,1374,2024-06-05 04:32:23+03:00,Moscow,desktop,browser,Windows,Chrome,Рик и Морти / Rick and Morty – 3 сезон 9 серия,Сериалы,1339,1091887,33,2


In [10]:
dataset['watch_percentage'] = (dataset['total_watchtime'] / dataset['duration']) * 100
dataset['watch_percentage'] = dataset['watch_percentage'].apply(lambda x: min(round(x), 100))

In [11]:
dataset.head(5)

Unnamed: 0,rutube_video_id,viewer_uid,total_watchtime,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,title,category,duration,author_id,age,age_class,watch_percentage
0,5,10146165,4018,2024-06-04 17:18:43+03:00,Kemerovo Oblast,smartphone,mobile app,Android,Rutube,Безумная любовь 1 сезон 2 серия,Сериалы,4096,1095392,45,3,98
1,6,10003283,360,2024-06-16 16:50:40+03:00,Irkutsk Oblast,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,37,2,99
2,6,10080455,360,2024-06-20 07:44:29+03:00,Moscow,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,44,3,99
3,6,10128191,282,2024-06-08 00:23:46+03:00,Moscow,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,37,2,77
4,6,10207004,150,2024-06-27 06:33:34+03:00,Krasnodar Krai,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,38,2,41


In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1581972 entries, 0 to 1581971
Data columns (total 16 columns):
 #   Column            Non-Null Count    Dtype                    
---  ------            --------------    -----                    
 0   rutube_video_id   1581972 non-null  int32                    
 1   viewer_uid        1581972 non-null  int64                    
 2   total_watchtime   1581972 non-null  int64                    
 3   event_timestamp   1581972 non-null  datetime64[ns, UTC+03:00]
 4   region            1581972 non-null  object                   
 5   ua_device_type    1581972 non-null  object                   
 6   ua_client_type    1581972 non-null  object                   
 7   ua_os             1477751 non-null  object                   
 8   ua_client_name    1581972 non-null  object                   
 9   title             1581972 non-null  object                   
 10  category          1581972 non-null  object                   
 11  duration   

In [13]:
dataset['number_of_videos'] = dataset.groupby('viewer_uid')['viewer_uid'].transform('count')
dataset.head(5)

Unnamed: 0,rutube_video_id,viewer_uid,total_watchtime,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,title,category,duration,author_id,age,age_class,watch_percentage,number_of_videos
0,5,10146165,4018,2024-06-04 17:18:43+03:00,Kemerovo Oblast,smartphone,mobile app,Android,Rutube,Безумная любовь 1 сезон 2 серия,Сериалы,4096,1095392,45,3,98,9
1,6,10003283,360,2024-06-16 16:50:40+03:00,Irkutsk Oblast,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,37,2,99,7
2,6,10080455,360,2024-06-20 07:44:29+03:00,Moscow,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,44,3,99,9
3,6,10128191,282,2024-06-08 00:23:46+03:00,Moscow,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,37,2,77,6
4,6,10207004,150,2024-06-27 06:33:34+03:00,Krasnodar Krai,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,38,2,41,9


In [14]:
dataset['total_time'] = dataset.groupby('viewer_uid')['total_watchtime'].transform('sum')
dataset['mean_taim'] = dataset['total_time'] // dataset['number_of_videos']
dataset.head(5)

Unnamed: 0,rutube_video_id,viewer_uid,total_watchtime,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,title,category,duration,author_id,age,age_class,watch_percentage,number_of_videos,total_time,mean_taim
0,5,10146165,4018,2024-06-04 17:18:43+03:00,Kemerovo Oblast,smartphone,mobile app,Android,Rutube,Безумная любовь 1 сезон 2 серия,Сериалы,4096,1095392,45,3,98,9,35474,3941
1,6,10003283,360,2024-06-16 16:50:40+03:00,Irkutsk Oblast,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,37,2,99,7,2521,360
2,6,10080455,360,2024-06-20 07:44:29+03:00,Moscow,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,44,3,99,9,9112,1012
3,6,10128191,282,2024-06-08 00:23:46+03:00,Moscow,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,37,2,77,6,4554,759
4,6,10207004,150,2024-06-27 06:33:34+03:00,Krasnodar Krai,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,Лайфстайл,365,1002444,38,2,41,9,46589,5176


In [15]:
dataset['event_timestamp'] = pd.to_datetime(dataset['event_timestamp'])
dataset['day_of_week'] = dataset['event_timestamp'].dt.day_name()

In [16]:
from data.time_zones import region_timezones
from pytz import timezone


# Функция для конвертации московского времени в местное
def convert_to_local_time(row):
    moscow_time = pd.to_datetime(row['event_timestamp'])
    local_timezone = timezone(region_timezones.get(row['region'], 'Europe/Moscow'))
    
    # Конвертация московского времени в местное
    moscow_time = moscow_time.tz_convert('Europe/Moscow')  # Время уже в зоне Москвы
    local_time = moscow_time.astimezone(local_timezone)
    
    local_hour = local_time.hour + local_time.minute / 60  # Учитываем минуты для округления
    return round(local_hour)

# Применяем функцию к каждому ряду и создаем новый столбец
dataset['local_time'] = dataset.apply(convert_to_local_time, axis=1)

In [17]:
dataset.head(5)

Unnamed: 0,rutube_video_id,viewer_uid,total_watchtime,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,title,...,duration,author_id,age,age_class,watch_percentage,number_of_videos,total_time,mean_taim,day_of_week,local_time
0,5,10146165,4018,2024-06-04 17:18:43+03:00,Kemerovo Oblast,smartphone,mobile app,Android,Rutube,Безумная любовь 1 сезон 2 серия,...,4096,1095392,45,3,98,9,35474,3941,Tuesday,21
1,6,10003283,360,2024-06-16 16:50:40+03:00,Irkutsk Oblast,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,...,365,1002444,37,2,99,7,2521,360,Sunday,22
2,6,10080455,360,2024-06-20 07:44:29+03:00,Moscow,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,...,365,1002444,44,3,99,9,9112,1012,Thursday,8
3,6,10128191,282,2024-06-08 00:23:46+03:00,Moscow,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,...,365,1002444,37,2,77,6,4554,759,Saturday,0
4,6,10207004,150,2024-06-27 06:33:34+03:00,Krasnodar Krai,smartphone,mobile app,Android,Rutube,Нидерланды - Ирландия. Обзор отборочного матча...,...,365,1002444,38,2,41,9,46589,5176,Thursday,7


In [18]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1581972 entries, 0 to 1581971
Data columns (total 21 columns):
 #   Column            Non-Null Count    Dtype                    
---  ------            --------------    -----                    
 0   rutube_video_id   1581972 non-null  int32                    
 1   viewer_uid        1581972 non-null  int64                    
 2   total_watchtime   1581972 non-null  int64                    
 3   event_timestamp   1581972 non-null  datetime64[ns, UTC+03:00]
 4   region            1581972 non-null  object                   
 5   ua_device_type    1581972 non-null  object                   
 6   ua_client_type    1581972 non-null  object                   
 7   ua_os             1477751 non-null  object                   
 8   ua_client_name    1581972 non-null  object                   
 9   title             1581972 non-null  object                   
 10  category          1581972 non-null  object                   
 11  duration   

In [19]:
dataset['day_of_week'] = label_encoder.fit_transform(dataset['day_of_week'])

dataset.drop(['event_timestamp','title'], inplace=True, axis=1)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1581972 entries, 0 to 1581971
Data columns (total 19 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   rutube_video_id   1581972 non-null  int32 
 1   viewer_uid        1581972 non-null  int64 
 2   total_watchtime   1581972 non-null  int64 
 3   region            1581972 non-null  object
 4   ua_device_type    1581972 non-null  object
 5   ua_client_type    1581972 non-null  object
 6   ua_os             1477751 non-null  object
 7   ua_client_name    1581972 non-null  object
 8   category          1581972 non-null  object
 9   duration          1581972 non-null  int64 
 10  author_id         1581972 non-null  int64 
 11  age               1581972 non-null  int64 
 12  age_class         1581972 non-null  int64 
 13  watch_percentage  1581972 non-null  int64 
 14  number_of_videos  1581972 non-null  int64 
 15  total_time        1581972 non-null  int64 
 16  mean_taim         

In [20]:
dataset.head(5)

Unnamed: 0,rutube_video_id,viewer_uid,total_watchtime,region,ua_device_type,ua_client_type,ua_os,ua_client_name,category,duration,author_id,age,age_class,watch_percentage,number_of_videos,total_time,mean_taim,day_of_week,local_time
0,5,10146165,4018,Kemerovo Oblast,smartphone,mobile app,Android,Rutube,Сериалы,4096,1095392,45,3,98,9,35474,3941,5,21
1,6,10003283,360,Irkutsk Oblast,smartphone,mobile app,Android,Rutube,Лайфстайл,365,1002444,37,2,99,7,2521,360,3,22
2,6,10080455,360,Moscow,smartphone,mobile app,Android,Rutube,Лайфстайл,365,1002444,44,3,99,9,9112,1012,4,8
3,6,10128191,282,Moscow,smartphone,mobile app,Android,Rutube,Лайфстайл,365,1002444,37,2,77,6,4554,759,2,0
4,6,10207004,150,Krasnodar Krai,smartphone,mobile app,Android,Rutube,Лайфстайл,365,1002444,38,2,41,9,46589,5176,4,7


In [21]:
from data.region import region
from data.ua_client_name_to_num import client_name
from data.ua_os_type_to_num import os_type


# Переведите название регионов в нижний регистр
dataset['region'] = dataset['region'].str.lower()
dataset['region'] = dataset['region'].replace(region)

dataset['ua_client_name'] = dataset['ua_client_name'].str.lower()
dataset['ua_client_name'] = dataset['ua_client_name'].replace(client_name)

dataset['ua_os'] = dataset['ua_os'].str.lower()
dataset['ua_os'] = dataset['ua_os'].replace(os_type)

  dataset['region'] = dataset['region'].replace(region)
  dataset['ua_client_name'] = dataset['ua_client_name'].replace(client_name)
  dataset['ua_os'] = dataset['ua_os'].replace(os_type)


In [22]:
dataset = pd.get_dummies(dataset, columns=['ua_device_type'])
dataset = pd.get_dummies(dataset, columns=['ua_client_type'])
dataset = pd.get_dummies(dataset, columns=['category'])
dataset = pd.get_dummies(dataset, columns=['region'])


In [23]:

dataset = dataset.apply(lambda x: x.astype(int) if x.dtype == bool else x)
dataset['ua_os'] = dataset['ua_os'].fillna(1).astype(int)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1581972 entries, 0 to 1581971
Data columns (total 69 columns):
 #   Column                                 Non-Null Count    Dtype
---  ------                                 --------------    -----
 0   rutube_video_id                        1581972 non-null  int32
 1   viewer_uid                             1581972 non-null  int64
 2   total_watchtime                        1581972 non-null  int64
 3   ua_os                                  1581972 non-null  int32
 4   ua_client_name                         1581972 non-null  int64
 5   duration                               1581972 non-null  int64
 6   author_id                              1581972 non-null  int64
 7   age                                    1581972 non-null  int64
 8   age_class                              1581972 non-null  int64
 9   watch_percentage                       1581972 non-null  int64
 10  number_of_videos                       1581972 non-null  int64
 11

In [24]:
dataset.head(5)

Unnamed: 0,rutube_video_id,viewer_uid,total_watchtime,ua_os,ua_client_name,duration,author_id,age,age_class,watch_percentage,...,category_Эзотерика,category_Юмор,region_1,region_2,region_3,region_4,region_5,region_6,region_7,region_8
0,5,10146165,4018,2,7,4096,1095392,45,3,98,...,0,0,0,0,0,0,0,1,0,0
1,6,10003283,360,2,7,365,1002444,37,2,99,...,0,0,0,0,0,0,0,1,0,0
2,6,10080455,360,2,7,365,1002444,44,3,99,...,0,0,1,0,0,0,0,0,0,0
3,6,10128191,282,2,7,365,1002444,37,2,77,...,0,0,1,0,0,0,0,0,0,0
4,6,10207004,150,2,7,365,1002444,38,2,41,...,0,0,0,0,1,0,0,0,0,0


# Обучение


In [58]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
dataset = dataset.sort_values(by='viewer_uid')
y = dataset['age_class']
X = dataset.drop(['age', 'age_class'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
id = X_test['viewer_uid']
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [29]:
from sklearn.metrics import f1_score
def fff(y_test, y_pred):
    print(f1_score(y_test, y_pred, average='micro'))
    print(f1_score(y_test, y_pred, average='weighted'))
    print(f1_score(y_test, y_pred, average='macro'))

In [33]:
# CatBoost Example
from catboost import CatBoostClassifier

# Train a CatBoost model
catboost = CatBoostClassifier(iterations=100, learning_rate=0.01, max_depth=8)
catboost.fit(X_train, y_train)

# Make predictions on the test set
y_pred = catboost.predict(X_test)
print(classification_report(y_test, y_pred))
fff(y_test, y_pred)

0:	learn: 1.3812346	total: 774ms	remaining: 1m 16s
1:	learn: 1.3763641	total: 1.49s	remaining: 1m 12s
2:	learn: 1.3716140	total: 2.19s	remaining: 1m 10s
3:	learn: 1.3668704	total: 2.9s	remaining: 1m 9s
4:	learn: 1.3622262	total: 3.61s	remaining: 1m 8s
5:	learn: 1.3576742	total: 4.35s	remaining: 1m 8s
6:	learn: 1.3532022	total: 5.09s	remaining: 1m 7s
7:	learn: 1.3488336	total: 5.81s	remaining: 1m 6s
8:	learn: 1.3445167	total: 6.56s	remaining: 1m 6s
9:	learn: 1.3403219	total: 7.3s	remaining: 1m 5s
10:	learn: 1.3362141	total: 8.03s	remaining: 1m 5s
11:	learn: 1.3321461	total: 8.76s	remaining: 1m 4s
12:	learn: 1.3281535	total: 9.51s	remaining: 1m 3s
13:	learn: 1.3242744	total: 10.3s	remaining: 1m 3s
14:	learn: 1.3204048	total: 11s	remaining: 1m 2s
15:	learn: 1.3166872	total: 11.7s	remaining: 1m 1s
16:	learn: 1.3129628	total: 12.5s	remaining: 1m
17:	learn: 1.3093611	total: 13.3s	remaining: 1m
18:	learn: 1.3057861	total: 14s	remaining: 59.7s
19:	learn: 1.3022454	total: 14.7s	remaining: 59s
2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00     24838
           1       0.46      0.18      0.26    122642
           2       0.36      0.85      0.51    109369
           3       0.37      0.05      0.09     59546

    accuracy                           0.38    316395
   macro avg       0.30      0.27      0.21    316395
weighted avg       0.37      0.38      0.29    316395

0.37594778678550544
0.29347384075638455
0.21396161017157708


In [38]:
# XGBoost Example
from xgboost import XGBClassifier

# Train an XGBoost model
xgb = XGBClassifier(n_estimators=200, max_depth=10, learning_rate=0.01)
xgb.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred))
fff(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.06      0.03      0.04     24838
           1       0.44      0.33      0.38    122642
           2       0.36      0.59      0.44    109369
           3       0.32      0.16      0.21     59546

    accuracy                           0.37    316395
   macro avg       0.29      0.28      0.27    316395
weighted avg       0.36      0.37      0.34    316395

0.3652807408460943
0.3441999435890169
0.2696598722805282


In [77]:
xgb.save_model("xgb.json")

In [79]:
model_xgb_2 = XGBClassifier()
model_xgb_2.load_model("xgb.json")

In [80]:
y_pred = model_xgb_2.predict(X_test)
print(classification_report(y_test, y_pred))
fff(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.06      0.03      0.04     24838
           1       0.44      0.33      0.38    122642
           2       0.36      0.59      0.44    109369
           3       0.32      0.16      0.21     59546

    accuracy                           0.37    316395
   macro avg       0.29      0.28      0.27    316395
weighted avg       0.36      0.37      0.34    316395

0.3652807408460943
0.3441999435890169
0.2696598722805282


In [72]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bagging = BaggingClassifier(DecisionTreeClassifier(), n_estimators=100, random_state=42)
bagging.fit(X_train, y_train)

y_pred = bagging.predict(X_test)
print(classification_report(y_test, y_pred))
fff(y_test, y_pred)

KeyboardInterrupt: 

In [42]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=30, max_depth=8).fit(X_train, y_train)
y_pred = gbc.predict(X_test)
print(classification_report(y_test, y_pred))
fff(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.05      0.00      0.00     24838
           1       0.46      0.17      0.24    122642
           2       0.36      0.89      0.51    109369
           3       0.28      0.00      0.01     59546

    accuracy                           0.37    316395
   macro avg       0.29      0.26      0.19    316395
weighted avg       0.36      0.37      0.27    316395

0.3719938684239637
0.2721399354454744
0.19031115600051157


In [71]:
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, Optimizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import numpy as np
import math
from keras.utils import to_categorical

# Кодируем целевую переменную в one-hot формат
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)
num_classes = y_train_cat.shape[1]

# Определяем модель с уменьшенным размером
inputs = Input(shape=(X_train.shape[1],))

x = Dense(64, activation='relu')(inputs)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)

x = Dense(128, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)

outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)

# Компилируем модель с более быстрой оптими��ацией
class CustomOptimizer(Adam):
    def __init__(self, learning_rate=0.001, **kwargs):
        super().__init__(learning_rate=learning_rate, **kwargs)

optimizer = CustomOptimizer()

model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Функция динамического изменения скорости обучения
def dynamic_lr_schedule(epoch):
    return 0.01 * math.pow(0.5, math.floor((1 + epoch) / 10))

# Колбэки
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, min_delta=0.001),
    tf.keras.callbacks.LearningRateScheduler(dynamic_lr_schedule)
]

# Создаем параллельные потоки данных
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train_cat))
train_dataset = train_dataset.batch(512).prefetch(tf.data.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test_cat))
test_dataset = test_dataset.batch(512).prefetch(tf.data.AUTOTUNE)

# Обучение модели
history = model.fit(train_dataset, epochs=50, validation_data=test_dataset, callbacks=callbacks)

# Оценка модели
y_pred = np.argmax(model.predict(X_test), axis=1)

# Метрики
f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

print(f'F1-мера: {f1:.4f}')
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
fff(y_test, y_pred)

Epoch 1/50
[1m2472/2472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.4564 - loss: 1.0929 - val_accuracy: 0.3709 - val_loss: 1.2629 - learning_rate: 0.0100
Epoch 2/50
[1m2472/2472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.4696 - loss: 1.0547 - val_accuracy: 0.3655 - val_loss: 1.2589 - learning_rate: 0.0100
Epoch 3/50
[1m2472/2472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.4706 - loss: 1.0531 - val_accuracy: 0.3600 - val_loss: 1.2756 - learning_rate: 0.0100
Epoch 4/50
[1m2472/2472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.4713 - loss: 1.0518 - val_accuracy: 0.3597 - val_loss: 1.2649 - learning_rate: 0.0100
Epoch 5/50
[1m2472/2472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.4723 - loss: 1.0509 - val_accuracy: 0.3498 - val_loss: 1.2855 - learning_rate: 0.0100
Epoch 6/50
[1m2472/2472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [53]:
print(type(X_test))
print(type(y_pred))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [66]:
import pandas as pd

new_df = pd.DataFrame({'viewer_uid': id, 'predicted_age': y_pred})
grouped_df = new_df.groupby('viewer_uid')['predicted_age'].median().astype(int).reset_index()
grouped_df.head(10)

Unnamed: 0,viewer_uid,predicted_age
0,10401033,2
1,10401074,2
2,10401082,2
3,10401088,2
4,10401100,2
5,10401108,2
6,10401113,1
7,10401153,2
8,10401155,2
9,10401157,2


In [67]:
targets.sort_values(by='viewer_uid')
targets.groupby('viewer_uid')['age_class'].median().astype(int).reset_index()

Unnamed: 0,viewer_uid,age_class
0,10000001,2
1,10000002,3
2,10000004,2
3,10000005,2
4,10000006,2
...,...,...
180007,11140828,2
180008,11140869,1
180009,11140872,3
180010,11140875,1


In [68]:
merged_df = pd.merge(grouped_df, targets[['age_class', 'viewer_uid']], on='viewer_uid', how='inner')

merged_df.head(5)

Unnamed: 0,viewer_uid,predicted_age,age_class
0,10401033,2,3
1,10401074,2,1
2,10401082,2,0
3,10401088,2,2
4,10401100,2,2


In [69]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52793 entries, 0 to 52792
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   viewer_uid     52793 non-null  int64
 1   predicted_age  52793 non-null  int32
 2   age_class      52793 non-null  int64
dtypes: int32(1), int64(2)
memory usage: 1.0 MB


In [70]:
matches = merged_df['predicted_age'].eq(merged_df['age_class'])
matches.mean()

0.35550167635860813