In [1]:
import torch
import os
import torch.nn.functional as F
from torch import Tensor
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm, trange

In [3]:
# Ask CUDA to enumerate GPUs in PCI bus order, so the numeric device ids used
# further down refer to physical bus positions.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"

def last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Select, for every sequence in the batch, the hidden state of its last real token.

    Args:
        last_hidden_states: Hidden states indexed as [batch, position, ...].
        attention_mask: Mask indexed as [batch, position]; its per-row sum is
            used as the number of real tokens.

    Returns:
        One hidden-state vector per batch row.
    """
    n_rows = attention_mask.shape[0]

    # If the final position is unmasked in every row, the batch is treated as
    # left-padded and the last position is the last real token for all rows.
    if attention_mask[:, -1].sum() == n_rows:
        return last_hidden_states[:, -1]

    # Otherwise the last real token of each row sits at (number of real tokens - 1).
    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_index, last_positions]

def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the two-line prompt: an 'Instruct:' line followed by a 'Query:' line."""
    prompt = f'Instruct: {task_description}\nQuery: {query}'
    return prompt

def sbs(strings, size=30, space=4):
    """Lay several strings out side by side as fixed-width text columns.

    Each input string becomes one column that is `size` characters wide.
    Text wraps after `size` characters or at a newline found within the
    current `size`-character window, whichever comes first. `None` entries
    are treated as empty strings.

    Args:
        strings: Iterable of strings (or None), one per column.
        size: Column width in characters.
        space: Number of blank characters between adjacent columns.

    Returns:
        The rendered rows joined with newlines.
    """
    pending = list(strings)
    gap = " " * space
    rows = []

    # Keep emitting rows until every column has been fully consumed.
    while any(pending):
        cells = []

        for idx in range(len(pending)):
            text = "" if pending[idx] is None else pending[idx]
            window = text[:size]
            cut = window.find("\n")

            if cut == -1:
                # No line break in the window: take the full window.
                cells.append(window.ljust(size))
                pending[idx] = text[size:]
            else:
                # Break at the newline and drop the newline itself.
                cells.append(window[:cut].ljust(size))
                pending[idx] = text[cut + 1:]

        rows.append(gap.join(cells))

    return "\n".join(rows)

# device = "cuda:4"

# Load the tokenizer and the encoder for the
# 'intfloat/multilingual-e5-large-instruct' checkpoint via from_pretrained.
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large-instruct')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-large-instruct')

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [4]:
# Wrap the encoder in DataParallel restricted to the listed GPU ids.
# NOTE(review): this rebinds `model`, so running the cell twice wraps the
# model in DataParallel a second time.
device_ids = [3]
model = nn.DataParallel(model, device_ids=device_ids)
n_gpu = len(device_ids)

In [5]:
# Move the wrapped model to the first configured GPU; the trailing expression
# displays the device ids the wrapper was built with.
model.to(f"cuda:{device_ids[0]}")
model.device_ids

[3]

In [9]:
max_length = 512
def tokenize(texts):
    """Tokenize `texts` with the module-level tokenizer.

    Inputs are padded to a common length, truncated to `max_length` tokens and
    returned as PyTorch tensors in a plain dict keyed by the tokenizer's
    output names.
    """
    encoded = tokenizer(
        texts,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return dict(encoded.items())

def embed(texts):
    """Embed one batch of texts with the module-level `model`.

    The texts are tokenized with `tokenize`, run through the model without
    gradient tracking, and reduced to one vector per text by averaging the
    final hidden states over the unmasked (real) tokens.

    Review note: the previous version took the hidden state of the last
    non-padding token. The model card for
    'intfloat/multilingual-e5-large-instruct' pools with an attention-masked
    average instead, so that is what is used here. Embeddings cached on disk
    by the earlier code were produced with the old pooling and need to be
    recomputed to match.

    Args:
        texts: List of strings forming a single batch.

    Returns:
        Tensor with one pooled embedding row per input text.
    """
    with torch.no_grad():
        batch = tokenize(texts)
        last_state = model(**batch).last_hidden_state

        # `tokenize` does not move its tensors to any device, while the model
        # output may be on a GPU; align the mask before combining the two.
        mask = batch["attention_mask"].to(last_state.device)

        # Zero out padded positions, then divide the sum by the number of
        # real tokens in each row.
        summed = last_state.masked_fill(~mask[..., None].bool(), 0.0).sum(dim=1)
        return summed / mask.sum(dim=1)[..., None]

def embed_batched(texts, bs=128):
    """Embed `texts` in consecutive batches of at most `bs` items.

    Args:
        texts: Sequence of strings to embed.
        bs: Batch size passed to `embed` on each step.

    Returns:
        A list with one NumPy vector per input text, in input order. An empty
        input yields an empty list.
    """
    chunks = []
    for start in trange(0, len(texts), bs):
        # Copy each batch to host memory straight away so that only one
        # batch of embeddings has to live on the accelerator at a time.
        chunks.append(embed(texts[start:start + bs]).cpu())

    if not chunks:
        # torch.cat rejects an empty list.
        return []

    # No squeeze here: squeezing collapsed the batch axis when a single text
    # was passed, turning the result into a list of scalars.
    return list(torch.cat(chunks).numpy())

In [2]:
# Read the three input tables from the shared data directory: viewing events,
# video metadata and the per-viewer targets.
# NOTE(review): later cells read/write under 'data/' rather than this
# '../../data/' prefix - confirm both locations are intended.
path = "../../data/"
data = pd.read_csv(path + 'train_events.csv')
video = pd.read_csv(path + 'video_info_v2.csv')
targets = pd.read_csv(path + 'train_targets.csv')

In [10]:
# Embed every video title (long-running; see the progress bar output below).
embeds = embed_batched(list(video['title']))

100%|██████████| 3762/3762 [17:39<00:00,  3.55it/s]


In [34]:
# Title-embedding cache: the save line is kept commented out and the array is
# reloaded from disk instead of being recomputed.
#np.save('data/embeds.npy', embeds)
embeds = np.load('data/embeds.npy')

In [35]:
from sklearn.decomposition import PCA

def reduce_dimensionality(data, target_dimension=32, random_state=None):
    """
    Reduce the dimensionality of `data` from d to `target_dimension` with PCA.

    Args:
        data (numpy.ndarray): Input data of shape (n_samples, d).
        target_dimension (int): Number of output dimensions. Defaults to 32.
        random_state (int | None): Forwarded to PCA. Pass an integer to make
            the projection reproducible when PCA selects a randomized solver.
            Defaults to None, which keeps the previous behaviour.

    Returns:
        numpy.ndarray: Data of shape (n_samples, target_dimension).
    """
    # Fit PCA on the data and project it in a single step.
    pca = PCA(n_components=target_dimension, random_state=random_state)
    return pca.fit_transform(data)

In [36]:
# Keep 32 components: the variable name and the downstream
# embeds_0 .. embeds_31 columns both assume a 32-dimensional projection.
embeds_32 = reduce_dimensionality(embeds, target_dimension=32)

In [3]:
# Reduced-embedding cache: the save line is kept commented out and the array
# is reloaded from disk.
#np.save('data/embeds_32.npy', embeds_32)
embeds_32 = np.load('data/embeds_32.npy')

In [4]:
# Preview the first rows of the targets table.
targets.head()

Unnamed: 0,viewer_uid,age,sex,age_class
0,10087154,30,male,1
1,10908708,25,female,1
2,10190464,34,male,2
3,10939673,25,male,1
4,10288257,48,male,3


In [5]:
# Preview the first rows of the events table.
data.head()

Unnamed: 0,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,total_watchtime,rutube_video_id,viewer_uid
0,2024-06-01 06:40:58+03:00,Chelyabinsk,desktop,browser,Windows,Yandex Browser,1883,video_133074,10067243
1,2024-06-01 19:33:24+03:00,Bashkortostan Republic,smartphone,mobile app,Android,Rutube,512,video_362960,10245341
2,2024-06-01 21:30:43+03:00,St.-Petersburg,desktop,browser,Windows,Chrome,5647,video_96775,10894333
3,2024-06-01 23:03:42+03:00,Moscow,smartphone,mobile app,Android,Rutube,1521,video_161610,10029092
4,2024-06-01 22:48:09+03:00,Moscow,smartphone,mobile app,Android,Rutube,71,video_116245,10452976


In [6]:
# Order events by their timestamp column.
# NOTE(review): the column is not parsed to datetime in any visible cell, so
# this is presumably a string sort - confirm all rows share the same UTC offset.
data = data.sort_values(by='event_timestamp')

In [7]:
# Preview the first rows of the video metadata table.
video.head()

Unnamed: 0,rutube_video_id,title,category,duration,author_id
0,video_185549,Как собрать букет из мыльных тюльпанов - Силик...,Хобби,1559160,1015054
1,video_111035,"Осторожно, Киберземляне!, 1 сезон, 12 серия",Сериалы,1320007,1002180
2,video_476517,ПОПУЛЯРНЫЕ ВИДЕОИГРЫ в LEGO... перевод - TD BR...,Хобби,606145,1095337
3,video_157198,"Хороший лжец (фильм, 2019)",Фильмы,6577440,1043618
4,video_289824,Нашего старого гнобят по-всякому,Развлечения,859493,1009535


In [64]:
# NOTE(review): this cell relies on video['embeds'] and on train_test_split,
# CatBoostClassifier and classification_report, all of which are created or
# imported by cells that appear further down in the notebook.
X = np.stack(video['embeds'])
y = video['category']

# 60 / 20 / 20 split into train, validation and test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# CatBoost predicting the video category from the title embedding
# (the original comment, variable name and header said "gender").
catboost_cat = CatBoostClassifier(iterations=1000, verbose=150, task_type="GPU", devices='3')
catboost_cat.fit(X_train, y_train, eval_set=(X_val, y_val))
y_pred_cat = catboost_cat.predict(X_test)

print("Classification report for category prediction:")
print(classification_report(y_test, y_pred_cat))

Learning rate set to 0.186365
0:	learn: 2.5413746	test: 2.5440332	best: 2.5440332 (0)	total: 45.5ms	remaining: 45.5s
150:	learn: 1.2579594	test: 1.3729223	best: 1.3729223 (150)	total: 6.78s	remaining: 38.1s
300:	learn: 1.0873548	test: 1.3018527	best: 1.3018527 (300)	total: 13.2s	remaining: 30.8s
450:	learn: 0.9761509	test: 1.2705220	best: 1.2705220 (450)	total: 19.4s	remaining: 23.7s
600:	learn: 0.8851510	test: 1.2521097	best: 1.2521097 (600)	total: 25.5s	remaining: 16.9s
750:	learn: 0.8108169	test: 1.2402380	best: 1.2402380 (750)	total: 31.6s	remaining: 10.5s
900:	learn: 0.7452689	test: 1.2328872	best: 1.2328872 (900)	total: 37.8s	remaining: 4.15s
999:	learn: 0.7057376	test: 1.2285114	best: 1.2285114 (999)	total: 41.9s	remaining: 0us
bestTest = 1.228511406
bestIteration = 999
Classification report for gender prediction:
                              precision    recall  f1-score   support

                   Авто-мото       0.58      0.42      0.49      1606
                       Ани

In [8]:
# Scale the duration by 1/1000 - assumes `duration` is stored in milliseconds; TODO confirm.
video['duration_sec'] = video['duration'] / 1000

In [9]:
# Spread the reduced embedding matrix over 32 scalar columns embeds_0 .. embeds_31.
video[[f'embeds_{i}' for i in range(32)]] = embeds_32

In [10]:
# Also keep each video's reduced embedding as one array-valued cell.
video['embeds'] = list(embeds_32)

In [11]:
# Attach each event's video embedding and duration. The inner join drops
# events whose video id is absent from `video`.
# NOTE(review): rebinding `data` makes this cell order-dependent on re-run.
data = pd.merge(data, video[['rutube_video_id', 'embeds', 'duration_sec']], on='rutube_video_id', how='inner')

In [12]:
# Ratio of watch time to video duration for each event.
# NOTE(review): assumes total_watchtime is in seconds like duration_sec, and
# that duration_sec is never zero (a zero would yield inf/NaN) - confirm.
data['part_wached'] = data['total_watchtime'] / data['duration_sec']

In [13]:
def custom_aggregation(df):
    """Combine a group's embeddings into one vector, weighted by watched fraction.

    The weights are the group's 'part_wached' values scaled to unit L2 norm.
    Because they are not scaled to sum to one, the result is a weighted sum
    rather than a true weighted average.

    If the weight norm is zero or not finite (for example all watch fractions
    are zero, or one of them is NaN/inf), equal weights with unit L2 norm are
    used instead, so the result never contains NaN produced by the
    normalisation itself.

    Args:
        df: DataFrame with a numeric 'part_wached' column and an 'embeds'
            column holding one vector per row.

    Returns:
        The weighted sum of the rows' embedding vectors.
    """
    weights = np.array(df['part_wached'], dtype=float)
    norm = np.linalg.norm(weights)

    if np.isfinite(norm) and norm > 0:
        weights = weights / norm
    else:
        # Degenerate weights: fall back to treating every row equally.
        weights = np.full(len(weights), 1.0 / np.sqrt(max(len(weights), 1)))

    return np.sum([w * vec for vec, w in zip(df['embeds'], weights)], axis=0)


def no_aggregation(df):
    """Return the group's 'embeds' column as a NumPy array, without combining rows."""
    embeds_column = df['embeds']
    return np.array(embeds_column)

# Build one aggregated embedding per viewer. Only the two columns the
# aggregator reads are passed to apply, so it no longer operates on the
# grouping column (which pandas reports as deprecated behaviour).
users_embeds = data.groupby('viewer_uid')[['part_wached', 'embeds']].apply(custom_aggregation)

  users_embeds = data.groupby('viewer_uid').apply(custom_aggregation)


In [14]:
# Name the series so it appears as the 'embed' column after merging.
users_embeds.name = 'embed'

In [15]:
# Join the per-viewer embeddings with the targets on viewer_uid; the inner
# join keeps only viewers present on both sides.
# NOTE(review): rebinding `targets` makes this cell order-dependent on re-run.
targets = pd.merge(users_embeds, targets, on='viewer_uid', how='inner')

In [16]:
# Preview the merged targets, now including the 'embed' column.
targets.head()

Unnamed: 0,viewer_uid,embed,age,sex,age_class
0,10000001,"[-12.513889382009461, 7.1613850705614315, -1.9...",40,female,2
1,10000002,"[-8.857678614174775, 7.798440248333709, -0.388...",44,male,3
2,10000004,"[0.6842827782612931, -0.6595857233902118, -3.8...",36,male,2
3,10000005,"[2.962346634406903, -1.1010594439649803, -1.52...",38,male,2
4,10000006,"[-3.6732354635099047, 0.7471836554557129, -0.0...",38,male,2


In [17]:
# Encode sex as 0 for 'male' and 1 for anything else. The dtype guard makes
# the cell safe to re-run: without it, a second run on the already encoded
# column would map every row to 1.
if not pd.api.types.is_numeric_dtype(targets['sex']):
    targets['sex'] = (targets['sex'] != 'male').astype(int)

In [18]:
# Expand each viewer's embedding into 32 scalar columns embeds_0 .. embeds_31.
targets[[f'embeds_{i}' for i in range(32)]] = np.stack(targets['embed'])

In [2]:
# Reload the per-viewer embedding table from a CSV export.
# NOTE(review): no visible cell writes this CSV - confirm how it was produced.
targets = pd.read_csv('data/target_embeds-custom_aggregation.csv')

In [3]:
# Preview the reloaded per-viewer table.
targets.head()

Unnamed: 0,viewer_uid,age,embeds_0,embeds_1,embeds_2,embeds_3,embeds_4,embeds_5,embeds_6,embeds_7,...,embeds_22,embeds_23,embeds_24,embeds_25,embeds_26,embeds_27,embeds_28,embeds_29,embeds_30,embeds_31
0,10000001,40,-12.513889,7.161385,-1.921415,-2.510002,-2.536459,3.87285,1.688143,-2.812249,...,-2.259542,-5.150448,1.861434,0.296535,2.228844,-0.781238,0.979583,0.499716,-0.831642,2.697985
1,10000002,44,-8.857679,7.79844,-0.388398,-2.04465,1.806627,-0.58076,-2.159648,0.328372,...,-0.75543,-2.0911,2.445388,-0.896826,-2.626723,1.500267,-1.236043,-1.899359,1.300126,0.106583
2,10000004,36,0.684283,-0.659586,-3.842297,2.629033,0.829981,-2.002862,0.163314,3.279346,...,0.424814,-2.111815,1.891209,-0.343905,-0.277934,-0.358481,-0.651661,-1.051867,-0.105431,-1.498083
3,10000005,38,2.962347,-1.101059,-1.526201,-1.449923,-0.182694,-1.36175,-0.151736,-0.204418,...,0.395631,-0.477124,-0.675521,-0.393002,0.410601,-0.89556,-1.239015,-0.741628,-0.058947,-1.069365
4,10000006,38,-3.673235,0.747184,-0.086243,-0.816762,-0.109268,0.662777,-0.299665,-0.554059,...,-0.241953,1.485296,-0.144536,-0.00174,0.563425,-1.091279,0.212252,2.557632,0.318442,-0.514779


In [5]:
# Export the table to parquet without the 'age' column and without the index.
targets.drop(columns=['age']).to_parquet('data/target_embeds-custom_aggregation.parquet', index=False)

In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report

# Build the feature matrix and the target columns.
# NOTE(review): needs the in-memory `targets` that still has the 'embed',
# 'sex' and 'age_class' columns, i.e. the frame from before the CSV reload.
X = np.stack(targets['embed'])
y = targets[['sex', 'age_class', 'age']]

# Split into train and test parts (60 / 40).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Logistic regression predicting sex.
log_reg_gender = LogisticRegression(max_iter=1000)
log_reg_gender.fit(X_train, y_train['sex'])
y_pred_gender = log_reg_gender.predict(X_test)

print("Classification report for gender prediction:")
print(classification_report(y_test['sex'], y_pred_gender))

# Logistic regression predicting the age class directly.
log_reg_age = LogisticRegression(max_iter=1000)
log_reg_age.fit(X_train, y_train['age_class'])
y_pred_age = log_reg_age.predict(X_test)

print("Classification report for age prediction:")
print(classification_report(y_test['age_class'], y_pred_age))

# Linear regression on age divided by the training maximum; predictions are
# scaled back to years and then bucketed into age classes.
age_scale = y_train['age'].max()
lin_reg_age = LinearRegression()
lin_reg_age.fit(X_train, y_train['age'] / age_scale)
# right=True keeps the original boundaries:
# <= 20 -> 0, (20, 30] -> 1, (30, 40] -> 2, > 40 -> 3.
# A single digitize call replaces four in-place masked writes that mixed raw
# ages and class codes in the same float array.
y_pred_age = np.digitize(lin_reg_age.predict(X_test) * age_scale, [20, 30, 40], right=True)

print("Classification report for age prediction:")
print(classification_report(y_test['age_class'], y_pred_age))

Classification report for gender prediction:
              precision    recall  f1-score   support

           0       0.71      0.78      0.74     35900
           1       0.76      0.69      0.72     36105

    accuracy                           0.73     72005
   macro avg       0.74      0.73      0.73     72005
weighted avg       0.74      0.73      0.73     72005

Classification report for age prediction:
              precision    recall  f1-score   support

           0       0.21      0.01      0.01      2992
           1       0.46      0.59      0.51     25032
           2       0.42      0.41      0.42     26458
           3       0.47      0.36      0.41     17523

    accuracy                           0.44     72005
   macro avg       0.39      0.34      0.34     72005
weighted avg       0.44      0.44      0.43     72005

Classification report for age prediction:
              precision    recall  f1-score   support

           0       0.21      0.00      0.01      2992


In [50]:
# Split the held-out part in half into validation and test sets.
# NOTE(review): this overwrites X_test / y_test with their own halves, so
# re-running the cell shrinks the test set again.
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [51]:
from catboost import CatBoostClassifier


# CatBoost classifier predicting sex, with the validation split as eval set.
catboost_gender = CatBoostClassifier(iterations=1000, verbose=150)
catboost_gender.fit(X_train, y_train['sex'], eval_set=(X_val, y_val['sex']))
y_pred_gender = catboost_gender.predict(X_test)

print("Classification report for gender prediction:")
print(classification_report(y_test['sex'], y_pred_gender))

# CatBoost classifier predicting the age class, with the validation split as eval set.
catboost_age = CatBoostClassifier(iterations=1000, verbose=150)
catboost_age.fit(X_train, y_train['age_class'], eval_set=(X_val, y_val['age_class']))
y_pred_age = catboost_age.predict(X_test)

print("Classification report for age prediction:")
print(classification_report(y_test['age_class'], y_pred_age))

Learning rate set to 0.100733
0:	learn: 0.6680467	test: 0.6683222	best: 0.6683222 (0)	total: 36.9ms	remaining: 36.9s
150:	learn: 0.5033954	test: 0.5193485	best: 0.5193485 (150)	total: 3.86s	remaining: 21.7s
300:	learn: 0.4783710	test: 0.5123885	best: 0.5123885 (300)	total: 7.8s	remaining: 18.1s
450:	learn: 0.4599059	test: 0.5104033	best: 0.5103961 (447)	total: 11.7s	remaining: 14.2s
600:	learn: 0.4441091	test: 0.5097757	best: 0.5097469 (597)	total: 15.4s	remaining: 10.2s
750:	learn: 0.4299061	test: 0.5094574	best: 0.5094227 (749)	total: 19.2s	remaining: 6.35s
900:	learn: 0.4170739	test: 0.5096901	best: 0.5094227 (749)	total: 22.9s	remaining: 2.51s
999:	learn: 0.4089430	test: 0.5099383	best: 0.5094227 (749)	total: 25.5s	remaining: 0us

bestTest = 0.5094226816
bestIteration = 749

Shrink model to first 750 iterations.
Classification report for gender prediction:
              precision    recall  f1-score   support

           0       0.73      0.76      0.74     17850
           1      

In [53]:
from catboost import CatBoostRegressor


# CatBoost regression on the raw age; the predicted age is then bucketed
# into age classes for the report.
catboost_age = CatBoostRegressor(iterations=1500, verbose=150, depth=3)
catboost_age.fit(X_train, y_train['age'], eval_set=(X_val, y_val['age']))
# right=True keeps the original boundaries:
# <= 20 -> 0, (20, 30] -> 1, (30, 40] -> 2, > 40 -> 3.
# A single digitize call replaces four in-place masked writes that mixed raw
# ages and class codes in the same float array.
y_pred_age = np.digitize(catboost_age.predict(X_test), [20, 30, 40], right=True)

print("Classification report for age prediction:")
print(classification_report(y_test['age_class'], y_pred_age))

Learning rate set to 0.082934
0:	learn: 9.0067197	test: 8.9377359	best: 8.9377359 (0)	total: 10.6ms	remaining: 15.9s
150:	learn: 8.3829353	test: 8.3782496	best: 8.3782496 (150)	total: 1.5s	remaining: 13.4s
300:	learn: 8.2593318	test: 8.2927831	best: 8.2927831 (300)	total: 2.75s	remaining: 11s
450:	learn: 8.1840396	test: 8.2487730	best: 8.2487730 (450)	total: 4.19s	remaining: 9.74s
600:	learn: 8.1275031	test: 8.2229122	best: 8.2229122 (600)	total: 5.6s	remaining: 8.38s
750:	learn: 8.0806284	test: 8.2033142	best: 8.2033142 (750)	total: 6.78s	remaining: 6.76s
900:	learn: 8.0403966	test: 8.1913850	best: 8.1913850 (900)	total: 8.23s	remaining: 5.47s
1050:	learn: 8.0031721	test: 8.1795806	best: 8.1795806 (1050)	total: 9.66s	remaining: 4.13s
1200:	learn: 7.9685141	test: 8.1697647	best: 8.1697647 (1200)	total: 10.9s	remaining: 2.71s
1350:	learn: 7.9359805	test: 8.1609774	best: 8.1608994 (1349)	total: 12.2s	remaining: 1.35s
1499:	learn: 7.9068165	test: 8.1540261	best: 8.1540261 (1499)	total: 13

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
