In [10]:
import polars as pl
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import json
import string
import re
from tqdm import tqdm
import gc
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
# from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
import logging
from pprint import pprint
# logging.getLogger('transformers').setLevel(logging.ERROR)
# logging.getLogger('sentence_transformers').setLevel(logging.ERROR)
# logging.getLogger('transformers').setLevel(logging.INFO)
# logging.getLogger('sentence_transformers').setLevel(logging.INFO)

In [5]:
def load_data():
    attributes_path = "/kaggle/input/ozon-hackaton/train/attributes.parquet"
    resnet_path = "/kaggle/input/ozon-hackaton/train/resnet.parquet"
    text_and_bert_path = "/kaggle/input/ozon-hackaton/train/text_and_bert.parquet"
    train_path = "/kaggle/input/ozon-hackaton/train/train.parquet"

    # Используем Polars для чтения файлов parquet
    attributes = pl.read_parquet(attributes_path)
    resnet = pl.read_parquet(resnet_path)
    text_and_bert = pl.read_parquet(text_and_bert_path)
    train = pl.read_parquet(train_path)

    # Возвращаем только половину данных
    print(attributes.shape)
    print(attributes[1799186009])

    print(resnet.shape)
    print(text_and_bert.shape)
    print(train.shape)
    return attributes, resnet, text_and_bert, train[:250000]

In [None]:
def process_text_and_bert(df):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           "]+", flags=re.UNICODE)
    df = df.with_columns(
        (pl.col("variantid")).alias("variantid"),
        (pl.col("description")
        .fill_null("")
        .str.replace_all(r'<[^<]+?>', ' ', literal=False)
        .str.replace_all(r'http\S+', ' ', literal=False)
        .str.replace_all(r'[^\w\s]', ' ', literal=False)
        .str.replace_all(r'[^a-zA-Zа-яА-Я0-9]', ' ', literal=False)
        .str.replace_all(r'<[^>]+>|&[a-zA-Z0-9#]+;', ' ', literal=False)
        .str.to_lowercase()
        .str.replace_all(r"\s+", " ", literal=False)
        .alias("combined_text")),
        (pl.col("name_bert_64")).alias("name_bert_64")
    )
    df=df.with_columns(pl.col("combined_text").map_elements(lambda x: emoji_pattern.sub(r[''], x),return_dtype=str).alias("combined_text"))
    df = df.drop(["name", "description"])
    print(df.head(5))
    return df

In [None]:
def merge_data(train,attributes, resnet, text_and_bert):
    # Оптимизация объединения данных с использованием join
    train_data = train.join(
        resnet.select(["variantid", "main_pic_embeddings_resnet_v1"]),
        left_on="variantid1",
        right_on="variantid",
        how="left",
    ).rename({"main_pic_embeddings_resnet_v1": "pic_embeddings_1"[0]})

    train_data = train_data.join(
        resnet.select(["variantid", "main_pic_embeddings_resnet_v1"]),
        left_on="variantid2",
        right_on="variantid",
        how="left",
    ).rename({"main_pic_embeddings_resnet_v1": "pic_embeddings_2"[0]})

    train_data = train_data.join(
        text_and_bert.select(["variantid", "combined_text","name_bert_64"]),
        left_on="variantid1",
        right_on="variantid",
        how="left",
    ).rename({"combined_text": "text_1","name_bert_64":"name_bert_1"[0]})

    train_data = train_data.join(
        text_and_bert.select(["variantid", "combined_text","name_bert_64"]),
        left_on="variantid2",
        right_on="variantid",
        how="left",
    ).rename({"combined_text": "text_2","name_bert_64":"name_bert_2"[0]})
        
    train_data = train_data.join(
        attributes.select(["variantid", "categories"]),
        left_on="variantid1",
        right_on="variantid",
        how="left",
    ).rename({"categories": "categories_1"[0]})

    train_data = train_data.join(
        attributes.select(["variantid", "categories"]),
        left_on="variantid2",
        right_on="variantid",
        how="left",
    ).rename({"categories": "categories_2"[0]})
    return train_data.drop_nulls()

In [11]:
def prepare_data(train_data, tfidf_vectorizer):
    num_samples = 200000
    text_data = (train_data['text_1'][num_samples:num_samples*2] + ' ' + train_data['text_2'][num_samples:num_samples*2]).to_numpy()
    print(train_data['text_1'].shape)
    print(train_data['text_2'].shape)
    print(text_data.shape)
 
    text_embeddings = tfidf_vectorizer.fit_transform(text_data).toarray()
    del text_data
    print("Созданы text_embeddings")
    train_data = train_data.drop(['text_1', 'text_2',])
    gc.collect()
    name_bert_1 = train_data['name_bert_1'][num_samples:num_samples*2]
    name_bert_2 = train_data['name_bert_2'][num_samples:num_samples*2]
    print(name_bert_1.shape)
    print(name_bert_2.shape)
    name_bert= []
    cosine_sim_name_bert = []
    for i in tqdm(range(0, num_samples), desc="Обработка образцов"):
        name_bert.append(np.concatenate([name_bert_1[i].to_numpy().astype(np.float32), name_bert_2[i].to_numpy().astype(np.float32)]))
        cosine_sim_name_bert.append(cosine_similarity(name_bert_1[i].to_numpy().astype(np.float32).reshape(1, -1), name_bert_2[i].to_numpy().astype(np.float32).reshape(1, -1)).flatten())
    print(cosine_sim_name_bert[0].shape)
    name_bert = np.asarray(name_bert, dtype=np.float32)
    cosine_sim_name_bert = np.asarray(cosine_sim_name_bert, dtype=np.float32)
    del name_bert_1
    del name_bert_2
    train_data = train_data.drop(['name_bert_1', 'name_bert_1'])
    gc.collect()
    print("Созданы name_bert embeddings")
 
    combined_embeddings = []
    # Обработка данных пакетами
    for i in tqdm(range(0, num_samples), desc="Обработка образцов"):
        pic_emb_1 = np.concatenate([np.array(x) for x in train_data[i,'pic_embeddings_1'].to_list()]).astype(np.float32)
        pic_emb_2 = np.concatenate([np.array(x) for x in train_data[i,'pic_embeddings_2'].to_list()]).astype(np.float32)
        text_emb = text_embeddings[i].astype(np.float32)
 
        cat_1 = train_data['categorie_1_1', 'categorie_1_2', 'categorie_1_3', 'categorie_1_4'][i].to_numpy()
        cat_2 = train_data['categorie_2_1', 'categorie_2_2', 'categorie_2_3', 'categorie_2_4'][i].to_numpy()
        cosine_sim_attr = cosine_similarity(cat_1, cat_2)[0][0]
        if pic_emb_1.size > 0 and pic_emb_2.size > 0:  # Убедимся, что эмбеддинги не пусты
            cosine_sim = cosine_similarity([pic_emb_1], [pic_emb_2])[0][0]
        else:
            cosine_sim = 0.0
        combined = np.concatenate([text_emb, name_bert[i], [cosine_sim], cosine_sim_name_bert[i], cat_1.flatten(), cat_2.flatten(), [cosine_sim_attr]])
        combined_embeddings.append(combined)
    train_data = train_data.drop(['categorie_1_1', 'categorie_1_2', 'categorie_1_3', 'categorie_1_4', 'categorie_2_1', 'categorie_2_2', 'categorie_2_3', 'categorie_2_4'])
    del text_embeddings
    del name_bert
    del cosine_sim_name_bert
    y = train_data[num_samples:num_samples*2]['target'].to_numpy()
    del train_data
    gc.collect()
    print("Не нужные колонки удалены")
    X = np.vstack(combined_embeddings)
    del combined_embeddings
 
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=23)
    gc.collect()
    return X_train, X_val, y_train, y_val

In [None]:
def train_model(X_train, y_train):
    model = LogisticRegression(max_iter=5000)
    model.fit(X_train, y_train)
    joblib.dump(model, "baseline.pkl")
    return model


def evaluate_model(model, X_val, y_val):
    y_pred_prob = model.predict_proba(X_val)[:, 1]
    y_pred = (y_pred_prob >= 0.5).astype(int)

    precision, recall, _ = precision_recall_curve(y_val, y_pred_prob)
    prauc = auc(recall, precision)
    print(f"PRAUC: {prauc[0]}")


In [12]:
def load_train_data():
    data_train_path = "/kaggle/input/data-trian-ozon-hack/data_train.parquet"
    data_train = pl.read_parquet(data_train_path)
    return data_train

In [None]:
attributes, resnet, text_and_bert, train = load_data()
print("данные загружены")

In [None]:
attributes["characteristic_attributes_mapping"][4516]

In [None]:
t1 = 0  
t2 = 0  
t3 = 0  
b=0
for i in range(10):
    characteristic_attributes = json.loads(attributes["characteristic_attributes_mapping"][i])
    if "Тип" in characteristic_attributes:
        characteristic_attributes["Тип"]=f"{characteristic_attributes.get('Тип',[['']])[0]} {characteristic_attributes.get('Бренд',[''])[0]} {characteristic_attributes.get('Страна-изготовитель',[''])[0]} {characteristic_attributes.get('Материал',[''])[0]}"
        print(characteristic_attributes)
        
    elif "Материал подошвы обуви" in characteristic_attributes or "Российский размер (обуви)" in characteristic_attributes:
        characteristic_attributes["Тип"]=f"{characteristic_attributes.get('Бренд в одежде и обуви',[''])[0]} {characteristic_attributes.get('Вид застёжки',[''])[0]} {characteristic_attributes.get('Вид каблука',[''])[0]} {characteristic_attributes.get('Пол',[''])[0]} {characteristic_attributes.get('Российский размер (обуви)',[''])[0]} {characteristic_attributes.get('Страна-изготовитель',[''])[0]}"
    elif "Тип книги" in characteristic_attributes:
        characteristic_attributes["Тип"]=f"{characteristic_attributes.get('Автор',[''])[0]} {characteristic_attributes.get('Год выпуска',[''])[0]} {characteristic_attributes.get('Издательство',[''])[0]} {characteristic_attributes.get('Тип обложки',[''])[0]} {characteristic_attributes.get('Количество страниц',[''])[0]}"
    elif "Модель процессора" in characteristic_attributes:
        characteristic_attributes["Тип"]=f"{characteristic_attributes.get('Бренд',[''])[0]} {characteristic_attributes.get('Конфигурация',[''])[0]}"
    elif "Комплектация постельного белья" in characteristic_attributes:
        characteristic_attributes["Тип"]="постельное белье"
    elif "Педали велосипеда" in characteristic_attributes or "Диаметр колес, дюймы" in characteristic_attributes or "Количество колес" in characteristic_attributes:
        characteristic_attributes["Тип"]="велосипед"
    elif "Пульт ДУ" in characteristic_attributes or "Тип экрана" in characteristic_attributes:
         characteristic_attributes["Тип"]="камера для машины"
    elif "Предназначено для" in characteristic_attributes:
         characteristic_attributes["Тип"]=characteristic_attributes["Предназначено для"]
    elif "Материал тента" in characteristic_attributes:
         characteristic_attributes["Тип"]="тент"
    elif "Ширина ремешка, мм" in characteristic_attributes:
         characteristic_attributes["Тип"]="аксесуар на руку"
    else:
        t3 += 1

print(t1, t2, t3)


In [None]:
text_and_bert = process_text_and_bert(text_and_bert)
print(text_and_bert.shape)
print("обработка text")

In [None]:
#Объединение данных
train_data = merge_data(train,attributes, resnet, text_and_bert)
print(train_data.shape)
print(train_data.head(5))
print("данные объединены")

In [None]:
train_data

In [None]:
del attributes
del resnet 
del text_and_bert
del train
print("attributes, resnet, text_and_bert, train - удалены")

In [None]:
def parse_data(row):
    parts=json.loads(row)
    return parts["1"], parts["2"], parts["3"], parts["4"]

train_data = train_data.with_columns([
    pl.col("categories_1").map_elements(lambda x: parse_data(x)[0],return_dtype=str).alias("categorie_1_1"),
    pl.col("categories_1").map_elements(lambda x: parse_data(x)[1],return_dtype=str).alias("categorie_1_2"),
    pl.col("categories_1").map_elements(lambda x: parse_data(x)[2],return_dtype=str).alias("categorie_1_3"),
    pl.col("categories_1").map_elements(lambda x: parse_data(x)[3],return_dtype=str).alias("categorie_1_4"),
])
print(train_data.head(3))

train_data = train_data.with_columns([
    pl.col("categories_2").map_elements(lambda x: parse_data(x)[0],return_dtype=str).alias("categorie_2_1"),
    pl.col("categories_2").map_elements(lambda x: parse_data(x)[1],return_dtype=str).alias("categorie_2_2"),
    pl.col("categories_2").map_elements(lambda x: parse_data(x)[2],return_dtype=str).alias("categorie_2_3"),
    pl.col("categories_2").map_elements(lambda x: parse_data(x)[3],return_dtype=str).alias("categorie_2_4"),
])



In [None]:
import polars as pl
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

train_data = train_data.with_columns([
    pl.col("categorie_1_1").str.to_lowercase().cast(pl.Utf8),
    pl.col("categorie_1_2").str.to_lowercase().cast(pl.Utf8),
    pl.col("categorie_1_3").str.to_lowercase().cast(pl.Utf8),
    pl.col("categorie_1_4").str.to_lowercase().cast(pl.Utf8),
    pl.col("categorie_2_1").str.to_lowercase().cast(pl.Utf8),
    pl.col("categorie_2_2").str.to_lowercase().cast(pl.Utf8),
    pl.col("categorie_2_3").str.to_lowercase().cast(pl.Utf8),
    pl.col("categorie_2_4").str.to_lowercase().cast(pl.Utf8),
])


for i in ['categorie_1_1', 'categorie_1_2', 'categorie_1_3', 'categorie_1_4', 'categorie_2_1', 'categorie_2_2', 'categorie_2_3', 'categorie_2_4']:
    combined_categories = train_data[f'{i[0]}'].to_list()
    encoded_categories = label_encoder.fit_transform(combined_categories).astype('int16')
    train_data = train_data.with_columns(
        pl.Series(f'{i[0]}', encoded_categories),
    )

In [None]:
train_data

In [13]:
train_data = load_train_data()
print("data_train загружен")

data_train загружен


In [14]:
tfidf_vectorizer = TfidfVectorizer(max_features=3200)
X_train, X_val, y_train, y_val = prepare_data(train_data, tfidf_vectorizer)
print("данные разбиты на train и val")

(1168516,)
(1168516,)
(200000,)
Созданы text_embeddings
(200000,)
(200000,)


Обработка образцов: 100%|██████████| 200000/200000 [01:02<00:00, 3208.14it/s]


(1,)
Созданы name_bert embeddings


Обработка образцов: 100%|██████████| 200000/200000 [03:51<00:00, 864.97it/s]


Не нужные колонки удалены
данные разбиты на train и val


In [15]:
del train_data
print("train_data удален")


train_data удален


In [16]:
joblib.dump(tfidf_vectorizer, "vectorizer.pkl")

['vectorizer.pkl']

In [None]:
class IDECModel(nn.Module):
    def __init__(self, input_dim, embedding_dim):
        super(IDECModel, self).__init__()
        
        # Кодер
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 500),
            nn.ReLU(),
            nn.Linear(500, embedding_dim)
        )
        
        # Декодер
        self.decoder = nn.Sequential(
            nn.Linear(embedding_dim, 500),
            nn.ReLU(),
            nn.Linear(500, input_dim)  # выходной размер должен совпадать с input_dim
        )

    def forward(self, x):
        # Кодирование
        embedding = self.encoder(x)
        
        # Декодирование
        reconstruction = self.decoder(embedding)
        
        return reconstruction, embedding


def train_IDEC(model, data, num_clusters, num_epochs=10):
    model.to(device)  # Переносим модель на устройство
    data = data.to(device)  # Переносим данные на устройство
    
    # Инициализация KMeans
    kmeans = KMeans(n_clusters=num_clusters, n_init='auto')
    
    # Начальная инициализация кластерных центров
    embeddings = model.encoder(data).detach().cpu().numpy()
    kmeans.fit(embeddings)
    cluster_centers = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32).to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = torch.nn.MSELoss()
    
    for epoch in range(num_epochs):
        model.train()  # Устанавливаем модель в режим обучения
        
        # Прямой проход
        reconstructions, embeddings = model(data)
        
        # Вычисление потерь
        loss_reconstruction = criterion(reconstructions, data)
        loss_cluster = torch.mean(torch.norm(embeddings.unsqueeze(1) - cluster_centers, dim=2))
        
        # Общая потеря
        loss = loss_reconstruction + loss_cluster
        
        # Обратное распространение и обновление параметров
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    return model

In [17]:
from catboost import CatBoostClassifier

In [18]:
def train_model2(X_train, y_train, cat_features=None):
    model = CatBoostClassifier(task_type='GPU', iterations=9000, learning_rate=0.005, depth=9, verbose=200, bagging_temperature = 0.5)
    
    model.fit(X_train, y_train, cat_features=cat_features)
    return model

def evaluate_model2(model, X_val, y_val):
    y_pred_prob = model.predict_proba(X_val)[:, 1]
    y_pred = (y_pred_prob >= 0.5).astype(int)

    precision, recall, _ = precision_recall_curve(y_val, y_pred_prob)
    prauc = auc(recall, precision)
    print(f"PRAUC: {prauc}")

In [19]:
model = train_model2(X_train, y_train)
joblib.dump(model, "baseline_catboost_350k_v3.pkl")
evaluate_model2(model, X_val, y_val)

0:	learn: 0.6921859	total: 2.88s	remaining: 7h 11m 55s
400:	learn: 0.5864644	total: 1m 3s	remaining: 22m 36s
600:	learn: 0.5745334	total: 1m 32s	remaining: 21m 33s
800:	learn: 0.5660990	total: 2m 1s	remaining: 20m 44s
1000:	learn: 0.5595598	total: 2m 30s	remaining: 20m 3s
1200:	learn: 0.5540097	total: 2m 59s	remaining: 19m 26s
1400:	learn: 0.5491921	total: 3m 28s	remaining: 18m 51s
1600:	learn: 0.5449351	total: 3m 57s	remaining: 18m 17s
1800:	learn: 0.5409196	total: 4m 26s	remaining: 17m 45s
2000:	learn: 0.5373727	total: 4m 55s	remaining: 17m 13s
2200:	learn: 0.5338834	total: 5m 24s	remaining: 16m 42s
2400:	learn: 0.5304591	total: 5m 53s	remaining: 16m 12s
2600:	learn: 0.5271096	total: 6m 22s	remaining: 15m 42s
2800:	learn: 0.5237672	total: 6m 52s	remaining: 15m 12s
3000:	learn: 0.5205166	total: 7m 21s	remaining: 14m 43s
3200:	learn: 0.5173701	total: 7m 51s	remaining: 14m 14s
3400:	learn: 0.5144698	total: 8m 20s	remaining: 13m 44s
3600:	learn: 0.5115798	total: 8m 50s	remaining: 13m 15s

In [None]:
del model

In [None]:
gc.collect()