In [1]:
### Directory paths used by the notebook

path = "/"                # not referenced anywhere in the visible part of the notebook
path_data = "test_data/"  # prefix for the CSV dumps; passed to load_initial_tables as save_file_path
path_result = "result/"   # presumably for result artifacts; unused in the visible cells (TODO confirm)
path_models = "models/"   # presumably for saved models; unused in the visible cells (TODO confirm)

### Выгрузка данных из БД (и сохранение локально)

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sqlalchemy import create_engine
from tqdm import tqdm
import gc


def get_engine():
    """Create a SQLAlchemy engine for the PostgreSQL database used by this notebook.

    The connection URL is read from the ``STARTML_DATABASE_URL`` environment
    variable when it is set and non-empty. Otherwise the original hard-coded
    URL is used, so existing runs keep working unchanged.

    Returns:
        Whatever ``create_engine`` returns for the selected URL.
    """
    import os  # local import: the notebook's import cell does not import os

    # NOTE(security): the fallback URL embeds a user name and password in source.
    # Prefer supplying STARTML_DATABASE_URL from the environment and removing
    # the literal once every runner of this notebook sets the variable.
    default_url = "postgresql://robot-startml-ro:pheiph0hahj1Vaif@postgres.lab.karpov.courses:6432/startml"
    SQLALCHEMY_DATABASE_URL = os.environ.get("STARTML_DATABASE_URL") or default_url
    return create_engine(SQLALCHEMY_DATABASE_URL)


def execute_query(engine, query, chunksize=50000):
    """Lazily yield the result of ``query`` as a sequence of DataFrame chunks.

    A connection is opened via ``engine.connect()`` and stays open while the
    generator is being consumed; it is released when the ``with`` block exits.

    Args:
        engine: object whose ``connect()`` returns a context manager yielding
            a connection accepted by ``pd.read_sql``.
        query: SQL text handed to ``pd.read_sql`` unchanged.
        chunksize: forwarded to ``pd.read_sql`` as ``chunksize``.

    Yields:
        The chunks produced by ``pd.read_sql``, in order.
    """
    with engine.connect() as db_connection:
        yield from pd.read_sql(query, db_connection, chunksize=chunksize)


def load_and_save_table(engine, table_name, save_file_path, chunksize):
    """Download ``public.<table_name>`` in chunks, dump it to CSV and return it.

    Args:
        engine: passed through to ``execute_query``.
        table_name: interpolated verbatim into ``SELECT * FROM public.{table_name}``.
            It is NOT escaped (a caller in this file even appends a ``limit``
            clause through it), so it must never come from untrusted input.
        save_file_path: destination CSV path, passed to ``save_to_csv``.
        chunksize: chunk size used both for reading and for writing.

    Returns:
        One DataFrame holding all rows, with a fresh 0..n-1 index.
    """
    query = f"SELECT * FROM public.{table_name}"
    # The chunks have to stay referenced until they are concatenated, so a
    # per-chunk `del` + `gc.collect()` frees nothing; it only adds a full GC
    # pass per iteration. Collect the chunks directly instead.
    data_chunks = list(
        tqdm(execute_query(engine, query, chunksize=chunksize), desc=f"Loading {table_name}...")
    )
    # ignore_index=True: avoid carrying repeated per-chunk index labels into the result.
    data = pd.concat(data_chunks, ignore_index=True)
    save_to_csv(data, save_file_path, chunksize)
    return data


def save_to_csv(data, filepath, chunksize=50000):
    """Write ``data`` to ``filepath`` as comma-separated CSV, ``chunksize`` rows at a time.

    The first chunk overwrites the file and writes the header; later chunks
    are appended without a header. The index is never written.

    Args:
        data: DataFrame to write.
        filepath: destination path.
        chunksize: number of rows per ``to_csv`` call.
    """
    total_rows = data.shape[0]
    if total_rows == 0:
        # With no rows the chunk loop would never run and no file would be
        # produced at all; still emit a header-only CSV so readers find it.
        data.to_csv(filepath, index=False, mode='w', sep=',')
        return

    for start in tqdm(range(0, total_rows, chunksize), desc=f"Saving {filepath} to CSV..."):
        is_first_chunk = start == 0
        # Slices are views into `data`, so there is nothing to garbage-collect per chunk.
        data.iloc[start:start + chunksize].to_csv(
            filepath,
            index=False,
            mode='w' if is_first_chunk else 'a',
            header=is_first_chunk,
            sep=',',
        )


def load_initial_tables(engine, save_file_path, chunksize=50000):
    """Download the three source tables, join them and return the result ordered by time.

    Each table is fetched with ``load_and_save_table`` (which also dumps it to
    ``<save_file_path><name>.csv``). The feed is inner-joined to users on
    ``user_id`` and to posts on ``post_id``; ``timestamp`` is converted to
    datetime; rows are sorted by it and re-indexed from zero. The joined frame
    is additionally written to ``<save_file_path>all_data.csv``.

    Args:
        engine: passed through to ``load_and_save_table``.
        save_file_path: path prefix for every CSV written here.
        chunksize: chunk size for reading and writing.

    Returns:
        The joined, time-sorted DataFrame.
    """
    users = load_and_save_table(engine, "user_data", save_file_path + "user_data.csv", chunksize)
    posts = load_and_save_table(engine, "post_text_df", save_file_path + "post_text_df.csv", chunksize)
    # The "limit 5000" suffix travels inside the table-name argument and ends up in the SQL text.
    feed = load_and_save_table(engine, "feed_data limit 5000", save_file_path + "feed_data.csv", chunksize)

    # Join the tables
    joined = feed.merge(users, on='user_id', how='inner').merge(posts, on='post_id', how='inner')

    # Order the rows chronologically
    display("Сортировка данных по дате: ")
    joined["timestamp"] = pd.to_datetime(joined["timestamp"])
    ordered = joined.sort_values(by="timestamp").reset_index(drop=True)

    save_to_csv(ordered, save_file_path + "all_data.csv", chunksize)
    return ordered


def train_test_split_sorted(data, train_size=0.8):
    """Split ``data`` by row position into a leading train part and a trailing test part.

    No shuffling is done: the first ``int(len(data) * train_size)`` rows become
    the train set and the remaining rows the test set. Both parts are
    independent copies and are shown with ``display`` before being returned.

    Args:
        data: DataFrame to split (the caller passes it already sorted by time).
        train_size: fraction of rows that goes into the train part.

    Returns:
        ``(train, test)`` DataFrames.
    """
    cutoff = int(len(data) * train_size)
    train, test = data.iloc[:cutoff].copy(), data.iloc[cutoff:].copy()

    previews = (
        ("Предварительная выборка на трейн: ", train),
        ("Предварительная выборка на тест: ", test),
    )
    for caption, subset in previews:
        display(caption)
        display(subset)

    return train, test

### ----->>

# 1. Pull the data from the database and save it under the local data directory
engine = get_engine()
chunksize = 50000
# Pass the variable defined above instead of repeating the literal, so the
# chunk size is configured in exactly one place.
initial_table = load_initial_tables(engine=engine, save_file_path=path_data, chunksize=chunksize)
display("1. Начальные данные загружены!")

# 1.1 Split the time-sorted data into train and test, 80/20
train_data, test_data = train_test_split_sorted(initial_table, train_size=0.8)

Loading user_data...: 4it [00:02,  1.60it/s]
Saving test_data/user_data.csv to CSV...: 100%|██████████| 4/4 [00:00<00:00,  5.82it/s]
Loading post_text_df...: 1it [00:00,  1.57it/s]
Saving test_data/post_text_df.csv to CSV...: 100%|██████████| 1/1 [00:00<00:00,  3.81it/s]
Loading feed_data limit 5000...: 1it [00:00,  2.93it/s]
Saving test_data/feed_data.csv to CSV...: 100%|██████████| 1/1 [00:00<00:00,  9.39it/s]


'Сортировка данных по дате: '

Saving test_data/all_data.csv to CSV...: 100%|██████████| 1/1 [00:00<00:00,  4.13it/s]


'1. Начальные данные загружены!'

'Предварительная выборка на трейн: '

Unnamed: 0,timestamp,user_id,post_id,action,target,gender,age,country,city,exp_group,os,source,text,topic
0,2021-10-01 12:46:11,51687,4348,view,0,0,29,Russia,Krasnogorsk,2,Android,ads,I was unlucky enough to have seen this at the ...,movie
1,2021-10-01 12:46:31,51687,675,view,0,0,29,Russia,Krasnogorsk,2,Android,ads,Greer attacks bully Big Brother\n\nGermaine Gr...,entertainment
2,2021-10-01 12:47:12,51687,5807,view,0,0,29,Russia,Krasnogorsk,2,Android,ads,"Sharp, well-made documentary focusing on Mardi...",movie
3,2021-10-01 12:47:39,51687,6558,view,0,0,29,Russia,Krasnogorsk,2,Android,ads,I first saw Thief as a child which makes me al...,movie
4,2021-10-01 12:48:31,51687,3559,view,0,0,29,Russia,Krasnogorsk,2,Android,ads,How this year is going (baseball edition) #202...,covid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,2021-12-07 06:56:15,51686,2975,view,1,1,18,Russia,Yekaterinburg,2,Android,ads,It was reported that a witness said he heard t...,covid
3996,2021-12-07 06:58:17,51686,2975,like,0,1,18,Russia,Yekaterinburg,2,Android,ads,It was reported that a witness said he heard t...,covid
3997,2021-12-07 06:58:19,51686,1836,view,1,1,18,Russia,Yekaterinburg,2,Android,ads,Ferguson rues failure to cut gap\n\nBoss Sir A...,sport
3998,2021-12-07 07:00:56,51686,1836,like,0,1,18,Russia,Yekaterinburg,2,Android,ads,Ferguson rues failure to cut gap\n\nBoss Sir A...,sport


'Предварительная выборка на тест: '

Unnamed: 0,timestamp,user_id,post_id,action,target,gender,age,country,city,exp_group,os,source,text,topic
4000,2021-12-07 07:02:29,51686,6745,view,0,1,18,Russia,Yekaterinburg,2,Android,ads,"Michael Allreds comic book stories, particular...",movie
4001,2021-12-07 07:03:43,51686,1898,view,1,1,18,Russia,Yekaterinburg,2,Android,ads,Serena becomes world number two\n\nSerena Will...,sport
4002,2021-12-07 07:05:02,51686,1898,like,0,1,18,Russia,Yekaterinburg,2,Android,ads,Serena becomes world number two\n\nSerena Will...,sport
4003,2021-12-07 07:05:04,51686,1134,view,1,1,18,Russia,Yekaterinburg,2,Android,ads,Lib Dems unveil election slogan\n\nThe Liberal...,politics
4004,2021-12-07 07:07:45,51686,1134,like,0,1,18,Russia,Yekaterinburg,2,Android,ads,Lib Dems unveil election slogan\n\nThe Liberal...,politics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2021-12-29 18:19:41,498,1837,view,0,0,33,Russia,Bol’shoye Boldino,1,iOS,ads,Ferguson hails Man Utds resolve\n\nManchester ...,sport
4996,2021-12-29 18:21:52,498,4203,view,0,0,33,Russia,Bol’shoye Boldino,1,iOS,ads,#Serena sets up #Venus clash on return after #...,covid
4997,2021-12-29 18:22:10,498,1787,view,0,0,33,Russia,Bol’shoye Boldino,1,iOS,ads,Officials respond in court row\n\nAustralian t...,sport
4998,2021-12-29 18:24:22,498,3433,view,0,0,33,Russia,Bol’shoye Boldino,1,iOS,ads,Important #mask #tip- the Val es defeat the pu...,covid


### Функция для подготовки данных к трансформеру

In [3]:
def prepare_data(data):
    """Build the binary ``like_target`` and split a merged feed frame into X and y.

    ``like_target`` is 1 for rows where ``action == 'like'`` or ``target == 1``
    and 0 otherwise. The ``action`` and ``target`` columns are then dropped and
    ``user_id`` becomes the index. The input frame is left untouched; a new
    frame is returned.

    Args:
        data: DataFrame with at least ``action``, ``target`` and ``user_id`` columns.

    Returns:
        A tuple ``(data, X_data, y_data, categorical_cols, numeric_cols,
        passthrough_cols, target_name)`` where ``data`` is the prepared frame,
        ``X_data`` is ``data`` without ``like_target``, ``y_data`` is the
        ``like_target`` Series, and the remaining items are the column-role
        lists defined below.
    """
    ### 1. Column roles for the features coming from the database -->>

    # Categorical features
    categorical_cols = ["country", "city", "os", "source", "topic"]
    # Numeric features (none at the moment)
    numeric_cols = []
    # Columns that get no initial processing (no encoding, scaling, etc.)
    passthrough_cols = ["gender", "age", "exp_group", "post_id"]

    ### 2. Build the target -->>

    # like_target: whether the user liked the post. Computed column-wise rather
    # than with a row-wise apply: same values, much faster, and it also works
    # on an empty frame.
    liked = (data["action"] == "like") | (data["target"] == 1)
    target_name = ["like_target"]

    # Disabled idea, kept as a note: a per-(user_id, topic) mean of like_target
    # ("user_topic_rating") as a regression target for ranking the top-5 posts.

    ### 3. Drop the source columns of the target and index by user -->>

    # assign/drop/set_index return new frames, so the caller's `data` is not mutated.
    data = (
        data.assign(like_target=liked.astype(int))
        .drop(["action", "target"], axis=1)
        .set_index("user_id")
    )

    ### 4. Split into X and y -->>

    X_data = data.drop("like_target", axis=1)
    y_data = data["like_target"]

    return data, X_data, y_data, categorical_cols, numeric_cols, passthrough_cols, target_name
    
    

# Prepare the train split; the column-role lists are identical for both calls.
(train_data, X_train, y_train,
 categorical_cols, numeric_cols, passthrough_cols, target_name) = prepare_data(train_data)
display("На трейн: ", train_data)

# Prepare the test split the same way.
(test_data, X_test, y_test,
 categorical_cols, numeric_cols, passthrough_cols, target_name) = prepare_data(test_data)
display("На тест: ", test_data)

'На трейн: '

Unnamed: 0_level_0,timestamp,post_id,gender,age,country,city,exp_group,os,source,text,topic,like_target,user_topic_rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
51687,2021-10-01 12:46:11,4348,0,29,Russia,Krasnogorsk,2,Android,ads,I was unlucky enough to have seen this at the ...,movie,0,0.306748
51687,2021-10-01 12:46:31,675,0,29,Russia,Krasnogorsk,2,Android,ads,Greer attacks bully Big Brother\n\nGermaine Gr...,entertainment,0,0.333333
51687,2021-10-01 12:47:12,5807,0,29,Russia,Krasnogorsk,2,Android,ads,"Sharp, well-made documentary focusing on Mardi...",movie,0,0.306748
51687,2021-10-01 12:47:39,6558,0,29,Russia,Krasnogorsk,2,Android,ads,I first saw Thief as a child which makes me al...,movie,0,0.306748
51687,2021-10-01 12:48:31,3559,0,29,Russia,Krasnogorsk,2,Android,ads,How this year is going (baseball edition) #202...,covid,0,0.376068
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51686,2021-12-07 06:56:15,2975,1,18,Russia,Yekaterinburg,2,Android,ads,It was reported that a witness said he heard t...,covid,1,0.229008
51686,2021-12-07 06:58:17,2975,1,18,Russia,Yekaterinburg,2,Android,ads,It was reported that a witness said he heard t...,covid,1,0.229008
51686,2021-12-07 06:58:19,1836,1,18,Russia,Yekaterinburg,2,Android,ads,Ferguson rues failure to cut gap\n\nBoss Sir A...,sport,1,0.219512
51686,2021-12-07 07:00:56,1836,1,18,Russia,Yekaterinburg,2,Android,ads,Ferguson rues failure to cut gap\n\nBoss Sir A...,sport,1,0.219512


'На тест: '

Unnamed: 0_level_0,timestamp,post_id,gender,age,country,city,exp_group,os,source,text,topic,like_target,user_topic_rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
51686,2021-12-07 07:02:29,6745,1,18,Russia,Yekaterinburg,2,Android,ads,"Michael Allreds comic book stories, particular...",movie,0,0.188679
51686,2021-12-07 07:03:43,1898,1,18,Russia,Yekaterinburg,2,Android,ads,Serena becomes world number two\n\nSerena Will...,sport,1,0.608696
51686,2021-12-07 07:05:02,1898,1,18,Russia,Yekaterinburg,2,Android,ads,Serena becomes world number two\n\nSerena Will...,sport,1,0.608696
51686,2021-12-07 07:05:04,1134,1,18,Russia,Yekaterinburg,2,Android,ads,Lib Dems unveil election slogan\n\nThe Liberal...,politics,1,0.444444
51686,2021-12-07 07:07:45,1134,1,18,Russia,Yekaterinburg,2,Android,ads,Lib Dems unveil election slogan\n\nThe Liberal...,politics,1,0.444444
...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,2021-12-29 18:19:41,1837,0,33,Russia,Bol’shoye Boldino,1,iOS,ads,Ferguson hails Man Utds resolve\n\nManchester ...,sport,0,0.160000
498,2021-12-29 18:21:52,4203,0,33,Russia,Bol’shoye Boldino,1,iOS,ads,#Serena sets up #Venus clash on return after #...,covid,0,0.204082
498,2021-12-29 18:22:10,1787,0,33,Russia,Bol’shoye Boldino,1,iOS,ads,Officials respond in court row\n\nAustralian t...,sport,0,0.160000
498,2021-12-29 18:24:22,3433,0,33,Russia,Bol’shoye Boldino,1,iOS,ads,Important #mask #tip- the Val es defeat the pu...,covid,0,0.204082


### 1. Трансформер (одиночный, без группировки (только кодирование и скалирование))

In [4]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder as SklearnOneHotEncoder
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Encode/scale the configured columns and append per-user statistics.

    ``fit`` builds a ``ColumnTransformer`` with four parts, in this order:
    one-hot encoding (``categorical_cols_ohe``), mean-target encoding
    (``categorical_cols_mte``), standard scaling (``numeric_cols``) and
    pass-through (``passthrough_cols``). It also stores, per ``user_id``, the
    number of training rows and the mean of the target.

    ``transform`` returns a DataFrame indexed by ``user_id`` whose columns are
    named by ``get_feature_names_out``: the four parts above followed by
    ``userViews`` and ``userMeans``.

    Notes:
        * ``X`` is grouped by ``'user_id'`` and ``reset_index()['user_id']`` is
          read, so ``user_id`` is expected to be the (named) index of ``X``.
        * ``y`` must be a Series named ``target_name[0]``.
        * Gaussian noise with standard deviation ``noise_k`` is added to
          ``userMeans`` on every ``transform`` call, drawn from NumPy's global
          RNG; results are not reproducible unless that RNG is seeded.

    Args:
        categorical_cols_ohe: column names to one-hot encode.
        categorical_cols_mte: column names to mean-target encode.
        numeric_cols: column names to standard-scale.
        passthrough_cols: column names copied through unchanged.
        target_name: one-element list with the target column name.
        noise_k: standard deviation of the noise added to ``userMeans``.
    """

    def __init__(self, categorical_cols_ohe, categorical_cols_mte, numeric_cols, passthrough_cols, target_name, noise_k=0.006):
        self.categorical_cols_ohe = categorical_cols_ohe
        self.categorical_cols_mte = categorical_cols_mte
        self.numeric_cols = numeric_cols
        self.passthrough_cols = passthrough_cols
        self.target_name = target_name
        self.noise_k = noise_k
        self.col_transform = None  # fitted ColumnTransformer, set by fit()

    def fit(self, X, y):
        """Fit the column encoders on ``X``/``y`` and record per-user statistics.

        Raises:
            ValueError: if a configured column name is not present in ``X``.
        """
        columns = list(X.columns)

        def positions(names):
            # Positional indices of the given column names (first match each).
            return [columns.index(name) for name in names]

        # handle_unknown controls how categories unseen during fit are treated.
        # NOTE(review): confirm that the installed category_encoders version
        # accepts handle_unknown='impute'; recent releases document
        # 'value', 'return_nan' and 'error'.
        transformers = [
            ('OneHotEncoder', SklearnOneHotEncoder(handle_unknown='ignore', drop='first'), positions(self.categorical_cols_ohe)),
            ('MeanTargetEncoder', TargetEncoder(handle_unknown='impute'), positions(self.categorical_cols_mte)),
            ('StandardScaler', StandardScaler(), positions(self.numeric_cols)),
            ('Passthrough', 'passthrough', positions(self.passthrough_cols)),
        ]

        # sparse_threshold=0 forces a dense result. With the default threshold a
        # sparse one-hot block can make the combined output a scipy sparse
        # matrix, which transform() cannot wrap into a column-per-feature frame.
        self.col_transform = ColumnTransformer(transformers=transformers, sparse_threshold=0)
        self.col_transform.fit(X, y)

        # Remember per-user row counts and target means for the extra features.
        self._save_user_stats(X, y)

        return self

    def transform(self, X):
        """Return the encoded features plus ``userViews``/``userMeans``, indexed by ``user_id``."""
        encoded = pd.DataFrame(self.col_transform.transform(X))

        # Append the per-user features and move user_id into the index.
        encoded = self._create_additional_features(X, encoded)

        # Finally, give the columns their names.
        encoded.columns = self.get_feature_names_out()

        return encoded

    def _save_user_stats(self, X, y):
        """Store per-user row counts and per-user target means computed on the fit data."""
        X_with_target = pd.concat([X, y], axis=1)
        by_user = X_with_target.groupby('user_id')
        user_count_views = by_user.size()
        # Selecting with a list keeps a DataFrame, so to_dict() below is keyed
        # by target name first: {target: {user_id: mean}}.
        user_means = by_user[self.target_name].mean()

        self.user_stats = {
            'views': user_count_views.to_dict(),
            'means': user_means.to_dict()
        }

    def _create_additional_features(self, X, X_transformed):
        """Add ``userViews`` and ``userMeans`` to ``X_transformed`` and index it by ``user_id``.

        Users not seen during fit receive the mean of the stored statistic.
        """
        user_count_views = pd.Series(self.user_stats['views'])
        user_means = pd.Series(self.user_stats['means'][self.target_name[0]])

        # Positional (0..n-1) user ids, aligned with the rows of X_transformed.
        user_ids = X.reset_index()['user_id']

        # Number of fit rows for the user
        X_transformed['userViews'] = user_ids.map(user_count_views).fillna(user_count_views.mean())
        # Mean target of the user, blurred with Gaussian noise
        X_transformed['userMeans'] = user_ids.map(user_means).fillna(user_means.mean()) + np.random.normal(0, self.noise_k, X.shape[0])

        X_transformed["user_id"] = user_ids

        X_transformed.set_index("user_id", inplace=True)

        return X_transformed

    def _ohe_feature_names(self):
        """Names of the one-hot output columns, or an empty list when no column is one-hot encoded."""
        if len(self.categorical_cols_ohe) == 0:
            # With an empty column selection the encoder is not fitted by the
            # ColumnTransformer, so asking it for names would fail.
            return []
        encoder = self.col_transform.named_transformers_['OneHotEncoder']
        return list(encoder.get_feature_names_out(self.categorical_cols_ohe))

    def get_feature_names_out(self, input_features=None):
        """Return the output column names in the order produced by ``transform``.

        Args:
            input_features: accepted for compatibility with the scikit-learn
                ``get_feature_names_out(input_features)`` call convention; ignored.
        """
        names = (
            self._ohe_feature_names()
            + list(self.categorical_cols_mte)   # TargetEncoder keeps the column names
            + list(self.numeric_cols)
            + list(self.passthrough_cols)
            + ['userViews', 'userMeans']
        )
        return np.array(names, dtype=object)


In [5]:

# Route each categorical feature by its cardinality in the training split:
# fewer than 5 distinct values -> one-hot encoding, otherwise mean-target encoding.
# numeric_cols, passthrough_cols and target_name are used as returned by prepare_data.
cols_for_ohe = [name for name in categorical_cols if X_train[name].nunique() < 5]
cols_for_mte = [name for name in categorical_cols if name not in cols_for_ohe]

pipeline = Pipeline(steps=[
    (
        'custom_transformer',
        CustomTransformer(
            categorical_cols_ohe=cols_for_ohe,
            categorical_cols_mte=cols_for_mte,
            numeric_cols=numeric_cols,
            passthrough_cols=passthrough_cols,
            target_name=target_name,
        ),
    ),
])

# Fit on the training split, then transform that same split for inspection.
X_transformed = pipeline.fit(X_train, y_train).transform(X_train)
display(X_transformed)




Unnamed: 0_level_0,country_Turkey,country_Ukraine,os_iOS,source_organic,city,topic,gender,age,exp_group,post_id,userViews,userMeans
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
51687,0.0,0.0,0.0,0.0,0.288848,0.217807,0.0,29.0,2.0,4348.0,547,0.286332
51687,0.0,0.0,0.0,0.0,0.288848,0.236025,0.0,29.0,2.0,675.0,547,0.287479
51687,0.0,0.0,0.0,0.0,0.288848,0.217807,0.0,29.0,2.0,5807.0,547,0.291366
51687,0.0,0.0,0.0,0.0,0.288848,0.217807,0.0,29.0,2.0,6558.0,547,0.291766
51687,0.0,0.0,0.0,0.0,0.288848,0.236038,0.0,29.0,2.0,3559.0,547,0.290505
...,...,...,...,...,...,...,...,...,...,...,...,...
51686,0.0,0.0,0.0,0.0,0.209073,0.236038,1.0,18.0,2.0,2975.0,507,0.216176
51686,0.0,0.0,0.0,0.0,0.209073,0.236038,1.0,18.0,2.0,2975.0,507,0.201945
51686,0.0,0.0,0.0,0.0,0.209073,0.190476,1.0,18.0,2.0,1836.0,507,0.196769
51686,0.0,0.0,0.0,0.0,0.209073,0.190476,1.0,18.0,2.0,1836.0,507,0.201749


### 1.1 Проверка работоспособности трансформера

In [6]:
# Smoke test: push the first test row through the fitted pipeline.
display('Ввод: ', X_test.iloc[:1])

display('Вывод: ')
X_transformed_test = pipeline.transform(X_test.iloc[:1])
display(X_transformed_test)

'Ввод: '

Unnamed: 0_level_0,timestamp,post_id,gender,age,country,city,exp_group,os,source,text,topic,user_topic_rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
51686,2021-12-07 07:02:29,6745,1,18,Russia,Yekaterinburg,2,Android,ads,"Michael Allreds comic book stories, particular...",movie,0.188679


'Вывод: '

Unnamed: 0_level_0,country_Turkey,country_Ukraine,os_iOS,source_organic,city,topic,gender,age,exp_group,post_id,userViews,userMeans
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
51686,0.0,0.0,0.0,0.0,0.209073,0.217807,1.0,18.0,2.0,6745.0,507,0.216682


### 2. Добавим в трансформер группировку по "topic" (для уменьшения кол-ва записей)

In [7]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder as SklearnOneHotEncoder
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Encode/scale the configured columns, add per-user statistics, aggregate per (user, topic).

    ``fit`` builds a ``ColumnTransformer`` with four parts, in this order:
    one-hot encoding (``categorical_cols_ohe``), mean-target encoding
    (``categorical_cols_mte``), standard scaling (``numeric_cols``) and
    pass-through (``passthrough_cols``). It also stores, per ``user_id``, the
    number of training rows and the mean of the target.

    ``transform`` encodes ``X``, appends ``userViews`` and ``userMeans``, and
    then collapses the rows to one per ``(user_id, topic)`` pair. The result is
    indexed by ``user_id``.

    Notes:
        * ``X`` is grouped by ``'user_id'`` and ``reset_index()['user_id']`` is
          read, so ``user_id`` is expected to be the (named) index of ``X``.
        * ``y`` must be a Series named ``target_name[0]``.
        * ``transform`` changes the number of rows, so its output no longer
          lines up row-by-row with the ``y`` that accompanied ``X``.
        * Grouping uses the *encoded* ``topic`` column, which therefore has to
          be among ``categorical_cols_mte``; topics that happen to share the
          same encoded value end up in the same group.
        * Gaussian noise with standard deviation ``noise_k`` is added to
          ``userMeans`` on every ``transform`` call, drawn from NumPy's global
          RNG; results are not reproducible unless that RNG is seeded.

    Args:
        categorical_cols_ohe: column names to one-hot encode.
        categorical_cols_mte: column names to mean-target encode.
        numeric_cols: column names to standard-scale.
        passthrough_cols: column names copied through unchanged.
        target_name: one-element list with the target column name.
        noise_k: standard deviation of the noise added to ``userMeans``.
    """

    def __init__(self, categorical_cols_ohe, categorical_cols_mte, numeric_cols, passthrough_cols, target_name, noise_k=0.006):
        self.categorical_cols_ohe = categorical_cols_ohe
        self.categorical_cols_mte = categorical_cols_mte
        self.numeric_cols = numeric_cols
        self.passthrough_cols = passthrough_cols
        self.target_name = target_name
        self.noise_k = noise_k
        self.col_transform = None  # fitted ColumnTransformer, set by fit()

    def fit(self, X, y):
        """Fit the column encoders on ``X``/``y`` and record per-user statistics.

        Raises:
            ValueError: if a configured column name is not present in ``X``.
        """
        columns = list(X.columns)

        def positions(names):
            # Positional indices of the given column names (first match each).
            return [columns.index(name) for name in names]

        # handle_unknown controls how categories unseen during fit are treated.
        # NOTE(review): confirm that the installed category_encoders version
        # accepts handle_unknown='impute'; recent releases document
        # 'value', 'return_nan' and 'error'.
        transformers = [
            ('OneHotEncoder', SklearnOneHotEncoder(handle_unknown='ignore', drop='first'), positions(self.categorical_cols_ohe)),
            ('MeanTargetEncoder', TargetEncoder(handle_unknown='impute'), positions(self.categorical_cols_mte)),
            ('StandardScaler', StandardScaler(), positions(self.numeric_cols)),
            ('Passthrough', 'passthrough', positions(self.passthrough_cols)),
        ]

        # sparse_threshold=0 forces a dense result. With the default threshold a
        # sparse one-hot block can make the combined output a scipy sparse
        # matrix, which transform() cannot wrap into a column-per-feature frame.
        self.col_transform = ColumnTransformer(transformers=transformers, sparse_threshold=0)
        self.col_transform.fit(X, y)

        # Remember per-user row counts and target means for the extra features.
        self._save_user_stats(X, y)

        return self

    def transform(self, X):
        """Return one aggregated feature row per ``(user_id, topic)``, indexed by ``user_id``."""
        encoded = pd.DataFrame(self.col_transform.transform(X))

        # Append the per-user features and move user_id into the index.
        encoded = self._create_additional_features(X, encoded)

        # Give the columns their names before grouping by them.
        encoded.columns = self.get_feature_names_out()

        # Collapse to one row per (user_id, topic).
        return self._group_by_user_and_topic(X, encoded)

    def _save_user_stats(self, X, y):
        """Store per-user row counts and per-user target means computed on the fit data."""
        X_with_target = pd.concat([X, y], axis=1)
        by_user = X_with_target.groupby('user_id')
        user_count_views = by_user.size()
        # Selecting with a list keeps a DataFrame, so to_dict() below is keyed
        # by target name first: {target: {user_id: mean}}.
        user_means = by_user[self.target_name].mean()

        self.user_stats = {
            'views': user_count_views.to_dict(),
            'means': user_means.to_dict()
        }

    def _create_additional_features(self, X, X_transformed):
        """Add ``userViews`` and ``userMeans`` to ``X_transformed`` and index it by ``user_id``.

        Users not seen during fit receive the mean of the stored statistic.
        """
        user_count_views = pd.Series(self.user_stats['views'])
        user_means = pd.Series(self.user_stats['means'][self.target_name[0]])

        # Positional (0..n-1) user ids, aligned with the rows of X_transformed.
        user_ids = X.reset_index()['user_id']

        # Number of fit rows for the user
        X_transformed['userViews'] = user_ids.map(user_count_views).fillna(user_count_views.mean())
        # Mean target of the user, blurred with Gaussian noise
        X_transformed['userMeans'] = user_ids.map(user_means).fillna(user_means.mean()) + np.random.normal(0, self.noise_k, X.shape[0])

        X_transformed["user_id"] = user_ids

        X_transformed.set_index("user_id", inplace=True)

        return X_transformed

    def _group_by_user_and_topic(self, X, X_transformed):
        """Aggregate ``X_transformed`` to one row per ``(user_id, topic)``.

        One-hot and pass-through columns are reduced with the mode (first mode
        on ties, NaN if there is none); every other column with the mean.

        Raises:
            KeyError: if there is no ``topic`` column to group by.
        """
        if 'topic' not in X_transformed.columns:
            raise KeyError("'topic' must be among categorical_cols_mte to group by it")

        # Use the encoder's exact output names. Matching by substring would
        # also catch unrelated columns (e.g. "os" is contained in "post_id").
        mode_cols = set(self._ohe_feature_names()) | set(self.passthrough_cols)

        def first_mode(values):
            # Compute the mode once per group instead of twice.
            modes = values.mode()
            return modes.iloc[0] if not modes.empty else np.nan

        agg_dict = {}
        for col in X_transformed.columns:
            if col in mode_cols:
                agg_dict[col] = first_mode
            elif col != 'user_id' and col != 'topic':
                agg_dict[col] = 'mean'

        # user_id is an index level, topic is a column; groupby accepts both by name.
        X_grouped = X_transformed.groupby(['user_id', 'topic']).agg(agg_dict)

        # Bring both keys back as columns, then keep only user_id as the index.
        X_grouped = X_grouped.reset_index()
        X_grouped.set_index("user_id", inplace=True)

        return X_grouped

    def _ohe_feature_names(self):
        """Names of the one-hot output columns, or an empty list when no column is one-hot encoded."""
        if len(self.categorical_cols_ohe) == 0:
            # With an empty column selection the encoder is not fitted by the
            # ColumnTransformer, so asking it for names would fail.
            return []
        encoder = self.col_transform.named_transformers_['OneHotEncoder']
        return list(encoder.get_feature_names_out(self.categorical_cols_ohe))

    def get_feature_names_out(self, input_features=None):
        """Return the column names of the encoded frame *before* grouping.

        Args:
            input_features: accepted for compatibility with the scikit-learn
                ``get_feature_names_out(input_features)`` call convention; ignored.
        """
        names = (
            self._ohe_feature_names()
            + list(self.categorical_cols_mte)   # TargetEncoder keeps the column names
            + list(self.numeric_cols)
            + list(self.passthrough_cols)
            + ['userViews', 'userMeans']
        )
        return np.array(names, dtype=object)

In [8]:

# Route each categorical feature by its cardinality in the training split:
# fewer than 5 distinct values -> one-hot encoding, otherwise mean-target encoding.
# numeric_cols, passthrough_cols and target_name are used as returned by prepare_data.
cols_for_ohe = [name for name in categorical_cols if X_train[name].nunique() < 5]
cols_for_mte = [name for name in categorical_cols if name not in cols_for_ohe]

pipeline = Pipeline(steps=[
    (
        'custom_transformer',
        CustomTransformer(
            categorical_cols_ohe=cols_for_ohe,
            categorical_cols_mte=cols_for_mte,
            numeric_cols=numeric_cols,
            passthrough_cols=passthrough_cols,
            target_name=target_name,
        ),
    ),
])

# Fit on the training split, then transform that same split for inspection.
X_transformed = pipeline.fit(X_train, y_train).transform(X_train)
display(X_transformed)




Unnamed: 0_level_0,topic,country_Turkey,country_Ukraine,os_iOS,source_organic,city,gender,age,exp_group,post_id,userViews,userMeans
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
495,0.155172,0.0,1.0,1.0,0.0,0.453608,1.0,21.0,1.0,237.0,97.0,0.448818
495,0.175781,0.0,1.0,1.0,0.0,0.453608,1.0,21.0,1.0,986.0,97.0,0.454471
495,0.178344,0.0,1.0,1.0,0.0,0.453608,1.0,21.0,1.0,2130.0,97.0,0.457888
495,0.190476,0.0,1.0,1.0,0.0,0.453608,1.0,21.0,1.0,1450.0,97.0,0.454926
495,0.217807,0.0,1.0,1.0,0.0,0.453608,1.0,21.0,1.0,4252.0,97.0,0.453933
...,...,...,...,...,...,...,...,...,...,...,...,...
141788,0.178344,0.0,0.0,0.0,1.0,0.103321,1.0,18.0,3.0,2166.0,271.0,0.102598
141788,0.190476,0.0,0.0,0.0,1.0,0.103321,1.0,18.0,3.0,1442.0,271.0,0.102266
141788,0.217807,0.0,0.0,0.0,1.0,0.103321,1.0,18.0,3.0,4572.0,271.0,0.102320
141788,0.236025,0.0,0.0,0.0,1.0,0.103321,1.0,18.0,3.0,599.0,271.0,0.102440


### 2.1 Проверка работоспособности трансформера

In [9]:
# Smoke test: push the first test row through the fitted pipeline.
display('Ввод: ', X_test.iloc[:1])

display('Вывод: ')
X_transformed_test = pipeline.transform(X_test.iloc[:1])
display(X_transformed_test)

'Ввод: '

Unnamed: 0_level_0,timestamp,post_id,gender,age,country,city,exp_group,os,source,text,topic,user_topic_rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
51686,2021-12-07 07:02:29,6745,1,18,Russia,Yekaterinburg,2,Android,ads,"Michael Allreds comic book stories, particular...",movie,0.188679


'Вывод: '

Unnamed: 0_level_0,topic,country_Turkey,country_Ukraine,os_iOS,source_organic,city,gender,age,exp_group,post_id,userViews,userMeans
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
51686,0.217807,0.0,0.0,0.0,0.0,0.209073,1.0,18.0,2.0,6745.0,507.0,0.208667


### 3. Трансформер. Группировка по "post_id" (для уменьшения кол-ва записей)

In [10]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder as SklearnOneHotEncoder
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Encode feed rows and collapse them to one row per (user_id, post_id).

    ``fit`` trains a ColumnTransformer (one-hot, mean-target, standard scaling,
    passthrough) and stores per-user statistics of the target. ``transform``
    applies the encoders, appends the per-user features ``userViews`` and
    ``userMeans`` and finally groups the rows by ``user_id`` and ``post_id``.

    Parameters
    ----------
    categorical_cols_ohe : list of str
        Columns encoded with one-hot encoding.
    categorical_cols_mte : list of str
        Columns encoded with mean-target encoding.
    numeric_cols : list of str
        Columns passed through StandardScaler.
    passthrough_cols : list of str
        Columns forwarded unchanged.
    target_name : str or sequence of str
        Name of the target column in ``y``; when a sequence is given, its first
        element is the column used for ``userMeans``.
    noise_k : float, default=0.006
        Standard deviation of the Gaussian noise added to ``userMeans``.

    Notes
    -----
    The input frame is expected to carry ``user_id`` as its index name (or as a
    column) and ``post_id`` among the columns kept by the transformer.
    The noise is drawn from NumPy's global random state on every ``transform``
    call, so repeated calls return slightly different ``userMeans`` unless the
    global state is seeded by the caller.
    """

    def __init__(self, categorical_cols_ohe, categorical_cols_mte, numeric_cols, passthrough_cols, target_name, noise_k=0.006):
        self.categorical_cols_ohe = categorical_cols_ohe
        self.categorical_cols_mte = categorical_cols_mte
        self.numeric_cols = numeric_cols
        self.passthrough_cols = passthrough_cols
        self.target_name = target_name
        self.noise_k = noise_k
        self.col_transform = None

    def fit(self, X, y):
        """Fit the column encoders on ``X``/``y`` and remember per-user target statistics.

        Returns ``self``.
        """
        # Build the column list once; .index() keeps the original
        # "first occurrence, ValueError when missing" semantics.
        columns = list(X.columns)
        cols_for_ohe_idx = [columns.index(col) for col in self.categorical_cols_ohe]
        cols_for_mte_idx = [columns.index(col) for col in self.categorical_cols_mte]
        numeric_cols_idx = [columns.index(col) for col in self.numeric_cols]
        passthrough_cols_idx = [columns.index(col) for col in self.passthrough_cols]

        # handle_unknown='ignore' / handle_unknown='impute' control how categories
        # that were not seen during fit are treated at transform time.
        # NOTE(review): 'impute' looks like the spelling used by older
        # category_encoders releases - confirm against the installed version.
        t = [
            ('OneHotEncoder', SklearnOneHotEncoder(handle_unknown='ignore', drop='first'), cols_for_ohe_idx),
            ('MeanTargetEncoder', TargetEncoder(handle_unknown='impute'), cols_for_mte_idx),
            ('StandardScaler', StandardScaler(), numeric_cols_idx),
            ('Passthrough', 'passthrough', passthrough_cols_idx)
        ]

        self.col_transform = ColumnTransformer(transformers=t)
        self.col_transform.fit(X, y)

        # Store per-user view counts and mean target for the extra features.
        self._save_user_stats(X, y)

        return self

    def transform(self, X):
        """Encode ``X``, add the per-user features and group by (user_id, post_id).

        Returns a DataFrame indexed by ``user_id`` whose first column is ``post_id``.
        """
        X_transformed = self.col_transform.transform(X)

        # ColumnTransformer can hand back a scipy sparse matrix; densify it so
        # that every encoded feature becomes its own DataFrame column.
        if hasattr(X_transformed, "toarray"):
            X_transformed = X_transformed.toarray()

        X_transformed = pd.DataFrame(X_transformed)

        # Append userViews / userMeans and move user_id into the index.
        X_transformed = self._create_additional_features(X, X_transformed)

        # Last step before grouping: give the positional columns their names.
        X_transformed.columns = self.get_feature_names_out()

        # Collapse duplicates: one row per (user_id, post_id).
        X_transformed = self._group_by_user_and_post(X, X_transformed)

        return X_transformed

    def _target_columns(self):
        """Return the target column name(s) as a list, accepting a str or a sequence."""
        if isinstance(self.target_name, str):
            return [self.target_name]
        return list(self.target_name)

    def _ohe_feature_names(self):
        """Return the exact output column names produced by the fitted one-hot encoder."""
        encoder = self.col_transform.named_transformers_['OneHotEncoder']
        return encoder.get_feature_names_out(self.categorical_cols_ohe)

    @staticmethod
    def _first_mode(values):
        """Return the first mode of ``values`` or NaN when there is none."""
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    def _save_user_stats(self, X, y):
        """Store, per user_id, the number of rows and the mean of the target column(s)."""
        X_with_target = pd.concat([X, y], axis=1)
        per_user = X_with_target.groupby('user_id')

        # 'means' is keyed by target column name, then by user_id.
        self.user_stats = {
            'views': per_user.size().to_dict(),
            'means': per_user[self._target_columns()].mean().to_dict()
        }

    def _create_additional_features(self, X, X_transformed):
        """Add ``userViews`` and ``userMeans`` to ``X_transformed`` and index it by user_id.

        Users unseen during fit receive the average of the stored statistics.
        """
        user_count_views = pd.Series(self.user_stats['views'])
        user_means = pd.Series(self.user_stats['means'][self._target_columns()[0]])

        # reset_index() yields a positional index that lines up with X_transformed.
        user_ids = X.reset_index()['user_id']

        # Number of rows seen for the user during fit.
        X_transformed['userViews'] = user_ids.map(user_count_views).fillna(user_count_views.mean())
        # Mean target of the user during fit, blurred with Gaussian noise.
        X_transformed['userMeans'] = (
            user_ids.map(user_means).fillna(user_means.mean())
            + np.random.normal(0, self.noise_k, X.shape[0])
        )

        X_transformed["user_id"] = user_ids

        return X_transformed.set_index("user_id")

    def _group_by_user_and_post(self, X, X_transformed):
        """Aggregate rows per (user_id, post_id): mode for OHE/passthrough columns, mean otherwise."""
        # Use the encoder's exact output names: substring matching would also
        # catch unrelated columns (e.g. 'os' is a substring of 'post_id').
        mode_cols = set(self._ohe_feature_names()) | set(self.passthrough_cols)

        agg_dict = {}
        for col in X_transformed.columns:
            # Grouping keys are restored from the group index, never aggregated.
            if col in ('user_id', 'post_id'):
                continue
            agg_dict[col] = self._first_mode if col in mode_cols else 'mean'

        X_grouped = X_transformed.groupby(['user_id', 'post_id']).agg(agg_dict)

        # Bring post_id back as a regular column and keep user_id as the index.
        X_grouped = X_grouped.reset_index().set_index("user_id")

        return X_grouped

    def get_feature_names_out(self, input_features=None):
        """Return the column names of the encoded frame before grouping.

        ``input_features`` is accepted for compatibility with the scikit-learn
        ``get_feature_names_out`` protocol and is not used.
        """
        names = [
            *self._ohe_feature_names(),
            *self.categorical_cols_mte,  # TargetEncoder keeps the column names
            *self.numeric_cols,
            *self.passthrough_cols,
            'userViews',
            'userMeans',
        ]
        return np.array(names, dtype=object)

In [11]:

# Split the categorical columns by cardinality: fewer than 5 distinct values
# -> one-hot encoding, otherwise -> mean-target encoding.
cols_for_ohe = [name for name in categorical_cols if X_train[name].nunique() < 5]
cols_for_mte = [name for name in categorical_cols if X_train[name].nunique() >= 5]

# numeric_cols, passthrough_cols and target_name are reused as they are
# (per the original note they come from the prepare_data function).

pipeline = Pipeline(steps=[
    (
        'custom_transformer',
        CustomTransformer(
            categorical_cols_ohe=cols_for_ohe,
            categorical_cols_mte=cols_for_mte,
            numeric_cols=numeric_cols,
            passthrough_cols=passthrough_cols,
            target_name=target_name,
        ),
    ),
])

# Fit on the training split, then transform that same split for inspection.
X_transformed = pipeline.fit(X_train, y_train).transform(X_train)
display(X_transformed)




Unnamed: 0_level_0,post_id,country_Turkey,country_Ukraine,os_iOS,source_organic,city,topic,gender,age,exp_group,userViews,userMeans
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
495,237.0,0.0,1.0,1.0,0.0,0.453608,0.155172,1.0,21.0,1.0,97.0,0.458147
495,285.0,0.0,1.0,1.0,0.0,0.453608,0.155172,1.0,21.0,1.0,97.0,0.458647
495,640.0,0.0,1.0,1.0,0.0,0.453608,0.236025,1.0,21.0,1.0,97.0,0.450139
495,692.0,0.0,1.0,1.0,0.0,0.453608,0.236025,1.0,21.0,1.0,97.0,0.446157
495,762.0,0.0,1.0,1.0,0.0,0.453608,0.236025,1.0,21.0,1.0,97.0,0.451722
...,...,...,...,...,...,...,...,...,...,...,...,...
141788,7237.0,0.0,0.0,0.0,1.0,0.103321,0.217807,1.0,18.0,3.0,271.0,0.105953
141788,7241.0,0.0,0.0,0.0,1.0,0.103321,0.217807,1.0,18.0,3.0,271.0,0.112093
141788,7272.0,0.0,0.0,0.0,1.0,0.103321,0.217807,1.0,18.0,3.0,271.0,0.110812
141788,7283.0,0.0,0.0,0.0,1.0,0.103321,0.217807,1.0,18.0,3.0,271.0,0.098708


### 3.1 Проверка работоспособности трансформера

In [12]:
# Smoke test for the post_id-grouping transformer: one test row in,
# one transformed row out.
display('Ввод: ', X_test.iloc[:1])

display('Вывод: ')
X_transformed_test = pipeline.transform(X_test.iloc[:1])
display(X_transformed_test)

'Ввод: '

Unnamed: 0_level_0,timestamp,post_id,gender,age,country,city,exp_group,os,source,text,topic,user_topic_rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
51686,2021-12-07 07:02:29,6745,1,18,Russia,Yekaterinburg,2,Android,ads,"Michael Allreds comic book stories, particular...",movie,0.188679


'Вывод: '

Unnamed: 0_level_0,post_id,country_Turkey,country_Ukraine,os_iOS,source_organic,city,topic,gender,age,exp_group,userViews,userMeans
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
51686,6745.0,0.0,0.0,0.0,0.0,0.209073,0.217807,1.0,18.0,2.0,507.0,0.204908


### 4. Функция для преобразования таргетной переменной (обработка y) для трансформера

In [98]:
def group_target(df, target_name):
    """Collapse the target to one value per (user_id, post_id) pair.

    For every pair the most frequent target value is kept (the first one
    returned by ``Series.mode`` on ties, NaN when there is no mode). The
    ``post_id`` key is dropped afterwards and the result is indexed by
    ``user_id``.
    """
    def _first_mode(values):
        modes = values.mode()
        return np.nan if modes.empty else modes.iloc[0]

    per_pair = df.groupby(['user_id', 'post_id'])[target_name].agg(_first_mode)
    flattened = per_pair.reset_index()
    return flattened.drop(columns='post_id').set_index('user_id')

In [102]:
# Collapse the training target to one value per (user_id, post_id) pair.
grouped_y = group_target(pd.concat([X_train, y_train], axis=1), "like_target")

display("y", grouped_y)
display("X", X_transformed)

# Glue features and target side by side.
# NOTE(review): both frames share a non-unique user_id index, so this appears
# to rely on them having identical row order - confirm before reuse.
df_train = pd.concat([X_transformed, grouped_y], axis=1)
display("Склейка: ", df_train)

'y'

Unnamed: 0_level_0,like_target
user_id,Unnamed: 1_level_1
495,0
495,0
495,1
495,1
495,1
...,...
141788,0
141788,0
141788,0
141788,0


'X'

Unnamed: 0_level_0,post_id,country_Turkey,country_Ukraine,os_iOS,source_organic,city,topic,gender,age,exp_group,userViews,userMeans
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
495,237.0,0.0,1.0,1.0,0.0,0.453608,0.155172,1.0,21.0,1.0,97.0,0.459908
495,285.0,0.0,1.0,1.0,0.0,0.453608,0.155172,1.0,21.0,1.0,97.0,0.458991
495,640.0,0.0,1.0,1.0,0.0,0.453608,0.236025,1.0,21.0,1.0,97.0,0.454396
495,692.0,0.0,1.0,1.0,0.0,0.453608,0.236025,1.0,21.0,1.0,97.0,0.457615
495,762.0,0.0,1.0,1.0,0.0,0.453608,0.236025,1.0,21.0,1.0,97.0,0.449090
...,...,...,...,...,...,...,...,...,...,...,...,...
141788,7237.0,0.0,0.0,0.0,1.0,0.103321,0.217807,1.0,18.0,3.0,271.0,0.103645
141788,7241.0,0.0,0.0,0.0,1.0,0.103321,0.217807,1.0,18.0,3.0,271.0,0.117769
141788,7272.0,0.0,0.0,0.0,1.0,0.103321,0.217807,1.0,18.0,3.0,271.0,0.104872
141788,7283.0,0.0,0.0,0.0,1.0,0.103321,0.217807,1.0,18.0,3.0,271.0,0.109030


'Склейка: '

Unnamed: 0_level_0,post_id,country_Turkey,country_Ukraine,os_iOS,source_organic,city,topic,gender,age,exp_group,userViews,userMeans,like_target
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
495,237.0,0.0,1.0,1.0,0.0,0.453608,0.155172,1.0,21.0,1.0,97.0,0.459908,0
495,285.0,0.0,1.0,1.0,0.0,0.453608,0.155172,1.0,21.0,1.0,97.0,0.458991,0
495,640.0,0.0,1.0,1.0,0.0,0.453608,0.236025,1.0,21.0,1.0,97.0,0.454396,1
495,692.0,0.0,1.0,1.0,0.0,0.453608,0.236025,1.0,21.0,1.0,97.0,0.457615,1
495,762.0,0.0,1.0,1.0,0.0,0.453608,0.236025,1.0,21.0,1.0,97.0,0.449090,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
141788,7237.0,0.0,0.0,0.0,1.0,0.103321,0.217807,1.0,18.0,3.0,271.0,0.103645,0
141788,7241.0,0.0,0.0,0.0,1.0,0.103321,0.217807,1.0,18.0,3.0,271.0,0.117769,0
141788,7272.0,0.0,0.0,0.0,1.0,0.103321,0.217807,1.0,18.0,3.0,271.0,0.104872,0
141788,7283.0,0.0,0.0,0.0,1.0,0.103321,0.217807,1.0,18.0,3.0,271.0,0.109030,0
