In [1]:
import pandas as pd
import numpy as np

In [10]:
# All four raw tables are tab-separated files downloaded from Google Drive.

# `history`: later cells read its `hour`, `cpm`, `publisher` and `user_id` columns.
history = pd.read_csv(
    'https://drive.google.com/uc?export=download&id=1SmFlAHKnaqPy-lp8H3ZPegqQpz6C6gUq',
    sep='\t',
)

# `users`: later cells read its `user_id`, `sex`, `age` and `city_id` columns.
users = pd.read_csv(
    'https://drive.google.com/uc?export=download&id=16UVDTOX6Gli4amZnDsRrow11ywI-WHl2',
    sep='\t',
)

# `validate_answers`: concatenated column-wise with `validate` in the split cell.
validate_answers = pd.read_csv(
    'https://drive.google.com/uc?export=download&id=1Nz-z3heM8zhUYahcpcsQ7wXxYUo-Ud2B',
    sep='\t',
)

# `validate`: later cells read its `hour_start`, `hour_end`, `publishers` and `user_ids` columns.
validate = pd.read_csv(
    'https://drive.google.com/uc?export=download&id=121sJ7OJ-ZXKAElaXiNHNsuZ49Qmbd2ki',
    sep='\t',
)

In [11]:
# Hour value used to cut both the validation rows and the history into subsets.
SPLIT_HOUR = 1300

# Put every validation row next to its answers so both are filtered with one mask.
data = pd.concat([validate, validate_answers], axis=1)

# "train" rows start strictly after SPLIT_HOUR, "test" rows end at or before it.
# Rows that span the cut (hour_start <= SPLIT_HOUR < hour_end) land in neither subset.
train = data[data['hour_start'] > SPLIT_HOUR]
test = data[data['hour_end'] <= SPLIT_HOUR]

# Split the combined frame back into the original feature / answer column sets.
validate_train = train[validate.columns]
validate_answers_train = train[validate_answers.columns]
validate_test = test[validate.columns]
validate_answers_test = test[validate_answers.columns]

# History restricted to hours up to and including the cut.
history_train = history[history['hour'] <= SPLIT_HOUR]

In [12]:
def _user_cpm_stats(history, period):
    """Aggregate per-user CPM statistics and a view count for one slice of history.

    Args:
        history: DataFrame with at least ``user_id``, ``cpm`` and ``hour`` columns.
        period: Label inserted into the output column names, e.g. ``'night'``.

    Returns:
        DataFrame with one row per ``user_id`` and the columns
        ``mean_<period>_cpm``, ``median_<period>_cpm``, ``min_<period>_cpm``
        and ``total_views_<period>``.
    """
    stats = history.groupby('user_id').agg({
        'cpm': ['mean', 'median', 'min'],
        'hour': 'count',
    })
    # Flatten the two-level (column, aggregation) header produced by agg().
    stats.columns = [
        'mean_{}_cpm'.format(period),
        'median_{}_cpm'.format(period),
        'min_{}_cpm'.format(period),
        'total_views_{}'.format(period),
    ]
    return stats.reset_index()


def prepare_users_dataset(users, history):
    """Attach per-user viewing statistics computed from ``history`` to ``users``.

    For every user the function computes mean/median/min CPM, the number of
    views and the number of distinct publishers over the whole history, plus
    mean/median/min CPM and the number of views for four hour-of-day windows
    (``hour % 24``): night [0, 6), morning [6, 12), day [12, 18) and
    evening [18, 24).

    Args:
        users: DataFrame with a ``user_id`` column; all its columns are kept.
        history: DataFrame with ``user_id``, ``cpm``, ``hour`` and
            ``publisher`` columns.

    Returns:
        A new DataFrame with one row per row of ``users`` (left join on
        ``user_id``) followed by the feature columns. Every feature column,
        including ``min_cpm``, is 0 for users that have no matching history.
    """
    user_features = history.groupby('user_id').agg({
        'cpm': ['mean', 'median', 'min'],
        'hour': 'count',
        'publisher': 'nunique',
    })
    user_features.columns = ['mean_cpm', 'median_cpm', 'min_cpm', 'total_views', 'unique_publishers']
    user_features = user_features.reset_index()

    # Hour of day is computed once and reused by all four windows.
    hour_of_day = history['hour'] % 24
    period_masks = [
        ('night', hour_of_day < 6),
        ('morning', (hour_of_day >= 6) & (hour_of_day < 12)),
        ('day', (hour_of_day >= 12) & (hour_of_day < 18)),
        ('evening', hour_of_day >= 18),
    ]

    users_processed = pd.merge(users, user_features, on='user_id', how='left')
    feature_columns = [column for column in user_features.columns if column != 'user_id']

    for period, mask in period_masks:
        period_features = _user_cpm_stats(history[mask], period)
        users_processed = pd.merge(users_processed, period_features, on='user_id', how='left')
        feature_columns.extend(
            column for column in period_features.columns if column != 'user_id'
        )

    # The fill mapping is derived from the generated columns so that no feature
    # can be forgotten; only feature columns are filled, never the columns of `users`.
    return users_processed.fillna({column: 0 for column in feature_columns})

In [13]:
# Per-user features are computed from `history_train`, not from the full history.
users_train = prepare_users_dataset(users=users, history=history_train)

# Cache the feature table as a tab-separated file; with the default index=True
# the frame index is written as the first column.
users_train.to_csv('users_train.csv', sep='\t')

In [6]:
# assign() returns a new frame, so the object previously bound to `history_train`
# (a filtered selection of `history`) is not modified.
history_train = history_train.assign(hour_mod_24=history_train['hour'] % 24)

# Cache the augmented history as a tab-separated file (frame index included).
history_train.to_csv('history_train.csv', sep='\t')

In [None]:
# Inclusive age bands reported as percentages: (output name, lowest age, highest age).
# The 46-60 / 60-75 and 60-75 / 70-90 bands share their edge ages, as the names state.
_AGE_BANDS = (
    ('percent_6_15', 6, 15),
    ('percent_16_25', 16, 25),
    ('percent_26_45', 26, 45),
    ('percent_46_60', 46, 60),
    ('percent_60_75', 60, 75),
    ('percent_70_90', 70, 90),
)

# City ids counted by the `percent_cities_3_7_19_25` output.
_CITIES_OF_INTEREST = [3, 7, 19, 25]

# (output name, per-user column) pairs; each output is the mean of that column
# over the selected users.
_AVERAGED_USER_COLUMNS = (
    ('mean_users_mean_cpm', 'mean_cpm'),
    ('mean_users_median_cpm', 'median_cpm'),
    ('mean_users_min_cpm', 'min_cpm'),
    ('mean_total_views', 'total_views'),
    ('unique_publishers', 'unique_publishers'),
) + tuple(
    (output.format(period), column.format(period))
    for period in ('night', 'morning', 'day', 'evening')
    for output, column in (
        ('mean_users_mean_{}_cpm', 'mean_{}_cpm'),
        ('mean_users_median_{}_cpm', 'median_{}_cpm'),
        ('mean_users_min_{}_cpm', 'min_{}_cpm'),
        ('mean_users_total_views_{}', 'total_views_{}'),
    )
)

# Every output name, in the order in which it appears in the returned Series.
_USER_PARAM_NAMES = (
    ['average_age']
    + [name for name, _, _ in _AGE_BANDS]
    + ['percent_men', 'unique_cities', 'percent_cities_3_7_19_25']
    + [name for name, _ in _AVERAGED_USER_COLUMNS]
)


def create_users_parametrs(user_ids, users=None):
    """Summarise the users of one audience as a flat Series of features.

    Args:
        user_ids: List-like of user ids, or a single integer id.
        users: DataFrame with ``user_id``, ``age``, ``sex``, ``city_id`` and the
            per-user statistic columns listed in ``_AVERAGED_USER_COLUMNS``.
            When omitted, the module-level ``users_train`` frame is used, which
            keeps ``Series.apply(create_users_parametrs)`` calls working.

    Returns:
        pandas.Series indexed by ``_USER_PARAM_NAMES``: the mean age, the
        percentage of users in each age band, the percentage with ``sex == 1``,
        the number of distinct cities, the percentage living in cities
        3/7/19/25, and the mean of every per-user statistic. All values are 0
        when none of ``user_ids`` is present in ``users``.
    """
    if users is None:
        # Looked up at call time so that the notebook can rebind `users_train`.
        users = users_train
    if isinstance(user_ids, (int, np.integer)):
        # Series.isin() rejects scalars; treat a lone id as a one-element audience.
        user_ids = [user_ids]

    user_ = users[users['user_id'].isin(user_ids)]
    if user_.empty:
        return pd.Series(0, index=_USER_PARAM_NAMES)

    total_users = len(user_)
    age = user_['age']

    def percent(mask):
        """Share of the selected users for which `mask` is True, in percent."""
        return (int(mask.sum()) / total_users) * 100

    params = {'average_age': age.mean()}
    for name, youngest, oldest in _AGE_BANDS:
        params[name] = percent((age >= youngest) & (age <= oldest))
    params['percent_men'] = percent(user_['sex'] == 1)
    params['unique_cities'] = user_['city_id'].nunique()
    params['percent_cities_3_7_19_25'] = percent(user_['city_id'].isin(_CITIES_OF_INTEREST))
    for name, column in _AVERAGED_USER_COLUMNS:
        params[name] = user_[column].mean()

    return pd.Series(params)

In [None]:
# Work on an independent copy before adding or replacing columns.
validate_train = validate_train.copy()

# Each `user_ids` cell is stored as text and turned into a Python object here.
# NOTE(review): eval() executes whatever expression the cell contains; if the cells
# only ever hold Python literals, ast.literal_eval would be the safer parser — confirm.
validate_train['user_ids'] = validate_train['user_ids'].apply(
    lambda raw_ids: eval(raw_ids)
)

# One Series of audience features per row; apply() stacks them into a DataFrame.
params = validate_train['user_ids'].apply(create_users_parametrs)
validate_train = pd.concat([validate_train, params], axis='columns')

In [None]:
# Work on an independent copy before adding or replacing columns.
validate_test = validate_test.copy()

# Each `user_ids` cell is stored as text and turned into a Python object here.
# NOTE(review): eval() executes whatever expression the cell contains; if the cells
# only ever hold Python literals, ast.literal_eval would be the safer parser — confirm.
validate_test['user_ids'] = validate_test['user_ids'].apply(
    lambda raw_ids: eval(raw_ids)
)

# One Series of audience features per row; apply() stacks them into a DataFrame.
params = validate_test['user_ids'].apply(create_users_parametrs)
validate_test = pd.concat([validate_test, params], axis='columns')

In [None]:
# Output names for the hours of day ranked by number of history rows, busiest first.
_ACTIVE_HOUR_KEYS = (
    'most_active_hour',
    'second_active_hour',
    'third_active_hour',
    'fourth_active_hour',
    'fifth_active_hour',
)


def create_publishers_parametrs(publishers, history=None):
    """Summarise the history rows that belong to a set of publishers.

    Args:
        publishers: List-like of publisher ids, a single integer id, or a string
            that evaluates to either of those.
        history: DataFrame with ``publisher`` and ``cpm`` columns and either an
            ``hour_mod_24`` or an ``hour`` column. When omitted, the module-level
            ``history_train`` frame is used, which keeps
            ``Series.apply(create_publishers_parametrs)`` calls working.

    Returns:
        pandas.Series with ``average_cpm``, ``min_cpm``, ``max_cpm`` and the up
        to five most frequent hours of day (``most_active_hour`` ...
        ``fifth_active_hour``). Ranks that do not exist are ``None`` (stored as
        NaN by pandas). If no history row matches, the CPM values are 0 and all
        hours are missing.
    """
    if history is None:
        # Looked up at call time so that the notebook can rebind `history_train`.
        history = history_train

    if isinstance(publishers, str):
        # NOTE(review): eval() executes arbitrary expressions; kept for compatibility
        # with the stored format — prefer ast.literal_eval if values are plain literals.
        publishers = eval(publishers)
    # Checked after the string branch on purpose: a string such as "7" evaluates to a
    # bare int, which Series.isin() would reject unless it is wrapped in a list.
    if isinstance(publishers, (int, np.integer)):
        publishers = [publishers]

    filtered_history = history[history['publisher'].isin(publishers)]

    if filtered_history.empty:
        params = {'average_cpm': 0, 'min_cpm': 0, 'max_cpm': 0}
        params.update((key, None) for key in _ACTIVE_HOUR_KEYS)
        return pd.Series(params)

    cpm = filtered_history['cpm']

    # Use the precomputed hour-of-day column when present, otherwise derive it.
    if 'hour_mod_24' in filtered_history.columns:
        hour_of_day = filtered_history['hour_mod_24']
    else:
        hour_of_day = filtered_history['hour'] % 24

    ranked_hours = list(hour_of_day.value_counts().nlargest(len(_ACTIVE_HOUR_KEYS)).index)
    # Pad with None so that every rank key is always present in the result.
    ranked_hours += [None] * (len(_ACTIVE_HOUR_KEYS) - len(ranked_hours))

    params = {
        'average_cpm': cpm.mean(),
        'min_cpm': cpm.min(),
        'max_cpm': cpm.max(),
    }
    params.update(zip(_ACTIVE_HOUR_KEYS, ranked_hours))
    return pd.Series(params)

In [None]:
# Each `publishers` cell is stored as text and turned into a Python object here.
# NOTE(review): eval() executes whatever expression the cell contains; if the cells
# only ever hold Python literals, ast.literal_eval would be the safer parser — confirm.
validate_train['publishers'] = validate_train['publishers'].apply(
    lambda raw_publishers: eval(raw_publishers)
)

# One Series of publisher features per row; apply() stacks them into a DataFrame.
cpm_metrics = validate_train['publishers'].apply(create_publishers_parametrs)
validate_train = pd.concat([validate_train, cpm_metrics], axis='columns')

In [None]:
# Each `publishers` cell is stored as text and turned into a Python object here.
# NOTE(review): eval() executes whatever expression the cell contains; if the cells
# only ever hold Python literals, ast.literal_eval would be the safer parser — confirm.
validate_test['publishers'] = validate_test['publishers'].apply(
    lambda raw_publishers: eval(raw_publishers)
)

# One Series of publisher features per row; apply() stacks them into a DataFrame.
cpm_metrics = validate_test['publishers'].apply(create_publishers_parametrs)
validate_test = pd.concat([validate_test, cpm_metrics], axis='columns')

In [None]:
# Persist the engineered splits as tab-separated files in the working directory.
# to_csv() is left at its default index=True, so every file also carries the frame
# index as its first column.
for csv_name, frame in (
    ('validate_train.csv', validate_train),
    ('validate_test.csv', validate_test),
    ('validate_answers_train.csv', validate_answers_train),
    ('validate_answers_test.csv', validate_answers_test),
):
    frame.to_csv(csv_name, sep='\t')

In [14]:
# Download locations of the prepared tab-separated datasets.
validate_train_url = "https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/DZW9I4MwAJrl_A"
validate_test_url = "https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/8nSFbNQY92HCng"
validate_answers_train_url = 'https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/UsSATGKzLrhBFQ'
# Named with the same `_url` suffix as its siblings, so `validate_answers_test`
# only ever refers to the loaded DataFrame and never to the address string.
validate_answers_test_url = 'https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/iu8jIJk1C15mww'
history_train_url = 'https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/BkZWvVVDjfB1rw'
users_train_url = 'https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/SEz-05NG0vpkKA'

# These reads rebind the names below, replacing whatever earlier cells assigned to them.
# NOTE(review): if the hosted files were written by this notebook's to_csv calls, they
# carry the saved index as a leading unnamed column — confirm whether index_col=0 is wanted.
validate_train = pd.read_csv(validate_train_url, sep='\t')
validate_test = pd.read_csv(validate_test_url, sep='\t')
validate_answers_train = pd.read_csv(validate_answers_train_url, sep='\t')
validate_answers_test = pd.read_csv(validate_answers_test_url, sep='\t')
history_train = pd.read_csv(history_train_url, sep='\t')
users_train = pd.read_csv(users_train_url, sep='\t')
