In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
history=pd.read_csv('https://drive.google.com/uc?export=download&id=1SmFlAHKnaqPy-lp8H3ZPegqQpz6C6gUq', sep='\t')
users=pd.read_csv('https://drive.google.com/uc?export=download&id=16UVDTOX6Gli4amZnDsRrow11ywI-WHl2', sep='\t')
validate_answers=pd.read_csv('https://drive.google.com/uc?export=download&id=1Nz-z3heM8zhUYahcpcsQ7wXxYUo-Ud2B', sep='\t')
validate=pd.read_csv('https://drive.google.com/uc?export=download&id=121sJ7OJ-ZXKAElaXiNHNsuZ49Qmbd2ki', sep='\t')

In [3]:
def calculate_mean_age(user_ids_str, users_df):
    if pd.isna(user_ids_str) or user_ids_str == '':
        return np.nan

    user_ids = [int(uid) for uid in user_ids_str.split(',') if uid.strip().isdigit()]

    ages = users_df[users_df['user_id'].isin(user_ids)]['age']

    return ages.mean() if not ages.empty else np.nan

In [4]:
user_features = history.groupby('user_id').agg({
    'cpm': ['mean', 'median'],
    'hour': 'count',
    'publisher': pd.Series.nunique
})
user_features.columns = ['avg_cpm', 'median_cpm', 'total_views', 'unique_publishers']
user_features = user_features.reset_index()

In [5]:
users_processed = pd.merge(users, user_features, on='user_id', how='left')
users_processed.fillna({
    'avg_cpm': 0,
    'median_cpm': 0,
    'total_views': 0,
    'unique_publishers': 0
}, inplace=True)

In [6]:
users_processed.to_csv('users_processed.csv', sep='\t')

In [7]:
history_processed = history.copy()
history_processed['hour_mod_24'] = history_processed['hour'] % 24

In [8]:
def create_users_parametrs(user_ids):
    user_ = users[users['user_id'].isin(user_ids)]

    if user_.empty:
        return pd.Series({
            'average_age': 0,
            'percent_6_15': 0,
            'percent_16_25': 0,
            'percent_26_45': 0,
            'percent_46_60': 0,
            'percent_60_75': 0,
            'percent_70_90': 0,
            'percent_men': 0,
            'unique_cities': 0,
            'percent_cities_3_7_19_25': 0
        })

    average_age = user_['age'].mean()
    total_users = len(user_)
    percent_6_15 = (user_[(user_['age'] >= 6) & (user_['age'] <= 15)].shape[0] / total_users) * 100
    percent_16_25 = (user_[(user_['age'] >= 16) & (user_['age'] <= 25)].shape[0] / total_users) * 100
    percent_26_45 = (user_[(user_['age'] >= 26) & (user_['age'] <= 45)].shape[0] / total_users) * 100
    percent_46_60 = (user_[(user_['age'] >= 46) & (user_['age'] <= 60)].shape[0] / total_users) * 100
    percent_60_75 = (user_[(user_['age'] >= 60) & (user_['age'] <= 75)].shape[0] / total_users) * 100
    percent_70_90 = (user_[(user_['age'] >= 70) & (user_['age'] <= 90)].shape[0] / total_users) * 100

    percent_men = (user_[user_['sex'] == 1].shape[0] / total_users) * 100

    unique_cities = user_['city_id'].nunique()
    cities_of_interest = [3, 7, 19, 25]
    percent_big_cities = (user_[user_['city_id'].isin(cities_of_interest)].shape[0] / total_users) * 100

    return pd.Series({
        'average_age': average_age,
        'percent_6_15': percent_6_15,
        'percent_16_25': percent_16_25,
        'percent_26_45': percent_26_45,
        'percent_46_60': percent_46_60,
        'percent_60_75': percent_60_75,
        'percent_70_90': percent_70_90,
        'percent_men': percent_men,
        'unique_cities': unique_cities,
        'percent_cities_3_7_19_25': percent_big_cities
    })

In [9]:
validate_processed = validate.copy()
validate_processed['user_ids'] = validate_processed['user_ids'].apply(lambda x: eval(x))
params = validate_processed['user_ids'].apply(create_users_parametrs)
validate_processed = pd.concat([validate_processed, params], axis=1)

In [10]:
def create_publishers_parametrs(publishers):
    if isinstance(publishers, str):
        publishers = eval(publishers)
    elif isinstance(publishers, int):
        publishers = [publishers]

    filtered_history = history_processed[history_processed['publisher'].isin(publishers)]

    if filtered_history.empty:
        return pd.Series({
            'average_cpm': 0,
            'min_cpm': 0,
            'max_cpm': 0,
            'most_active_hour': None,
            'second_active_hour': None,
            'third_active_hour': None,
            'fourth_active_hour': None,
            'fifth_active_hour': None
        })

    average_cpm = filtered_history['cpm'].mean()
    min_cpm = filtered_history['cpm'].min()
    max_cpm = filtered_history['cpm'].max()

    active_hours = filtered_history['hour_mod_24'].value_counts().nlargest(5)

    most_active_hour = active_hours.index[0] if len(active_hours) > 0 else None
    second_active_hour = active_hours.index[1] if len(active_hours) > 1 else None
    third_active_hour = active_hours.index[2] if len(active_hours) > 2 else None
    fourth_active_hour = active_hours.index[3] if len(active_hours) > 3 else None
    fifth_active_hour = active_hours.index[4] if len(active_hours) > 4 else None

    return pd.Series({
        'average_cpm': average_cpm,
        'min_cpm': min_cpm,
        'max_cpm': max_cpm,
        'most_active_hour': most_active_hour,
        'second_active_hour': second_active_hour,
        'third_active_hour': third_active_hour,
        'fourth_active_hour': fourth_active_hour,
        'fifth_active_hour': fifth_active_hour
    })

In [11]:
validate_processed['publishers'] = validate_processed['publishers'].apply(lambda x: eval(x))
cpm_metrics = validate_processed['publishers'].apply(create_publishers_parametrs)
validate_processed = pd.concat([validate_processed, cpm_metrics], axis=1)

In [12]:
validate_processed.to_csv('validate_processed.csv', sep='\t')

In [13]:
history_processed.to_csv('history_processed.csv', sep='\t')

Ссылки на обновленные датасеты.

In [16]:
url_history_processed = "https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/ulaYp7kk5GVtHQ"
url_users_processed = "https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/AktYELbxh5oxKw"
url_validate_processed = 'https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/qHYejfzpLrhMEA'

In [24]:
def load_answers(answers_filename):
    return pd.read_csv(answers_filename, sep="\t")


def get_smoothed_log_mape_column_value(responses_column, answers_column, epsilon):
    return np.abs(np.log(
        (responses_column + epsilon)
        / (answers_column + epsilon)
    )).mean()


def get_smoothed_mean_log_accuracy_ratio(answers, responses, epsilon=0.005):
    log_accuracy_ratio_mean = np.array(
        [
            get_smoothed_log_mape_column_value(responses.at_least_one, answers.at_least_one, epsilon),
            get_smoothed_log_mape_column_value(responses.at_least_two, answers.at_least_two, epsilon),
            get_smoothed_log_mape_column_value(responses.at_least_three, answers.at_least_three, epsilon),
        ]
    ).mean()

    percentage_error = 100 * (np.exp(log_accuracy_ratio_mean) - 1)

    return percentage_error.round(
        decimals=2
    )

In [27]:


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train, y_test = train_test_split(validate_processed[['cpm', 'hour_start', 'hour_end', 'average_age', 'percent_6_15', 'percent_16_25', 'percent_26_45', 'percent_46_60', 'percent_60_75',
       'percent_70_90', 'percent_men', 'unique_cities',
       'percent_cities_3_7_19_25', 'average_cpm', 'min_cpm', 'max_cpm',
       'most_active_hour', 'second_active_hour', 'third_active_hour',
       'fourth_active_hour', 'fifth_active_hour']], validate_answers, test_size=0.3, random_state=42)

model_at_least_one = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    random_state=42
)

model_at_least_two = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    random_state=42
)

model_at_least_three = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    random_state=42
)

model_at_least_one.fit(X_train, y_train['at_least_one'])
model_at_least_two.fit(X_train, y_train['at_least_two'])
model_at_least_three.fit(X_train, y_train['at_least_three'])

pred_at_least_one = model_at_least_one.predict(X_train)
pred_at_least_two = model_at_least_two.predict(X_train)
pred_at_least_three = model_at_least_three.predict(X_train)


responses = pd.DataFrame({
    'at_least_one': pred_at_least_one,
    'at_least_two': pred_at_least_two,
    'at_least_three': pred_at_least_three
})



Используя только общие данные о выборке пользователей мы получили результат 327%.

In [28]:
print(get_smoothed_mean_log_accuracy_ratio(y_test, responses)) #

327.31
