In [1]:
import pandas as pd
import numpy as np

In [2]:
validate_train_url = "https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/DZW9I4MwAJrl_A"
validate_test_url = "https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/8nSFbNQY92HCng"
validate_answers_train_url = 'https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/UsSATGKzLrhBFQ'
validate_answers_test = 'https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/iu8jIJk1C15mww'
history_train_url = 'https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/BkZWvVVDjfB1rw'
users_train_url = 'https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/SEz-05NG0vpkKA'
validate_train = pd.read_csv(validate_train_url, sep='\t')
validate_test = pd.read_csv(validate_test_url, sep='\t')
validate_answers_train = pd.read_csv(validate_answers_train_url, sep='\t')
validate_answers_test = pd.read_csv(validate_answers_test, sep='\t')
history_train = pd.read_csv(history_train_url, sep='\t')
users_train = pd.read_csv(users_train_url, sep='\t')


In [3]:
usefull_columns = ['cpm', 'hour_start', 'hour_end', 'audience_size', 'average_age', 'percent_6_15',
       'percent_16_25', 'percent_26_45', 'percent_46_60', 'percent_60_75',
       'percent_70_90', 'percent_men', 'unique_cities',
       'percent_cities_3_7_19_25', 'mean_users_mean_cpm',
       'mean_users_median_cpm', 'mean_users_min_cpm', 'mean_total_views',
       'unique_publishers', 'mean_users_mean_night_cpm',
       'mean_users_median_night_cpm', 'mean_users_min_night_cpm',
       'mean_users_total_views_night', 'mean_users_mean_morning_cpm',
       'mean_users_median_morning_cpm', 'mean_users_min_morning_cpm',
       'mean_users_total_views_morning', 'mean_users_mean_day_cpm',
       'mean_users_median_day_cpm', 'mean_users_min_day_cpm',
       'mean_users_total_views_day', 'mean_users_mean_evening_cpm',
       'mean_users_median_evening_cpm', 'mean_users_min_evening_cpm',
       'mean_users_total_views_evening', 'average_cpm', 'min_cpm', 'max_cpm',
       'most_active_hour', 'second_active_hour', 'third_active_hour',
       'fourth_active_hour', 'fifth_active_hour']

In [4]:
from typing import Tuple
from sklearn.metrics import make_scorer

def load_answers(answers_filename):
    return pd.read_csv(answers_filename, sep="\t")


def get_smoothed_log_mape_column_value(responses_column, answers_column, epsilon):
    return np.abs(np.log(
        (responses_column + epsilon)
        / (answers_column + epsilon)
    )).mean()


def get_smoothed_mean_log_accuracy_ratio(answers, responses, epsilon=0.005):
    log_accuracy_ratio_mean = np.array(
        [
            get_smoothed_log_mape_column_value(responses.at_least_one, answers.at_least_one, epsilon),
            get_smoothed_log_mape_column_value(responses.at_least_two, answers.at_least_two, epsilon),
            get_smoothed_log_mape_column_value(responses.at_least_three, answers.at_least_three, epsilon),
        ]
    ).mean()

    percentage_error = 100 * (np.exp(log_accuracy_ratio_mean) - 1)

    return percentage_error.round(
        decimals=2
    )

def custom_scorer(y_true, y_pred):
    responses = pd.DataFrame(y_pred, columns=['at_least_one', 'at_least_two', 'at_least_three'])
    answers = y_true.reset_index(drop=True)
    return get_smoothed_mean_log_accuracy_ratio(answers, responses)

custom_scorer_func = make_scorer(custom_scorer, greater_is_better=False)

Начнем эксперементы с 3-х одинаковых RandomForestRegressor.

In [5]:
def make_xy(X_data: pd.DataFrame, y_data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    X = X_data[usefull_columns]
    y = y_data[['at_least_one', 'at_least_two', 'at_least_three']]
    return X, y


In [16]:
import warnings
from sklearn.linear_model import LinearRegression

warnings.filterwarnings("ignore", category=RuntimeWarning)

X_train, y_train = make_xy(validate_train, validate_answers_train)
X_test, y_test = make_xy(validate_test, validate_answers_test)
model = LinearRegression()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

responses_train = pd.DataFrame(y_pred_train, columns=['at_least_one', 'at_least_two', 'at_least_three'])
answers_train = y_train.reset_index(drop=True)
responses_test = pd.DataFrame(y_pred_test, columns=['at_least_one', 'at_least_two', 'at_least_three'])
answers_test = y_test.reset_index(drop=True)

percentage_error_train = get_smoothed_mean_log_accuracy_ratio(answers_train, responses_train)
percentage_error_test = get_smoothed_mean_log_accuracy_ratio(answers_test, responses_test)
print(f'Percentage Error on Train: {percentage_error_train}%')
print(f'Percentage Error on Test: {percentage_error_test}%')

Percentage Error on Train: 153.67%
Percentage Error on Test: 192.56%


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import pandas as pd
import numpy as np

warnings.filterwarnings("ignore", category=RuntimeWarning)
custom_scorer_func = make_scorer(get_smoothed_mean_log_accuracy_ratio,
                                greater_is_better=False)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

models = {
    'Ridge Regression': (Ridge(), {'alpha': np.logspace(-5, 3, 30), 'max_iter': [1000, 5000, 10000]}),
    'Lasso Regression': (Lasso(), {'alpha': np.logspace(-5, 3, 30), 'max_iter': [1000, 5000, 10000]})
}

error_res = {}

for model_name, (model, params) in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring=custom_scorer_func,
        cv=5,
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train_scaled, y_train)
    best_model = grid_search.best_estimator_

    y_pred_train = best_model.predict(X_train_scaled)
    y_pred_test = best_model.predict(X_test_scaled)
    responses_train = pd.DataFrame(y_pred_train, columns=['at_least_one', 'at_least_two', 'at_least_three'])
    answers_train = pd.DataFrame(y_train.reset_index(drop=True), columns=['at_least_one', 'at_least_two', 'at_least_three'])

    responses_test = pd.DataFrame(y_pred_test, columns=['at_least_one', 'at_least_two', 'at_least_three'])
    answers_test = pd.DataFrame(y_test.reset_index(drop=True), columns=['at_least_one', 'at_least_two', 'at_least_three'])

    train_error = get_smoothed_mean_log_accuracy_ratio(answers_train, responses_train)
    test_error = get_smoothed_mean_log_accuracy_ratio(answers_test, responses_test)

    error_res[model_name] = {
        'best_params': grid_search.best_params_,
        'train_error': train_error,
        'test_error': test_error,
        'model': best_model
    }

In [18]:
for model_name, errors in error_res.items():
    print(f"\n{model_name}:")
    print(f"Best Parameters: {errors['best_params']}")
    print(f"Train Error: {errors['train_error']:.2f}%")
    print(f"Test Error: {errors['test_error']:.2f}%")


Ridge Regression:
Best Parameters: {'alpha': np.float64(1e-05), 'max_iter': 1000}
Train Error: 153.66%
Test Error: 192.56%

Lasso Regression:
Best Parameters: {'alpha': np.float64(1e-05), 'max_iter': 1000}
Train Error: 150.40%
Test Error: 190.87%


In [36]:
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=3,
    min_samples_split=5,
    min_samples_leaf=2
)
y_pred_train = pd.DataFrame()
y_pred_test = pd.DataFrame()

for column in y_train.columns:
    model.fit(X_train, y_train[column])
    y_pred_train[column] = model.predict(X_train)
    y_pred_test[column] = model.predict(X_test)

responses_train = pd.DataFrame(y_pred_train, columns=['at_least_one', 'at_least_two', 'at_least_three'])
answers_train = y_train.reset_index(drop=True)
responses_test = pd.DataFrame(y_pred_test, columns=['at_least_one', 'at_least_two', 'at_least_three'])
answers_test = y_test.reset_index(drop=True)

percentage_error_train = get_smoothed_mean_log_accuracy_ratio(answers_train, responses_train)
percentage_error_test = get_smoothed_mean_log_accuracy_ratio(answers_test, responses_test)
print(f'Percentage Error on Train: {percentage_error_train}%')
print(f'Percentage Error on Test: {percentage_error_test}%')

Percentage Error on Train: 184.16%
Percentage Error on Test: 283.6%


In [39]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

param_grid = [
    {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [1, 2, 3],
    },
    {
        'n_estimators': [50, 100],
        'max_depth': [2, 3],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 3, 5],
        'subsample': [0.8, 0.9, 1.0],
    },
    {
        'n_estimators': [200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [2, 3],
        'validation_fraction': [0.1, 0.2],
        'n_iter_no_change': [5, 10],
        'tol': [1e-4],
    }
]

model = GradientBoostingRegressor(random_state=42)

best_models = {}
for column in y_train.columns:
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=custom_scorer_func,
        cv=5,
        n_jobs=-1,
        verbose=2
    )
    grid_search.fit(X_train, y_train[column])
    best_models[column] = grid_search.best_estimator_
    print(f"Лучшие параметры для {column}: {grid_search.best_params_}")

y_pred_test = pd.DataFrame()
for column, model in best_models.items():
    y_pred_test[column] = model.predict(X_test)

percentage_error_test = get_smoothed_mean_log_accuracy_ratio(y_test, y_pred_test)
print(f'Percentage Error on Test: {percentage_error_test}%')

Fitting 5 folds for each of 151 candidates, totalling 755 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan]


Лучшие параметры для at_least_one: {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 50}
Fitting 5 folds for each of 151 candidates, totalling 755 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan]


Лучшие параметры для at_least_two: {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 50}
Fitting 5 folds for each of 151 candidates, totalling 755 fits
Лучшие параметры для at_least_three: {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 50}
Percentage Error on Test: 248.21%


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan]


In [40]:
print(f'Percentage Error on Test: {percentage_error_test}%')

Percentage Error on Test: 248.21%


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


warnings.filterwarnings("ignore", category=RuntimeWarning)
custom_scorer_func = make_scorer(get_smoothed_mean_log_accuracy_ratio,
                                greater_is_better=False)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

models = {
        'Decision Tree': (DecisionTreeRegressor(), {
            'max_depth': [1, 2, 3, 5, 7],
            'min_samples_split': [2, 5, 10, 20],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['sqrt', 'log2', None]
        }),
        'Random Forest': (RandomForestRegressor(), {
            'n_estimators': [50, 100, 150, 200],
            'max_depth': [3, 5],
            'min_samples_split': [2, 5, 10],
            #'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', 0.8, None],
            'bootstrap': [True, False]
        })
}

error_res = {}

for model_name, (model, params) in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring=custom_scorer_func,
        cv=5,
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train_scaled, y_train)

    best_model = grid_search.best_estimator_

    y_pred_train = best_model.predict(X_train_scaled)
    y_pred_test = best_model.predict(X_test_scaled)

    responses_train = pd.DataFrame(y_pred_train, columns=['at_least_one', 'at_least_two', 'at_least_three'])
    answers_train = pd.DataFrame(y_train.reset_index(drop=True), columns=['at_least_one', 'at_least_two', 'at_least_three'])

    responses_test = pd.DataFrame(y_pred_test, columns=['at_least_one', 'at_least_two', 'at_least_three'])
    answers_test = pd.DataFrame(y_test.reset_index(drop=True), columns=['at_least_one', 'at_least_two', 'at_least_three'])

    train_error = get_smoothed_mean_log_accuracy_ratio(answers_train, responses_train)
    test_error = get_smoothed_mean_log_accuracy_ratio(answers_test, responses_test)

    error_res[model_name] = {
        'best_params': grid_search.best_params_,
        'train_error': train_error,
        'test_error': test_error,
        'model': best_model
    }

In [None]:
for model_name, errors in error_res.items():
    print(f"\n{model_name}:")
    print(f"Best Parameters: {errors['best_params']}")
    print(f"Train Error: {errors['train_error']:.2f}%")
    print(f"Test Error: {errors['test_error']:.2f}%")


Decision Tree:
Best Parameters: {'max_depth': 1, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
Train Error: 227.65%
Test Error: 247.27%

Random Forest:
Best Parameters: {'bootstrap': True, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 50}
Train Error: 152.90%
Test Error: 344.42%


К сожалению я столкнулся с сильным переобучением и пока не понял как от него избавится.