In [1]:
import warnings

import numpy as np
import polars as pl
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

# データ読み込み

In [2]:
df_path = "../data/train_data.csv"
df = pl.read_csv(df_path, separator=",")

# 前処理

In [3]:
feature_hasher = FeatureHasher(n_features=2**18, input_type="string")
feature_hasher.fit(np.array(df.select(pl.all().cast(str))))

In [4]:
df_train, df_test = train_test_split(df, test_size=0.1)
df_train, df_valid = train_test_split(df_train, test_size=0.1)

In [5]:
feature_names = [
    "hour",
    "banner_pos",
    "site_id",
    "site_domain",
    "site_category",
    "app_id",
    "app_domain",
    "app_category",
    "device_id",
    "device_ip",
    "device_model",
    "device_type",
]
target_name = "click"

In [6]:
X_train = df_train[feature_names]
y_train = df_train[target_name]
X_test = df_test[feature_names]
y_test = df_test[target_name]
X_valid = df_valid[feature_names]
y_valid = df_valid[target_name]

In [7]:
def preprocess(df: pl.dataframe.frame.DataFrame):
    hashed_feature = feature_hasher.transform(np.array(df.select(pl.all().cast(str))))
    return hashed_feature

In [8]:
X_train_preprocessed = preprocess(X_train)
X_valid_preprocessed = preprocess(X_valid)
X_test_preprocessed = preprocess(X_test)

# 学習

In [9]:
sample_model = SGDClassifier(loss="log_loss", penalty="l2", random_state=42, alpha=1e-3)
sample_model.fit(X_train_preprocessed, y_train)
print("valid accuracy: {}".format(sample_model.score(X_valid_preprocessed, y_valid)))

valid accuracy: 0.8303514991617885


# ハイパラチューニング

In [10]:
def grid_search(X_train, y_train, X_valid, y_valid) -> float:
    best_score = 0.0
    best_alpha = 0.0
    for alpha in [100, 1e-6, 5e-6, 1e-5, 5e-5]:
        model = SGDClassifier(
            loss="log_loss", penalty="l2", random_state=42, alpha=alpha
        )
        model.fit(X_train, y_train)
        model_pred_probas = model.predict_proba(X_valid)[:, 1]
        score = roc_auc_score(y_valid, model_pred_probas)
        print(f"Grid Search| alpha: {alpha}, roc auc score on valid data: {score}")

        if score > best_score:
            best_score = score
            best_alpha = alpha
    return best_alpha

# 学習・評価

In [11]:
best_alpha = grid_search(X_train_preprocessed, y_train, X_test_preprocessed, y_test)
print("best alpha: {}".format(best_alpha))
best_model = SGDClassifier(
    loss="log_loss", penalty="l2", random_state=42, alpha=best_alpha
)
best_model.fit(X_train_preprocessed, y_train)

Grid Search| alpha: 100, roc auc score on valid data: 0.45242502459460965
Grid Search| alpha: 1e-06, roc auc score on valid data: 0.6887098502183274
Grid Search| alpha: 5e-06, roc auc score on valid data: 0.716780589298412
Grid Search| alpha: 1e-05, roc auc score on valid data: 0.7231088282234897
Grid Search| alpha: 5e-05, roc auc score on valid data: 0.7232240378709593
best alpha: 5e-05


In [12]:
print(
    "accuracy of best model: {}".format(best_model.score(X_valid_preprocessed, y_valid))
)

accuracy of best model: 0.8316706516063429


# 動作確認

In [13]:
print(
    "testデータ0番目のクリック確率",
    best_model.predict_proba(preprocess(X_test[0]))[0][1],
)

testデータ0番目のクリック確率 0.15412996335032883


In [14]:
print(y_test[0])

0
