In [None]:
import os
import random

from joblib import Parallel, delayed
import numpy as np
import pandas as pd
# from tqdm.auto import tqdm

In [None]:
# Project root: the "atmacup16" folder inside the current user's home directory.
DIRECTORY = os.path.expanduser(os.path.join("~", "atmacup16"))

In [None]:
%%time
# 都道府県ごとのランキング
label = pd.read_csv(os.path.join(DIRECTORY, "input", "train_label.csv"))
yado = pd.read_csv(os.path.join(DIRECTORY, "input", "yado.csv"))
count_by_prefecture = pd.merge(label, yado).groupby("ken_cd")["yad_no"].value_counts().sort_values(ascending=False)
count_by_prefecture

In [None]:
%%time
log_test = pd.read_csv(os.path.join(DIRECTORY, "input", "test_log.csv"))
log_test

In [None]:
# Precomputed co-occurrence ratios; get_prediction reads row (yad_no - 1) for a given yad_no.
cooccurrence_path = os.path.join(DIRECTORY, "features", "cooccurance_rate.npy")
co_occurance_rate = np.load(cooccurrence_path)
co_occurance_rate.shape

In [None]:
# 1-based lodging numbers, one per row of the co-occurrence matrix.
yad_numbers = list(range(1, len(co_occurance_rate) + 1))

In [None]:
K = 10  # number of predictions emitted per session


def get_prediction(session_id: str, session_df: pd.DataFrame) -> dict:
    """Build up to K distinct lodging predictions for one session.

    Candidates are collected from four sources, in priority order, until K
    are available:

    1. Lodgings viewed in the session, in ``seq_no`` order.
    2. Lodgings with a positive co-occurrence ratio with any viewed lodging,
       highest ratio first.
    3. The label-count ranking of the prefecture seen most often in the
       session.
    4. A random fill, seeded by ``session_id`` so reruns and parallel workers
       give the same answer.

    The last-viewed lodging is never proposed, and no lodging is proposed
    twice.

    Reads the notebook-level globals ``K``, ``yad_numbers``,
    ``co_occurance_rate``, ``yado`` and ``count_by_prefecture``.

    Args:
        session_id: Identifier of the session; stored in the result and used
            as the seed of the random fill.
        session_df: Log rows of that session, with at least ``seq_no`` and
            ``yad_no`` columns. The frame is not modified.

    Returns:
        A dict with ``"session_id"`` plus ``"predict_0"`` .. ``"predict_{K-1}"``
        (fewer keys only if the whole lodging pool is smaller than K).
    """
    prediction = {"session_id": session_id}
    # Sort into a new frame: the argument is a groupby slice owned by the caller.
    session_df = session_df.sort_values("seq_no")

    # Lodgings viewed in the session are candidates, except the last one viewed.
    yad_no_last = session_df["yad_no"].iloc[-1]
    yad_numbers_in_session = session_df["yad_no"].unique().tolist()
    candidates = [no for no in yad_numbers_in_session if no != yad_no_last]

    # Take lodgings with a high co-occurrence ratio from the co-occurrence matrix.
    # Only built when needed; sessions that already have K candidates skip it.
    if len(candidates) < K:
        rate_df = pd.concat(
            [
                pd.DataFrame({"yad_no": yad_numbers, "ratio": co_occurance_rate[yad_no - 1]})
                for yad_no in yad_numbers_in_session
            ],
            ignore_index=True,
        ).sort_values("ratio", ascending=False)
        excluded = candidates + [yad_no_last]
        keep = (rate_df["ratio"] > 0) & ~rate_df["yad_no"].isin(excluded)
        co_visited = rate_df.loc[keep, "yad_no"].drop_duplicates().tolist()
        candidates += co_visited[: K - len(candidates)]

    # Fall back to the popularity ranking of the session's dominant prefecture.
    if len(candidates) < K:
        session_with_ken = pd.merge(session_df, yado[["yad_no", "ken_cd"]])
        prefecture_modes = session_with_ken["ken_cd"].mode()
        ranking = []
        if not prefecture_modes.empty:
            try:
                ranking = count_by_prefecture.loc[prefecture_modes.iloc[0]].index.tolist()
            except KeyError:
                # Prefecture never appears in the ranking: leave it to the random fill.
                ranking = []
        for yad_no in ranking:
            if len(candidates) >= K:
                break
            # The ranking knows nothing about this session, so filter out the
            # last-viewed lodging and anything already proposed.
            if yad_no != yad_no_last and yad_no not in candidates:
                candidates.append(yad_no)

    # Last resort: random lodgings. Sampling without replacement from the
    # remaining pool always terminates, unlike retrying random.choice.
    if len(candidates) < K:
        rng = random.Random(str(session_id))
        pool = [no for no in yad_numbers if no != yad_no_last and no not in candidates]
        candidates += rng.sample(pool, min(K - len(candidates), len(pool)))

    for i, c in enumerate(candidates[:K]):
        prediction[f"predict_{i}"] = c

    return prediction

In [None]:
# Run get_prediction once per test session across 7 worker jobs, then collect
# the resulting dicts into a frame indexed and ordered by session_id.
session_groups = log_test.groupby("session_id")
prediction_tasks = (
    delayed(get_prediction)(session_id, session_rows)
    for session_id, session_rows in session_groups
)
predictions = Parallel(n_jobs=7, verbose=1)(prediction_tasks)
predictions = pd.DataFrame(predictions).set_index("session_id").sort_index()
predictions

In [None]:
%%time
test_session = pd.read_csv(os.path.join(DIRECTORY, "input", "test_session.csv"))
test_session

In [None]:
# Copy the prediction columns onto test_session, looked up by session_id so the
# values follow test_session's row order; raw values are assigned to bypass
# index alignment.
predict_columns = predictions.columns
ordered_predictions = predictions.loc[test_session["session_id"], predict_columns]
test_session[predict_columns] = ordered_predictions.to_numpy()
test_session

In [None]:
# Write the submission file: every column except session_id, without the index.
submission_path = os.path.join(DIRECTORY, "submissions", "exp003.csv")
test_session.drop(columns=["session_id"]).to_csv(submission_path, index=False)