クエリセットの用意
クエリに対するドキュメントセットの用意
スコアのゲット

In [None]:
# 準備
!pip install pandas requests janome
!wget http://download.joachims.org/svm_rank/current/svm_rank_linux64.tar.gz
!tar -zxvf svm_rank_linux64.tar.gz 


In [1]:
import pandas as pd
pd.set_option('display.max_rows', 200)
import requests
import random
random.seed(0)

from janome.tokenizer import Tokenizer
tokenizer = Tokenizer()


def token_diversificate(token):
    rand = random.random()
    if rand < 0.7:
        return token.surface
    elif rand < 0.80:
        return token.reading
    elif rand < 0.90:
        return token.surface[:len(token.surface)-1]
    else:
        return token.surface[1:]


def query_diversificate(query):
    return "".join([token_diversificate(token) for token in tokenizer.tokenize(query)])
    
solr_endpoint="http://localhost:8983/solr/fukuoka/select"

In [2]:
# Solrから全件取得することで、idと施設名の対応を取得する
# 今回の例では施設が100件に満たないためrows=100でリクエストを送っている

r = requests.get(solr_endpoint, params={"q":"*:*", "rows":"100"})
dict_all_spots = r.json()["response"]["docs"]
df_all_spots = pd.DataFrame.from_dict(dict_all_spots)

# nameに誤変換や脱字を加えることで揺らしたクエリを作成する
df_all_spots["query"] = df_all_spots["name"].map(query_diversificate)

# 学習時に必要となるリクエストごとの通し番号`qid`を振る
df_all_spots["qid"] = range(len(df_all_spots))

# qidに対する施設のidが正例となるので、ここで結び付けて置く、のちに特徴量テーブルとjoinする
df_query2id_poslabel = df_all_spots[["qid", "id"]].copy()
df_query2id_poslabel["label"]=1

In [3]:
# requestsを送ってid2scoreをgetする関数

def get_query2scores(query, field):
    payload = {
        "q":f"{field}:({query})",
        "fl":"id,score",
        "rows":"100"
    }
    r = requests.get(solr_endpoint, params=payload)
    dict_query2scores = r.json()["response"]["docs"]
    return pd.DataFrame.from_dict(dict_query2scores).rename(columns={"score":f"{field}_score"})

def get_query2scores_with_qid(query, field, qid):
    query2scores = get_query2scores(query, field)
    query2scores["qid"] = qid
    return query2scores

In [4]:
# 各フィールドでのスコアを取得
scores_name = pd.concat([get_query2scores_with_qid(query, "name", qid) for query,qid in zip(df_all_spots["query"], df_all_spots["qid"])])
scores_name_yomi = pd.concat([get_query2scores_with_qid(query, "name_yomi", qid) for query,qid in zip(df_all_spots["query"], df_all_spots["qid"])])
scores_name_ngram  = pd.concat([get_query2scores_with_qid(query, "name_ngram", qid) for query,qid in zip(df_all_spots["query"], df_all_spots["qid"])])

In [5]:
# scoreをjoinする、あるフィールドへの検索で出たがあるフィールドで出なかったアイテムはSolr上でスコア0となるため、outer joinして0埋めしている

scores = scores_name.merge(scores_name_yomi, how="outer", on=["qid", "id"]).merge(scores_name_ngram, how="outer", on=["qid", "id"]).fillna(0)

In [6]:
# qidに対する正解施設IDをjoinし、それ以外の施設はラベルを0埋めする
scores_with_label = scores.merge(df_query2id_poslabel, on=["qid", "id"],how="left").fillna(0)
scores_with_label["label"] = scores_with_label["label"].astype("int")

In [7]:
# pair-wise学習では同じqid内の異なるラベル同士でのスコア差を用いて学習する
# 1つのqidに対して1つのlabelしか結び付いていない場合は学習にも評価にも使用できない
# qid2unique_label_cntそのためデータから除いている

qid2unique_label_cnt = scores_with_label.groupby("qid")["label"].nunique().reset_index()
qid_only_same_label = qid2unique_label_cnt[qid2unique_label_cnt["label"] == 1]["qid"]
scores_dropped_only_same_label = scores_with_label[~scores_with_label["qid"].isin(qid_only_same_label)].sort_values("qid")

In [8]:
# svm_rank用に特徴量を`通し番号:数値`の形式に書き換えている

train_qid = scores_dropped_only_same_label["qid"].drop_duplicates().sample(frac=0.5, random_state=0)
df_train = scores_dropped_only_same_label[scores_dropped_only_same_label["qid"].isin(train_qid)]
df_train4svm_rank = df_train[["label"]].copy()
df_train4svm_rank["qid"] = df_train["qid"].map(lambda x:f"qid:{x}")
df_train4svm_rank["name_score"] = df_train["name_score"].map(lambda x:f"1:{x}")
df_train4svm_rank["name_yomi_score"] = df_train["name_yomi_score"].map(lambda x:f"2:{x}")
df_train4svm_rank["name_ngram_score"] = df_train["name_ngram_score"].map(lambda x:f"3:{x}")
df_train4svm_rank.to_csv("./train.dat", index=False, header=False, sep=" ")
df_test = scores_dropped_only_same_label[~scores_dropped_only_same_label["qid"].isin(train_qid)].copy()
df_test.to_csv("./test.tsv", index=False, sep="\t")

In [9]:
!./svm_rank_learn -c 3 train.dat
!cat svm_struct_model

Reading training examples...done
Training set properties: 3 features, 42 rankings, 844 examples
NOTE: Adjusted stopping criterion relative to maximum loss: eps=0.019095
Iter 1: .........*(NumConst=1, SV=1, CEps=19.0952, QPEps=0.0000)
Iter 2: .........*(NumConst=2, SV=1, CEps=4.6874, QPEps=0.0000)
Iter 3: .........*(NumConst=3, SV=1, CEps=2.7067, QPEps=0.0000)
Iter 4: .........*(NumConst=4, SV=1, CEps=1.4227, QPEps=0.6373)
Iter 5: .........*(NumConst=5, SV=2, CEps=1.3502, QPEps=0.0000)
Iter 6: .........*(NumConst=6, SV=3, CEps=0.5485, QPEps=0.0665)
Iter 7: .........*(NumConst=7, SV=3, CEps=0.1737, QPEps=0.0162)
Iter 8: .........*(NumConst=8, SV=3, CEps=0.0536, QPEps=0.0087)
Iter 9: .........*(NumConst=9, SV=2, CEps=0.0229, QPEps=0.0000)
Iter 10: .........(NumConst=9, SV=2, CEps=0.0124, QPEps=0.0000)
Final epsilon on KKT-Conditions: 0.01235
Upper bound on duality gap: 0.03702
Dual objective value: dval=4.57710
Primal objective value: pval=4.61412
Total number of constraints in final work

In [10]:
df_test = pd.read_csv("./test.tsv", sep="\t")

In [11]:
# svm rank結果の取得
# Solrの仕様でscoreの係数をマイナスにすることはできないので、マイナスの場合は0に置き換えている

with open("svm_struct_model") as f:
    svm_rank_result = f.readlines()[-1].split()

coef_name_score = float(svm_rank_result[1].split(":")[1]) if float(svm_rank_result[1].split(":")[1]) > 0 else 0 
coef_name_yomi_score = float(svm_rank_result[2].split(":")[1]) if float(svm_rank_result[2].split(":")[1]) > 0 else 0
coef_name_ngram_score = float(svm_rank_result[3].split(":")[1]) if float(svm_rank_result[3].split(":")[1]) > 0 else 0


In [12]:
df_test["ml_score"] = coef_name_score* df_test["name_score"] + coef_name_yomi_score*df_test["name_yomi_score"] + coef_name_ngram_score*df_test["name_ngram_score"]

In [13]:
df4eval = df_test.groupby("qid").idxmax().reset_index()
for field in ["name_score", "name_yomi_score", "name_ngram_score", "ml_score"]:
    recall = len(df4eval[df4eval["label"] == df4eval[field]])/len(df4eval)
    print(f"{field}ソートのrecall@1: {int(recall*100)}%")

name_scoreソートのrecall@1: 71%
name_yomi_scoreソートのrecall@1: 76%
name_ngram_scoreソートのrecall@1: 78%
ml_scoreソートのrecall@1: 90%


In [156]:
# 試しに投げてみる

def ml_ranked_search(query):
    payload = {
        "q":f"name:({query})^{coef_name_score} OR name_yomi:({query})^{coef_name_yomi_score} OR name_ngram:(query)^{coef_name_ngram_score}",
        "fl":"*,score",
        "rows":"100"
    }
    r = requests.get(solr_endpoint, params=payload)
    return r.json()["response"]["docs"]

In [157]:
ml_ranked_search("福岡こくさい")

[{'name': '福岡市民体育館',
  'name_kana': 'フクオカシミンタイイクカン',
  'address': '福岡市博多区東公園8-2',
  'latitude': 33.603546,
  'longitude': 130.41652,
  'geo_p': '33.603548,130.416512',
  'phone_number': '(092)641-9135',
  'category': 'sports',
  'id': '55ed5591-a889-4ae6-abec-5428d564c2ac',
  '_version_': 1672992268343050240,
  'score': 0.6162294},
 {'name': '福岡国際センター',
  'name_kana': 'フクオカコクサイセンター',
  'address': '福岡市博多区築港本町2-2',
  'latitude': 33.603092,
  'longitude': 130.40115,
  'geo_p': '33.603093,130.40116',
  'phone_number': '(092)272-1111',
  'category': 'culture',
  'id': '5f6a9a33-e87a-4e48-8f65-9dd41cfddea7',
  '_version_': 1672992268495093760,
  'score': 0.6162294},
 {'name': '福岡サンパレス',
  'name_kana': 'フクオカサンパレス',
  'address': '福岡市博多区築港本町2-1',
  'latitude': 33.60377,
  'longitude': 130.40219,
  'geo_p': '33.603772,130.402189',
  'phone_number': '(092)272-1123',
  'category': 'culture',
  'id': '31865fc3-cfae-498f-94f8-ec45f1891a40',
  '_version_': 1672992268499288064,
  'score': 0.6162294},
