クエリセットの用意
クエリに対するドキュメントセットの用意
スコアの取得

In [None]:
# Setup: install the Python dependencies, then download and unpack svm_rank.
# NOTE(review): package versions are not pinned, and `!pip3` may install into a
# different environment than the running kernel -- consider `%pip install`.
!pip3 install pandas requests janome
# Download the svm_rank Linux 64-bit archive (plain-HTTP URL).
!wget http://download.joachims.org/svm_rank/current/svm_rank_linux64.tar.gz
# Unpack into the working directory; a later cell runs ./svm_rank_learn from here.
!tar -zxvf svm_rank_linux64.tar.gz 


In [5]:
import pandas as pd
# Let displayed DataFrames show up to 200 rows before being truncated.
pd.set_option('display.max_rows', 200)
import requests
import random
# Seed the stdlib RNG that token_diversificate draws from, so the generated
# queries are repeatable from a fresh kernel.
random.seed(0)

from janome.tokenizer import Tokenizer
# Single tokenizer instance reused by query_diversificate.
tokenizer = Tokenizer()


def token_diversificate(token):
    """Return a randomly perturbed spelling of a single token.

    One draw from ``random.random()`` selects the variant:

    * below 0.7  -- the surface form, unchanged;
    * 0.7 to 0.8 -- the token's reading, imitating a conversion mistake;
    * 0.8 to 0.9 -- the surface form without its last character;
    * otherwise  -- the surface form without its first character.

    Args:
        token: Object exposing ``surface`` and ``reading`` string attributes
            (the tokens produced by ``tokenizer.tokenize``).

    Returns:
        str: The selected variant. It can be empty when a one-character
        surface form loses its only character.
    """
    rand = random.random()
    if rand < 0.7:
        return token.surface
    if rand < 0.80:
        reading = token.reading
        # janome uses "*" for tokens without a known reading. Emitting it would
        # inject a Solr wildcard into the generated query, so keep the surface
        # form in that case (and when the reading is missing altogether).
        if not reading or reading == "*":
            return token.surface
        return reading
    if rand < 0.90:
        # Drop the last character (simulated omission).
        return token.surface[:-1]
    # Drop the first character (simulated omission).
    return token.surface[1:]


def query_diversificate(query):
    """Build a noisy variant of ``query`` by perturbing it token by token.

    The text is split with the module-level ``tokenizer``; every token goes
    through ``token_diversificate`` and the pieces are concatenated in their
    original order without any separator.
    """
    perturbed_pieces = []
    for token in tokenizer.tokenize(query):
        perturbed_pieces.append(token_diversificate(token))
    return "".join(perturbed_pieces)
    
# URL of the Solr select handler ("fukuoka" core/collection on localhost) that
# every request in this notebook is sent to.
solr_endpoint="http://localhost:8983/solr/fukuoka/select"

In [15]:
# Spot-check ten randomly drawn facilities: original name next to its noisy query.
# NOTE: `df_all_spots` and its "query" column are created by the cell that
# follows this one in the notebook, so that cell has to be executed first.
df_all_spots[["name", "query"]].sample(10)

Unnamed: 0,name,query
85,福岡市博物館,フクオカ市ハクブツカン
17,障がい者スポーツセンター,障がいスポーセンター
50,水産加工センター運動広場,水工センター運動場
12,中央市民プール,中央市プール
52,総合図書館,総合図書館
46,博多南地域交流センター,博多南地域交流センター
37,舞鶴公園,舞鶴公園
25,香椎浜公園,香椎園
11,博多市民プール,博多民ール
82,博多南地域交流センター,博多南地域交流センター


In [7]:
# Fetch every document from Solr to obtain the id <-> facility-name mapping.
# The data set holds fewer than 100 facilities, so rows=100 is used to get all of them.

r = requests.get(solr_endpoint, params={"q":"*:*", "rows":"100"})
# Surface HTTP errors here with their status code instead of failing later with
# a KeyError on the missing "response" key.
r.raise_for_status()
dict_all_spots = r.json()["response"]["docs"]
df_all_spots = pd.DataFrame.from_dict(dict_all_spots)

# Create one noisy query per facility by perturbing its name
# (wrong conversions and dropped characters).
df_all_spots["query"] = df_all_spots["name"].map(query_diversificate)

# Assign `qid`, the sequential per-request number needed for training.
df_all_spots["qid"] = range(len(df_all_spots))

# The facility id that belongs to each qid is the positive example; keep the
# pairing here so it can be joined with the feature table later.
df_query2id_poslabel = df_all_spots[["qid", "id"]].copy()
df_query2id_poslabel["label"]=1

In [17]:
# Helper that queries Solr and returns the id -> score table for one field

def get_query2scores(query, field, rows=100):
    """Search a single Solr field and return the scores of the matching docs.

    Args:
        query: Query text; it is placed verbatim inside ``field:(...)``.
        field: Name of the Solr field to search.
        rows: Maximum number of documents to request. Defaults to 100, the
            limit this helper previously hard-coded.

    Returns:
        pandas.DataFrame: One row per returned document, with the Solr
        ``score`` column renamed to ``"<field>_score"`` next to ``id``. When
        no document matches, the frame is empty and has no columns.

    Raises:
        requests.HTTPError: If Solr answers with an error status.
    """
    payload = {
        "q": f"{field}:({query})",
        "fl": "id,score",
        "rows": str(rows),
    }
    r = requests.get(solr_endpoint, params=payload)
    # Report a failed request with its HTTP status instead of a KeyError on
    # the missing "response" key below.
    r.raise_for_status()
    dict_query2scores = r.json()["response"]["docs"]
    return pd.DataFrame.from_dict(dict_query2scores).rename(columns={"score": f"{field}_score"})

def get_query2scores_with_qid(query, field, qid):
    """Return the scores for ``query`` on ``field``, tagged with the request id.

    Delegates the search to ``get_query2scores`` and adds a ``qid`` column
    holding ``qid`` on every row, so results of different requests can be
    stacked and still told apart.
    """
    scored_docs = get_query2scores(query, field)
    scored_docs["qid"] = qid
    return scored_docs

In [18]:
# Collect the Solr scores of every generated query, one table per search field.
def _collect_scores_for_field(field):
    """Run each (query, qid) of df_all_spots against `field` and stack the results."""
    per_query_frames = []
    for query, qid in zip(df_all_spots["query"], df_all_spots["qid"]):
        per_query_frames.append(get_query2scores_with_qid(query, field, qid))
    return pd.concat(per_query_frames)


scores_name = _collect_scores_for_field("name")
scores_name_yomi = _collect_scores_for_field("name_yomi")
scores_name_ngram = _collect_scores_for_field("name_ngram")

In [19]:
# Combine the three per-field score tables on (qid, id).
# A document found through one field but not through another counts as score 0
# for the field that missed it, hence the outer joins followed by zero-filling.

scores = (
    scores_name
    .merge(scores_name_yomi, how="outer", on=["qid", "id"])
    .merge(scores_name_ngram, how="outer", on=["qid", "id"])
    .fillna(0)
)

In [20]:
# Attach the correct facility id of each qid (label 1); every other (qid, id)
# row has no match in the left join and is zero-filled, i.e. gets label 0.
scores_with_label = (
    scores
    .merge(df_query2id_poslabel, how="left", on=["qid", "id"])
    .fillna(0)
)
# The integer cast of the label column is intentionally left disabled:
# scores_with_label["label"] = scores_with_label["label"].astype("int")

In [21]:
# Pair-wise learning compares rows with different labels inside the same qid.
# A qid whose rows all carry one and the same label can be used neither for
# training nor for evaluation, so such qids are removed from the data here.

label_kinds_per_qid = scores_with_label.groupby("qid")["label"].nunique()
qid2unique_label_cnt = label_kinds_per_qid.reset_index()
has_single_label = qid2unique_label_cnt["label"] == 1
qid_only_same_label = qid2unique_label_cnt.loc[has_single_label, "qid"]

keep_row = ~scores_with_label["qid"].isin(qid_only_same_label)
scores_dropped_only_same_label = scores_with_label[keep_row].sort_values("qid")

In [22]:
# Split the qids half-and-half into train and test and write both sets to disk.
# For svm_rank every feature is rewritten as `<feature number>:<value>`.

all_qids = scores_dropped_only_same_label["qid"].drop_duplicates()
train_qid = all_qids.sample(frac=0.5, random_state=0)
in_train = scores_dropped_only_same_label["qid"].isin(train_qid)

# Training file columns, in order: label, qid:<n>, 1:<name>, 2:<name_yomi>, 3:<name_ngram>
df_train = scores_dropped_only_same_label[in_train]
df_train4svm_rank = df_train[["label"]].copy()
df_train4svm_rank["qid"] = df_train["qid"].map(lambda qid_value: f"qid:{qid_value}")
feature_columns = ["name_score", "name_yomi_score", "name_ngram_score"]
for feature_no, column in enumerate(feature_columns, start=1):
    df_train4svm_rank[column] = df_train[column].map(
        lambda value, no=feature_no: f"{no}:{value}"
    )
df_train4svm_rank.to_csv("./train.dat", index=False, header=False, sep=" ")

# Everything that is not in the training half is kept, unformatted, for evaluation.
df_test = scores_dropped_only_same_label[~in_train].copy()
df_test.to_csv("./test.tsv", index=False, sep="\t")

In [23]:
# Display the held-out rows (the same frame that was written to ./test.tsv).
df_test

Unnamed: 0,id,name_score,qid,name_yomi_score,name_ngram_score,label
0,5dbe04fa-9937-4b65-ab04-4eabf0322896,1.145546,0,1.145546,2.453156,0.0
1291,0abf9d0c-6f05-4d14-951d-9f7cfedf0638,0.000000,0,0.756985,0.000000,0.0
1292,f33d727a-8354-45a3-8222-d6bb82139fa8,0.000000,0,0.756985,0.000000,0.0
1293,00331b3b-c9f3-461b-9991-87c6cf5c05cf,0.000000,0,0.756985,0.000000,0.0
1294,83fd5dc4-078a-4515-82b6-5665c518f1ae,0.000000,0,0.756985,0.000000,0.0
...,...,...,...,...,...,...
1263,9ed6f5be-f3f7-457c-8aa9-7b3941cc0bba,0.683343,86,0.683343,2.015928,0.0
1264,60a1e4a4-a26a-41cb-ac52-ad89fc08b3dc,0.603847,86,0.603847,1.676440,0.0
1265,2e4b72f5-2b3b-4047-a111-381e3d9a8ae5,0.603847,86,0.603847,1.507225,0.0
1266,886e4320-e44c-4ee6-ad65-53fc2f27ea4f,0.603847,86,0.603847,1.676440,0.0


In [24]:
# Train svm_rank on train.dat with option `-c 3`, then print the model file.
# NOTE(review): no model path is passed, so svm_rank_learn presumably writes to
# its default `svm_struct_model`, which is the file shown by `cat` below.
!./svm_rank_learn -c 3 train.dat
!cat svm_struct_model

Reading training examples...done
Training set properties: 3 features, 42 rankings, 844 examples
NOTE: Adjusted stopping criterion relative to maximum loss: eps=0.019095
Iter 1: .........*(NumConst=1, SV=1, CEps=19.0952, QPEps=0.0000)
Iter 2: .........*(NumConst=2, SV=1, CEps=4.6874, QPEps=0.0000)
Iter 3: .........*(NumConst=3, SV=1, CEps=2.7067, QPEps=0.0000)
Iter 4: .........*(NumConst=4, SV=1, CEps=1.4227, QPEps=0.6373)
Iter 5: .........*(NumConst=5, SV=2, CEps=1.3502, QPEps=0.0000)
Iter 6: .........*(NumConst=6, SV=3, CEps=0.5485, QPEps=0.0665)
Iter 7: .........*(NumConst=7, SV=3, CEps=0.1737, QPEps=0.0162)
Iter 8: .........*(NumConst=8, SV=3, CEps=0.0536, QPEps=0.0087)
Iter 9: .........*(NumConst=9, SV=2, CEps=0.0229, QPEps=0.0000)
Iter 10: .........(NumConst=9, SV=2, CEps=0.0124, QPEps=0.0000)
Final epsilon on KKT-Conditions: 0.01235
Upper bound on duality gap: 0.03702
Dual objective value: dval=4.57710
Primal objective value: pval=4.61412
Total number of constraints in final work

In [3]:
# Reload the held-out set from the TSV file written earlier.
# NOTE(review): the execution counter restarts at this cell, which suggests the
# kernel was restarted; the import cell must be re-run before this one.
df_test = pd.read_csv("./test.tsv", sep="\t")

In [4]:
# Read the weights learned by svm_rank.
# Solr does not allow a negative coefficient on a score, so negative weights
# are replaced with 0.

with open("svm_struct_model") as f:
    # The last line of the model file carries the weights as `index:value` tokens
    # after a leading coefficient token.
    svm_rank_result = f.readlines()[-1].split()

# Index the weights by feature number instead of by token position. The
# `index:value` notation is sparse, so a feature may presumably be absent from
# the line; positional access would then shift the remaining weights onto the
# wrong features or raise an IndexError.
svm_rank_weights = {}
for svm_rank_token in svm_rank_result[1:]:
    if svm_rank_token.startswith("#"):
        # Anything after "#" is not part of the weight vector.
        break
    if ":" not in svm_rank_token:
        continue
    feature_no, _, weight_text = svm_rank_token.partition(":")
    svm_rank_weights[int(feature_no)] = float(weight_text)


def _non_negative_weight(feature_no):
    """Return the learned weight of `feature_no`, or 0 if it is absent or not positive."""
    weight = svm_rank_weights.get(feature_no, 0)
    return weight if weight > 0 else 0


# Feature numbers follow the training file: 1=name, 2=name_yomi, 3=name_ngram.
coef_name_score = _non_negative_weight(1)
coef_name_yomi_score = _non_negative_weight(2)
coef_name_ngram_score = _non_negative_weight(3)


In [8]:
# Learned ranking score: weighted sum of the three per-field Solr scores.
weighted_name = coef_name_score * df_test["name_score"]
weighted_name_yomi = coef_name_yomi_score * df_test["name_yomi_score"]
weighted_name_ngram = coef_name_ngram_score * df_test["name_ngram_score"]
df_test["ml_score"] = weighted_name + weighted_name_yomi + weighted_name_ngram

In [9]:
# Top-1 accuracy per ranking signal: for every qid, compare the row holding the
# highest label (the correct facility) with the row holding the highest score.
score_fields = ["name_score", "name_yomi_score", "name_ngram_score", "ml_score"]

# Limit idxmax to the label and score columns. df_test also has a string `id`
# column for which an arg-max is meaningless; newer pandas versions raise on
# such columns instead of silently dropping them.
df4eval = df_test.groupby("qid")[["label"] + score_fields].idxmax().reset_index()
for field in score_fields:
    # A qid counts as correct when both arg-max lookups point at the same row.
    accuracy = len(df4eval[df4eval["label"] == df4eval[field]])/len(df4eval)
    print(f"{field}ソートのaccuracy: {int(accuracy*100)}%")

name_scoreソートのaccuracy: 71%
name_yomi_scoreソートのaccuracy: 76%
name_ngram_scoreソートのaccuracy: 78%
ml_scoreソートのaccuracy: 90%


In [24]:
# Try the learned ranking on a live query

def ml_ranked_search(query):
    """Search Solr with the three name fields boosted by the learned weights.

    ``query`` is searched in ``name``, ``name_yomi`` and ``name_ngram``; the
    three clauses are OR-ed and boosted with ``coef_name_score``,
    ``coef_name_yomi_score`` and ``coef_name_ngram_score`` respectively. The
    final request URL is printed for inspection.

    Args:
        query: Query text; it is placed verbatim inside each ``field:(...)``.

    Returns:
        list: The ``docs`` list of the Solr response, each entry holding the
        requested ``name`` and ``score`` fields.

    Raises:
        requests.HTTPError: If Solr answers with an error status.
    """
    payload = {
        "q":f"name:({query})^{coef_name_score} OR name_yomi:({query})^{coef_name_yomi_score} OR name_ngram:({query})^{coef_name_ngram_score}",
        "fl":"name,score"
    }
    r = requests.get(solr_endpoint, params=payload)
    # Print before the status check so the failing URL is visible as well.
    print(r.url)
    # Report a rejected query with its HTTP status instead of a KeyError on
    # the missing "response" key.
    r.raise_for_status()
    return r.json()["response"]["docs"]

In [26]:
# Example: a hiragana query with a full-width space, ranked with the learned weights.
print(ml_ranked_search("ふくおか　体育"))

http://localhost:8983/solr/fukuoka/select?q=name%3A%28%E3%81%B5%E3%81%8F%E3%81%8A%E3%81%8B%E3%80%80%E4%BD%93%E8%82%B2%29%5E0.082438953+OR+name_yomi%3A%28%E3%81%B5%E3%81%8F%E3%81%8A%E3%81%8B%E3%80%80%E4%BD%93%E8%82%B2%29%5E0.67509377+OR+name_ngram%3A%28%E3%81%B5%E3%81%8F%E3%81%8A%E3%81%8B%E3%80%80%E4%BD%93%E8%82%B2%29%5E0.73553967&fl=name%2Cscore
[{'name': '福岡市民体育館', 'score': 1.2686647}, {'name': '福岡市総合体育館', 'score': 1.1592816}, {'name': '東体育館', 'score': 0.9021969}, {'name': '南体育館', 'score': 0.9021969}, {'name': '西体育館', 'score': 0.9021969}, {'name': '博多体育館', 'score': 0.83179194}, {'name': '中央体育館', 'score': 0.83179194}, {'name': '城南体育館', 'score': 0.83179194}, {'name': '早良体育館', 'score': 0.83179194}, {'name': 'ももち体育館', 'score': 0.77158}]
