# ライブラリの読み込み

In [2]:
#import libraries
#basic libralres
import pandas as pd
import numpy as np
#pre-processing library
from sklearn.preprocessing import OneHotEncoder
#processing libraries
from sklearn.metrics.pairwise import cosine_similarity
import heapq
#saving libraries
import openpyxl as op

# ファイルの読み込みとデータの前処理

In [3]:
#load data 
item = pd.read_csv('random/all/item_context.csv')

In [4]:
#One-Hot-Encoding to calculate cosine similarity
item_encoded = pd.get_dummies(item, columns=["item_feature_1", "item_feature_2", "item_feature_3"])

コサイン類似度の測定のために特徴量をベクトル化しているのでラベルエンコーディングのような方法は使えない
<br>
例えば「赤,Mサイズ」の特徴を持つ商品Aと「青,Mサイズ」の特徴を持つ商品Bを比較する時にラベルエンコーディングをしてしまうと、Aの特徴量ベクトルが(1,1)、Bが(2,1)のようになるが、コサイン類似度は1(なす角0度)となってしまう。一方でカテゴリ変数化することによって、Aの特徴量ベクトルが(1,0,1)、Bが(0,1,1)のようになり、異なる特徴を持つ商品として、コサイン類似度0.5を得ることができる。

In [5]:
#check item_context
item_encoded

Unnamed: 0,item_id,item_feature_0,item_feature_1_16448dc139799f910b997030f18d9db1,item_feature_1_21960468ff88ee1b782d2ef9d4ad45bf,item_feature_1_3d81a60e324a092ee5b1716c2e293693,item_feature_1_9b6d0728ea28dd8aa485eb824ce58f05,item_feature_1_9eeef99c68d05dee33645a764e61f0ff,item_feature_1_a5686bc5a9899026349b870a30bf4376,item_feature_1_b33f60576047e6435223a7311bb7ed86,item_feature_1_bfcc6df8d05cd9f88e9f8cd2e68004e0,...,item_feature_2_dc8c29cfc2640a557da8cc20867840b0,item_feature_2_f4ae697a3823f829e791beef466f848b,item_feature_2_f676eed05902d205b7b42fd403171834,item_feature_3_029e121cef34dc8045ea5eed5cc181a9,item_feature_3_57bed6721fd0a0f839271cdf0e9fb2c1,item_feature_3_6c49bd74ff3b09c44fee22c764bb5b2b,item_feature_3_ad64ce82bf623357a97a1164f463a5da,item_feature_3_c3076613abe7b4363a68a27890d8ba57,item_feature_3_d7d5b814435d18e8d2a5a7563fdaf1e6,item_feature_3_ff8b1f6688f83613aecd3fbc881bafb0
0,0,-0.499172,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,-0.543775,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2,0.972752,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3,-0.521473,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,4,1.909430,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,75,-0.432266,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
76,76,-0.610681,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
77,77,-1.056718,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
78,78,-0.588379,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


カテゴリ変数化が成功していることが確認できた

# コンテンツベースフィルタリングの実装

### コンテンツベースフィルタリングの実装

In [30]:
#Implement containt-based filtering
def containt_based():
    
    #Define cosine similarity and register as the function
    def cos_sim(A, B):
        return np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B))
    
    #List-ize the features of items for each product
    item_id_n_feature = []
    for n in range(80):
        n_feature = np.array(item_encoded.iloc[n,1:])
        item_id_n_feature.append(n_feature)
        
    #Select one product and calculate the cosine similarity with other products
    #You can select and input item_id 
    ns_cos_sim_for_i = []
    print('Input item_id(0~79) and you got item_id which you should reccomend by containts base filtering')
    n = int(input())       
    if (n < 0 or n > 79):
        print('Please input 0~79')
    for j in range(80):
        cos = cos_sim(item_id_n_feature[n], item_id_n_feature[j])
        ns_cos_sim_for_i.append(cos)
        max_cos_list = heapq.nlargest(4, ns_cos_sim_for_i)
    
    #Display the top three item_id-s of items, which items have similar feature values to the selected item
    for n in range(3):
         print('item_id' + str([i for i, x in enumerate(ns_cos_sim_for_i) if x == max_cos_list[n+1]]) + ' should be reccomended.')

### 正常に機能するか確認

In [31]:
#You can actually try 
containt_based()

Input item_id(0~79) and you got item_id which you should reccomend by containts base filtering
1
item_id[27, 62] should be reccomended.
item_id[27, 62] should be reccomended.
item_id[0, 5] should be reccomended.


# レコメンドの結果をファイル出力

### ファイル出力用に関数を編集する

In [20]:
#Implement containt-based filtering for output
def containt_based_2(n,y):
    
    #Define cosine similarity and register as the function
    def cos_sim(v1, v2):
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    
    #List-ize the features of items for each product
    item_id_n_feature = []
    for i in range(80):
        n_feature = np.array(item_encoded.iloc[i,1:])
        item_id_n_feature.append(n_feature)
        
    #Select one product and calculate the cosine similarity with other products
    #You can select and input item_id 
    ns_cos_sim_for_i = []
    if (n < 0 or n > 79):
        print('Please input 0~79')
    for j in range(80):
        cos = cos_sim(item_id_n_feature[n], item_id_n_feature[j])
        ns_cos_sim_for_i.append(cos)
        max_cos_list = heapq.nlargest(4, ns_cos_sim_for_i)
    
    #Display the top three item_id-s of items, which items have similar feature values to the selected item
    return [i for i, x in enumerate(ns_cos_sim_for_i) if x == max_cos_list[y+1]]

### 正常に機能するか確認

In [25]:
containt_based_2(1,0)

[27, 62]

item_id1と最も似ているアイテムはid27と62という意味

# ファイルに出力して保存

In [26]:
book = op.Workbook()
sheet = book.worksheets[0]

In [29]:
sheet.cell(row = 1, column = 1).value = 'item_id'                #１列目には推薦の基となるitem_idが出力される(例えば、その時見ているアイテム)
sheet.cell(row = 1, column = 2).value = 'first_reccomendation'   #２列目には最も推薦するべきアイテムのitem_idが出力される
sheet.cell(row = 1, column = 3).value = 'second_reccomendation'  #３列目には次に推薦するべきアイテムのitem_idが出力される
sheet.cell(row = 1, column = 4).value = 'third_reccomendation'   #４列目にはさらに次に推薦するべきアイテムのitem_idが出力される

#Output 
for i in range(80):
    sheet.cell(row = i+2, column = 1).value = i
    sheet.cell(row = i+2,column = 2).value = str(containt_based_2(i,0))
    sheet.cell(row = i+2, column = 3).value = str(containt_based_2(i,1))
    sheet.cell(row = i+2, column = 4).value = str(containt_based_2(i,2))

#save file
book.save('result.xlsx')
book.close()