<a href="https://colab.research.google.com/github/ShouMaGooo/MachineLearning_test/blob/main/recommend_cos0_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# データセットのダウンロード用
import requests
import zipfile

# ベクトル計算用
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# 処理時間計測用
import time

In [2]:
# データセットをダウンロードし解凍
url = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
zip_path = "ml-1m.zip"

response = requests.get(url)
with open(zip_path, "wb") as f:
    f.write(response.content)

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(".")

In [3]:
# 作品情報を読み込みデータフレーム化
movie_columns = ["movie_id", "title", "genres"]
movies_df = pd.read_csv(
    "./ml-1m/movies.dat",
    sep="::",
    engine="python",
    names=movie_columns,
    encoding="latin1"
)
# 確認
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# レビュー情報を読み込み
ratings_columns = ["user_id", "movie_id", "rating", "timestamp"]
ratings_df = pd.read_csv(
    "./ml-1m/ratings.dat",
    sep="::",
    engine="python",
    names=ratings_columns
)
#
item_indexed_rating_df = pd.pivot_table(
    data=ratings_df.drop("timestamp", axis=1),
    index="movie_id",
    columns="user_id",
    values="rating"
)
# NaNは0で埋める
item_indexed_rating_df = item_indexed_rating_df.fillna(0)
# 確認
item_indexed_rating_df

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# コサイン類似度を練習してみるよ！
'''
# サンプル
vector_A = np.array([1, 0, 1])
vector_B = np.array([0, 1, 1])

# 内積計算
dot_product = np.dot(vector_A, vector_B)
# ノルム（ベクトルの大きさ）の計算
norm_A = np.linalg.norm(vector_A)
norm_B = np.linalg.norm(vector_B)

# cos類似度計算
cosine_similarity = dot_product / (norm_A * norm_B)
print("Cosine_similarity:", cosine_similarity)
# Result -> Cosine_similarity: 0.4999999999999999

'''

Cosine_similarity: 0.4999999999999999


In [7]:
#実際のデータでコサイン類似度を計算
from sklearn.metrics.pairwise import cosine_similarity # Re-import the function

sim = cosine_similarity(item_indexed_rating_df.values)
similarity_df = pd.DataFrame(
    sim,
    index=item_indexed_rating_df.index,
    columns=item_indexed_rating_df.index
)
similarity_df

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.390349,0.267943,0.178789,0.256569,0.347373,0.301490,0.125709,0.106620,0.377459,...,0.099502,0.020966,0.084105,0.081826,0.045949,0.309676,0.186633,0.093479,0.042829,0.182691
2,0.390349,1.000000,0.240946,0.155457,0.249970,0.244827,0.262772,0.196521,0.158469,0.386200,...,0.061819,0.015209,0.075310,0.095573,0.074271,0.213650,0.140781,0.087013,0.026063,0.122185
3,0.267943,0.240946,1.000000,0.192788,0.308290,0.187020,0.292230,0.092122,0.128378,0.245601,...,0.038492,0.065507,0.049512,0.087377,0.050985,0.190575,0.104837,0.062258,0.010073,0.097786
4,0.178789,0.155457,0.192788,1.000000,0.271990,0.125170,0.220024,0.049554,0.060334,0.133707,...,0.055486,0.053300,0.002227,0.025278,0.025204,0.118902,0.096318,0.022588,0.024769,0.095154
5,0.256569,0.249970,0.308290,0.271990,1.000000,0.148114,0.305107,0.095512,0.138392,0.237681,...,0.026632,0.083898,0.046399,0.047542,0.016156,0.174554,0.092403,0.051633,0.010750,0.112835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.309676,0.213650,0.190575,0.118902,0.174554,0.236447,0.191689,0.090387,0.092347,0.237227,...,0.183859,0.053539,0.109062,0.210272,0.078341,1.000000,0.329339,0.168234,0.122279,0.363838
3949,0.186633,0.140781,0.104837,0.096318,0.092403,0.201419,0.117660,0.080523,0.099554,0.136374,...,0.244371,0.098568,0.070933,0.160150,0.107063,0.329339,1.000000,0.302649,0.199337,0.347805
3950,0.093479,0.087013,0.062258,0.022588,0.051633,0.115331,0.059262,0.084976,0.004956,0.097170,...,0.126068,0.211891,0.057350,0.124186,0.095905,0.168234,0.302649,1.000000,0.202809,0.234638
3951,0.042829,0.026063,0.010073,0.024769,0.010750,0.029136,0.036102,0.072141,0.000000,0.018359,...,0.170983,0.132019,0.086057,0.104873,0.015847,0.122279,0.199337,0.202809,1.000000,0.192972


In [8]:
def recommend_similar_item(base_item_id: int, rating_df, movies_df, similarity_df):
  """
  指定したitem_idに類似したitem_idを探索する関数.

  Args:
      base_item_id (int): 対象とする作品のID.
      rating_df (DataFrame): 表形式のRating情報のDataFrame.
      movies_df (DataFrame): IDとタイトル情報を持つDataFrame.
      similarity_df (DataFrame): 類似度を格納したDataFrame.

  Returns:
      DataFrame: 最も類似度の高い11作品（元作品含む）のDataFrame.
  """
  # 処理時間計測（任意）
  start_time = time.time()

  # base_item_idに合致する行が無い場合はエラーを返す（任意）
  if base_item_id not in rating_df.index:
    raise ValueError(f"base_item_id {base_item_id} is not found in rating_df.")

  # ログ（任意）
  base_item_title = movies_df[movies_df["movie_id"] == base_item_id]["title"]
  print(base_item_title, "に近い作品を探索します...")

  # 類似度を格納するDataFrameを作成
  item_sim_list = pd.DataFrame(columns=['base_item_id', 'target_item_id', 'similarity'])
  item_sim_list["similarity"] = similarity_df[base_item_id]
  item_sim_list["target_item_id"] = similarity_df.index
  item_sim_list["base_item_id"] = base_item_id

  # item_sim_listとitem_master_dfを結合
  item_sim_list_with_title = pd.merge(
      left=item_sim_list,
      right=movies_df,
      left_on="target_item_id",
      right_on="movie_id",
      how="outer"
  )

	# 類似度の高い作品を抽出
  recommends_top_ten = item_sim_list_with_title.sort_values(
      by="similarity",
      ascending=False
  ).head(11)[
      ["target_item_id",'title','similarity']
  ]

  print("実行時間:", time.time() - start_time, "sec.")
  return recommends_top_ten


In [9]:
# 推薦実行
recommend_similar_item(1580, item_indexed_rating_df, movies_df, similarity_df)


1539    Men in Black (1997)
Name: title, dtype: object に近い作品を探索します...
実行時間: 0.023384571075439453 sec.


Unnamed: 0,target_item_id,title,similarity
1539,1580.0,Men in Black (1997),1.0
476,480.0,Jurassic Park (1993),0.728619
585,589.0,Terminator 2: Judgment Day (1991),0.696649
2502,2571.0,"Matrix, The (1999)",0.684763
2847,2916.0,Total Recall (1990),0.684222
770,780.0,Independence Day (ID4) (1996),0.654158
2559,2628.0,Star Wars: Episode I - The Phantom Menace (1999),0.633866
1178,1196.0,Star Wars: Episode V - The Empire Strikes Back...,0.629406
257,260.0,Star Wars: Episode IV - A New Hope (1977),0.628935
1192,1210.0,Star Wars: Episode VI - Return of the Jedi (1983),0.624777
