<a href="https://colab.research.google.com/github/rujunz/data-course-sample/blob/main/A4_%E5%AF%A6%E4%BD%9C%E3%80%8CContent_based_Filtering%E3%80%8D%E7%9A%84%E6%8E%A8%E8%96%A6%E7%B3%BB%E7%B5%B1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 基礎建設

In [1]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-03 15:14:12--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2022-01-03 15:14:13 (25.1 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2022-01-03 15:14:13--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2022-01-03 15:14:14 (19.5 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [3]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [None]:
metadata.head()

In [None]:
ratings.head()

## 資料切分

In [4]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

In [5]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
testing_users = list(ratings_testings_by_user.keys())

## 資料整理

In [6]:
metadata = metadata.set_index('asin')
metadata['last_review_date'] = ratings_trainings.groupby(by=['asin']).max()['DATE']
metadata = metadata.reset_index()

In [7]:
metadata = metadata[['asin', 'brand', 'title', 'price', 'description','last_review_date']]

In [8]:
import numpy as np
for col in metadata.columns:
  metadata[col] = metadata[col].apply(lambda x: np.nan if (isinstance(x, list) and len(x)==0) else (np.nan if x == '' else x))

In [9]:
metadata['price'] = metadata['price'].str.replace('$','')

In [10]:
metadata['price'] = metadata['price'].apply(lambda x: (float(x) if len(x)<=6 else np.nan) if x is not None and not isinstance(x, float) else x)

In [11]:
metadata['description'] = metadata['description'].apply(lambda x: (" ".join(x) if isinstance(x, list) else x) if x is not None else x)

In [12]:
metadata[metadata['title'].isnull()]

Unnamed: 0,asin,brand,title,price,description,last_review_date
27017,B01A9MZLS4,BCW,,,,2018-08-10


In [13]:
metadata = metadata.dropna(subset=['title'])

In [14]:
metadata['description'] = metadata['description'].fillna(metadata['title'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
import re
metadata['description'] = metadata['description'].apply(lambda x: " ".join(re.sub("(<br/>)|(<br>)|(<br\s*/><br\s*/>)|(\-)|(\/)|(\n)|(\t)|(;)|(&amp)", " ", re.sub("[.;:!\'?,\"()[]#]", "", x.lower())).split()))
metadata['title'] = metadata['title'].apply(lambda x: " ".join(re.sub("(<br/>)|(<br>)|(<br\s*/><br\s*/>)|(\-)|(\/)|(\n)|(\t)|(;)|(&amp)", " ", re.sub("[.;:!\'?,\"()[]#]", "", x.lower())).split()))

In [16]:
bought = []
for user in testing_users:
  bought.extend(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())

In [17]:
metadata = metadata[(metadata['last_review_date'] >= '2018-08-01')|(metadata['asin'].isin(bought))]

In [18]:
metadata = metadata.drop_duplicates()
metadata = metadata.reset_index()
metadata = metadata.drop('index', axis=1)

## 產生推薦

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 計算商品用標題所表示的 tfidf 矩陣
df = metadata.drop_duplicates('title')
tf = TfidfVectorizer(analyzer='word')
tfidf_matrix = tf.fit_transform(df['title'])

# 計算商品間的相似程度
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(tfidf_matrix)
mapping = pd.Series(df.index,index = df['title'])

# 每個商品回傳 k 個最相近的商品
def recommend_item(item_input, k=2):
    try:
        item_index = mapping[item_input]
        similarity_score = list(enumerate(similarity_matrix[item_index]))
        similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
        similarity_score = similarity_score[:k]
        item_indices = [i[0] for i in similarity_score]
        return (df['asin'].iloc[item_indices].tolist())
    except:
        return []

# 利用使用者購買過的商品產生推薦
def recommend_items(items, k):
    res = []
    for d in items:
        res.extend(recommend_item(d, k))
    return res

In [20]:
def recommender(training_data, users=[], k=10):
    recommendations = {}
    ratings_trainings = training_data
    for user in users:
      if user in ratings_trainings['reviewerID'].to_list():
        recommendations[user] = recommend_items(metadata[metadata['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['title'].tolist(), k)
      else:
        recommendations[user] = ratings_trainings[ratings_trainings['DATE'] >= '2018-08-01']['asin'].value_counts().head(k).keys().values.tolist()
    return recommendations

ratings_by_user = recommender(ratings_trainings, testing_users)

In [None]:
ratings_by_user

## 結果評估

In [22]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))
    score = total / len(ratings_testings)
    return score

In [23]:
evaluate(ratings_testings_by_user, ratings_by_user)

0.1576271186440678