# Sample Code

## 基礎建設

In [1]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-07 03:55:17--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.5’


2022-01-07 03:55:17 (70.2 MB/s) - ‘All_Beauty.csv.5’ saved [15499476/15499476]

--2022-01-07 03:55:17--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.5’


2022-01-07 03:55:17 (65.7 MB/s) - ‘meta_All_Beauty.json.gz.5’ saved [10329961/10329961]



In [3]:
#metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [4]:
#metadata.head()

In [5]:
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


## 資料整理

In [6]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [7]:
ratings_trainings = ratings[(ratings['DATE'] < '2018-09-01')]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

## 產生推薦

In [None]:
#! pip install surprise

In [13]:
import pandas as pd
from itertools import combinations
from collections import defaultdict
from surprise import Reader, Dataset, KNNBasic

class RecSur:

    def __init__(self, training_data):
        self.training_data = training_data.sort_values("DATE", ascending=False).groupby(['reviewerID', 'asin']).head(1)

    def user_similarity(self, remove_obscure_user = True, user_rating_threshold = 3):

        # loading data from dataframe
        # user_to_items dict:
        # {
        #   'user': {
        #       'item': ratings...
        #   }...
        # }  
        user_to_items = defaultdict(dict)
        for _, row in self.training_data.iterrows():
            row = dict(row)
            user = row['reviewerID']
            item = row['asin']
            rating = float(row['overall'])

            user_to_items[user][item] = rating

        #print("total users before filtering: ", len(user_to_items))

        # remove obscure user to decrease data size
        # filtering params
        all_users = list(user_to_items.keys())
        for user in all_users:
            ratings = user_to_items[user]
            if remove_obscure_user and len(ratings) < user_rating_threshold:
                del user_to_items[user]

        # print("total users  after filtering: ", len(user_to_items))

        # generate item to user mapping dict
        # {
        #   'item': {
        #       'user': ratings...
        #   }...
        # }
        item_to_users = defaultdict(dict)
        for user, items in user_to_items.items():
            for item, rating in items.items():
                item_to_users[item][user] = rating

        # prepare data of computing user similarity 
        init_sim = lambda: [0 for _ in range(3)]
        factory = lambda: defaultdict(init_sim)
        pre_user_similarity = defaultdict(factory)
        n = len(item_to_users)
        index = 0
        for item, user_ratings in item_to_users.items():
            if len(user_ratings) > 1:
                # print(f"item: {item} have been rated by {len(user_ratings)} users progress: {index}/{n}")
                for user1, user2 in combinations(user_ratings.keys(), 2):
                    xy = user_ratings[user1] * user_ratings[user2]
                    xx = user_ratings[user1] ** 2
                    yy = user_ratings[user2] ** 2
                    pre_user_similarity[user1][user2][0] += xy
                    pre_user_similarity[user1][user2][1] += xx
                    pre_user_similarity[user1][user2][2] += yy

                    pre_user_similarity[user2][user1][0] += xy
                    pre_user_similarity[user2][user1][1] += xx
                    pre_user_similarity[user2][user1][2] += yy
            index += 1

        user_similarity = {}
        for src_user in pre_user_similarity:
            user_similarity_order = []
            for dst_user, val in pre_user_similarity[src_user].items():
                xy = val[0]
                xx = val[1]
                yy = val[2]
                div = ((xx*yy) ** 0.5)
                if div == 0:
                    continue
                similarity = xy / div
                if similarity < 0:
                    continue
                for i, s in enumerate(user_similarity_order):
                    target_similarity = s[1]
                    if target_similarity < similarity:
                        user_similarity_order.insert(i, (dst_user, similarity))
                        break
                else:
                    user_similarity_order.append((dst_user, similarity))
            user_similarity[src_user] = user_similarity_order
        
        return user_similarity, user_to_items

    def user_based_recommender(self, user_similarity, user_to_items, start_date, users=[], k=10, training_data = ''):
        # 建立 simple rule based建立 simple rule based
        if isinstance(training_data, pd.DataFrame):
            hot = training_data[(training_data['DATE'] >= start_date) & (training_data['DATE'] < '2018-09-01') & (training_data['overall'] >= 4)]
            hot = pd.DataFrame(hot.asin.value_counts())
            hot.reset_index(inplace=True)
            hot.columns = ['asin', 'count']

        recommendation = {}
        for user in users:
            if user in user_similarity:
                sim_users = user_similarity[user]
                recommended_items = []
                recommended_items_set = set()
                user_have_rated = set(user_to_items[user])
                stop_recommend = False
                for sim_user, _ in sim_users:
                    items_from_sim_user = sorted(list(user_to_items[sim_user].items()), key=lambda item: item[1])
                    for item, _ in items_from_sim_user:
                        if item not in recommended_items_set and item not in user_have_rated:
                            recommended_items.append(item)
                            recommended_items_set.add(item)
                        if len(recommended_items) >= k:
                            stop_recommend = True
                            break

                    # 推薦不滿 k 個用 rule-based 且使用者尚未評論過的補滿
                    if isinstance(training_data, pd.DataFrame):
                        if len(recommended_items) < k:
                            for item in hot.asin.tolist():
                                if item not in recommended_items and item not in user_have_rated:
                                    recommended_items.append(item)
                                if len(recommended_items) >= k:
                                    stop_recommend = True
                                    break

                    if stop_recommend:
                        break

                recommendation[user] = recommended_items
            else:
                # rule-based
                if isinstance(training_data, pd.DataFrame):
                    recommendation[user] = hot.asin[:k].tolist()
                else:
                    recommendation[user] = []
        return recommendation

    def item_based_recommender(self, users=[], k=10, rule_base=False, start_date = ''):
        if rule_base:
            assert start_date != '', "please fill start_date 'YYYY-MM-DD'"
            if isinstance(self.training_data, pd.DataFrame):
                hot = self.training_data[(self.training_data['DATE'] >= start_date) & (self.training_data['DATE'] < '2018-09-01') & (self.training_data['overall'] >= 4)]
                hot = pd.DataFrame(hot.asin.value_counts())
                hot.reset_index(inplace=True)
                hot.columns = ['asin', 'count']

        # loading data from dataframe
        # item_to_users dict:
        # {
        #   'item': {
        #       'user': ratings...
        #   }...
        # }
        item_to_users = defaultdict(dict)
        for _, row in self.training_data.iterrows():
            row = dict(row)
            user = row['reviewerID']
            item = row['asin']
            rating = float(row['overall'])
            item_to_users[item][user] = rating

        # print("data converted")

        user_to_items = defaultdict(dict)
        for item, rating_users in item_to_users.items():
            for user, rating in rating_users.items():
                user_to_items[user][item] = rating

        # print("data inverted")

        init_sim = lambda: [0, 0, 0]
        factory = lambda: defaultdict(init_sim)
        pre_item_similarity = defaultdict(factory)
        for user, items in user_to_items.items():
            if len(items) > 1:
                for i1, i2 in combinations(items.keys(), 2):
                    xy = items[i1] * items[i2]
                    xx = items[i1] ** 2
                    yy = items[i2] ** 2
                    pre_item_similarity[i1][i2][0] += xy
                    pre_item_similarity[i1][i2][1] += xx
                    pre_item_similarity[i1][i2][2] += yy

                    pre_item_similarity[i2][i1][0] += xy
                    pre_item_similarity[i2][i1][1] += xx
                    pre_item_similarity[i2][i1][2] += yy

        # print("sim data prepared")

        item_similarity = {}
        for src_item in pre_item_similarity:
            item_similarity_order = []
            for dst_item, val in pre_item_similarity[src_item].items():
                xy = val[0]
                xx = val[1]
                yy = val[2]
                div = ((xx*yy) ** 0.5)
                if div == 0:
                    continue
                similarity = xy / div
                if similarity < 0:
                    continue
                for i, s in enumerate(item_similarity_order):
                    target_similarity = s[1]
                    if target_similarity < similarity:
                        item_similarity_order.insert(i, (dst_item, similarity))
                        break
                else:
                    item_similarity_order.append((dst_item, similarity))
            item_similarity[src_item] = item_similarity_order

        # print(f"get {k} recommendation items for for user: {users}")

        recommendation = {}
        for user in users:
            items = []
            items_set = set()
            stop = False
            user_has_rated = set(user_to_items[user])
            for item in user_has_rated:
                if item in item_similarity:
                    for sim_item, _ in item_similarity[item]:
                        # skip the item user has rated
                        if sim_item not in user_has_rated and sim_item not in items_set:
                            items.append(sim_item)
                            items_set.add(sim_item)
                        if len(items) >= k:
                            stop = True
                            break
                    if stop:
                        break
            
            if rule_base and len(items) < k:
                hotlist = hot.asin.tolist()
                for item in hotlist:
                    if item not in items:
                        items.append(item)
                    if len(items) >= k:
                        break

            recommendation[user] = items
        return recommendation    

    def sur_algo(self, user_based=False, algo=KNNBasic):

        reader = Reader(rating_scale=(0, 5))
        self.data = self.training_data[self.training_data['DATE'] >= '2017-09-01'][['reviewerID', 'asin', 'overall']]
        self.data = Dataset.load_from_df(self.data, reader=reader)

        sim_options = {
            'name': 'cosine',
            'user_based': user_based
        }
        algo_impl = algo(sim_options = sim_options)
        trainset = self.data.build_full_trainset()
        algo_impl.fit(trainset)
        return algo_impl
    
    def sur_recommender(self, algo_impl, users=[], k=10, rule_base = False, start_date = ''):
        if rule_base:
            assert start_date != '', "please fill start_date 'YYYY-MM-DD'"
            if isinstance(self.training_data, pd.DataFrame):
                hot = self.training_data[(self.training_data['DATE'] >= start_date) & (self.training_data['DATE'] < '2018-09-01') & (self.training_data['overall'] >= 4)]
                hot = pd.DataFrame(hot.asin.value_counts())
                hot.reset_index(inplace=True)
                hot.columns = ['asin', 'count']

        recommendation = {}
        training_data = self.training_data[self.training_data['DATE'] >= '2017-09-01'][['reviewerID', 'asin', 'overall']]
        for user in users:
            items_user_rated = set(training_data.loc[training_data['reviewerID']==user]['asin'].tolist())
            recommend_item_list = []
            recommend_item_set = set()
            for item in items_user_rated:
                iid = algo_impl.trainset.to_inner_iid(item)
                recommend_item_iid = algo_impl.get_neighbors(iid, k)
                for sim_item_iid in recommend_item_iid:
                    item_raw_id = algo_impl.trainset.to_raw_iid(sim_item_iid)
                    if item_raw_id not in items_user_rated and item_raw_id not in recommend_item_set:
                        recommend_item_list.append(item_raw_id)
                        recommend_item_set.add(item_raw_id)

                if len(recommend_item_list) >= k:
                    recommend_item_list = recommend_item_list[:k]
                    break
                        
            if rule_base and len(recommend_item_list) < k:
                hotlist = hot.asin.tolist()
                for item in hotlist:
                    if item not in recommend_item_list:
                        recommend_item_list.append(item)
                    if len(recommend_item_list) >= k:
                        break
            
            recommendation[user] = recommend_item_list
        
        return recommendation

    def evaluate(self, ratings_testings_by_user={}, ratings_by_user={}, method=None):
        '''
        * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
        * ratings_by_user: dict 利用訓練資料學習的推薦商品
        * method: str
        * score: float
        '''
        total = 0
        for d in ratings_testings_by_user:
            if d in ratings_by_user:
                total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

        score = total / len(ratings_testings)
        return score


## 結果評估

In [16]:
start_dates = ['2017-09-01','2018-03-01','2018-06-01','2018-08-01']
ks = [10, 20, 30]

# set object
recsur = RecSur(ratings_trainings)

# user-based
user_simi, user_to_items = recsur.user_similarity()
for k in ks:
    ratings_by_user = recsur.user_based_recommender(user_simi, user_to_items, '', users, k=k)
    print(f'User-based 協同過濾, {k} 個推薦, Recall： {round(recsur.evaluate(ratings_testings_by_user, ratings_by_user), 3)}')
    for start_date in start_dates:
        ratings_by_user = recsur.user_based_recommender(user_simi, user_to_items, start_date, users, k=k, training_data=ratings_trainings)
        print(f'混合 Rule-based, 資料起始日：{start_date}, {k} 個推薦, Recall： {round(recsur.evaluate(ratings_testings_by_user, ratings_by_user), 3)}')
    print("-"*60)

del user_simi, user_to_items

User-based 協同過濾, 10 個推薦, Recall： 0.0
混合 Rule-based, 資料起始日：2017-09-01, 10 個推薦, Recall： 0.098
混合 Rule-based, 資料起始日：2018-03-01, 10 個推薦, Recall： 0.1
混合 Rule-based, 資料起始日：2018-06-01, 10 個推薦, Recall： 0.112
混合 Rule-based, 資料起始日：2018-08-01, 10 個推薦, Recall： 0.142
------------------------------------------------------------
User-based 協同過濾, 20 個推薦, Recall： 0.0
混合 Rule-based, 資料起始日：2017-09-01, 20 個推薦, Recall： 0.102
混合 Rule-based, 資料起始日：2018-03-01, 20 個推薦, Recall： 0.115
混合 Rule-based, 資料起始日：2018-06-01, 20 個推薦, Recall： 0.185
混合 Rule-based, 資料起始日：2018-08-01, 20 個推薦, Recall： 0.205
------------------------------------------------------------
User-based 協同過濾, 30 個推薦, Recall： 0.0
混合 Rule-based, 資料起始日：2017-09-01, 30 個推薦, Recall： 0.124
混合 Rule-based, 資料起始日：2018-03-01, 30 個推薦, Recall： 0.168
混合 Rule-based, 資料起始日：2018-06-01, 30 個推薦, Recall： 0.231
混合 Rule-based, 資料起始日：2018-08-01, 30 個推薦, Recall： 0.258
------------------------------------------------------------


In [17]:
# item-based
for k in ks:
    ratings_by_user = recsur.item_based_recommender(users, k=k)
    print(f'Item-based 協同過濾, {k} 個推薦, Recall： {round(recsur.evaluate(ratings_testings_by_user, ratings_by_user), 3)}')
    for start_date in start_dates:
        ratings_by_user =  recsur.item_based_recommender(users, k=k, rule_base=True, start_date=start_date)
        print(f'混合 Rule-based, 資料起始日：{start_date}, {k} 個推薦, Recall： {round(recsur.evaluate(ratings_testings_by_user, ratings_by_user), 3)}')
    print("-"*60)

Item-based 協同過濾, 10 個推薦, Recall： 0.002
混合 Rule-based, 資料起始日：2017-09-01, 10 個推薦, Recall： 0.098
混合 Rule-based, 資料起始日：2018-03-01, 10 個推薦, Recall： 0.1
混合 Rule-based, 資料起始日：2018-06-01, 10 個推薦, Recall： 0.112
混合 Rule-based, 資料起始日：2018-08-01, 10 個推薦, Recall： 0.139
------------------------------------------------------------
Item-based 協同過濾, 20 個推薦, Recall： 0.002
混合 Rule-based, 資料起始日：2017-09-01, 20 個推薦, Recall： 0.105
混合 Rule-based, 資料起始日：2018-03-01, 20 個推薦, Recall： 0.129
混合 Rule-based, 資料起始日：2018-06-01, 20 個推薦, Recall： 0.185
混合 Rule-based, 資料起始日：2018-08-01, 20 個推薦, Recall： 0.205
------------------------------------------------------------
Item-based 協同過濾, 30 個推薦, Recall： 0.002
混合 Rule-based, 資料起始日：2017-09-01, 30 個推薦, Recall： 0.122
混合 Rule-based, 資料起始日：2018-03-01, 30 個推薦, Recall： 0.161
混合 Rule-based, 資料起始日：2018-06-01, 30 個推薦, Recall： 0.239
混合 Rule-based, 資料起始日：2018-08-01, 30 個推薦, Recall： 0.237
------------------------------------------------------------


In [15]:
# surprise item-based
algo_impl = recsur.sur_algo()
for k in ks:
    ratings_by_user = recsur.sur_recommender(algo_impl, users, k=k)
    print(f'Surprise Item-based 協同過濾, {k} 個推薦, Recall： {round(recsur.evaluate(ratings_testings_by_user, ratings_by_user), 3)}')
    for start_date in start_dates:
        ratings_by_user =  recsur.sur_recommender(algo_impl, users, k=k, rule_base=True, start_date=start_date)
        print(f'混合 Rule-based, 資料起始日：{start_date}, {k} 個推薦, Recall： {round(recsur.evaluate(ratings_testings_by_user, ratings_by_user), 3)}')
    print("-"*60)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Surprise Item-based 協同過濾, 10 個推薦, Recall： 0.002
混合 Rule-based, 資料起始日：2017-09-01, 10 個推薦, Recall： 0.098
混合 Rule-based, 資料起始日：2018-03-01, 10 個推薦, Recall： 0.1
混合 Rule-based, 資料起始日：2018-06-01, 10 個推薦, Recall： 0.112
混合 Rule-based, 資料起始日：2018-08-01, 10 個推薦, Recall： 0.139
------------------------------------------------------------
Surprise Item-based 協同過濾, 20 個推薦, Recall： 0.002
混合 Rule-based, 資料起始日：2017-09-01, 20 個推薦, Recall： 0.105
混合 Rule-based, 資料起始日：2018-03-01, 20 個推薦, Recall： 0.132
混合 Rule-based, 資料起始日：2018-06-01, 20 個推薦, Recall： 0.186
混合 Rule-based, 資料起始日：2018-08-01, 20 個推薦, Recall： 0.207
------------------------------------------------------------
Surprise Item-based 協同過濾, 30 個推薦, Recall： 0.003
混合 Rule-based, 資料起始日：2017-09-01, 30 個推薦, Recall： 0.125
混合 Rule-based, 資料起始日：2018-03-01, 30 個推薦, Recall： 0.168
混合 Rule-based, 資料起始日：2018-06-01, 30 個推薦, Recall： 0.244
混合 Rule-based, 資料起始日：2018-08-01, 30 個推薦, Recall： 0.242
