# Sample Code

## 基礎建設

In [1]:
import pandas as pd
import numpy as np
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2021-12-25 05:43:06--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2021-12-25 05:43:07 (19.1 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2021-12-25 05:43:07--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2021-12-25 05:43:08 (14.9 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [3]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [4]:
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [5]:
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


## 資料整理

In [6]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

In [7]:
split = metadata['rank'].str.split(n=1, expand = True).rename(columns = {0:'rank', 1:'cat'})
rank = split['rank'].replace(',','',regex = True)
rank = pd.to_numeric(rank)
metadata = metadata[['asin','also_buy']].join(rank)

## 資料切分

In [8]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

## 產生推薦

In [11]:
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    
    ###推薦規則一： 個人化「買過這個商品的人也購買...」推薦清單（最多推5項）
    
    #重新計算美妝類銷售排行
    items = metadata.groupby('asin').min().reset_index()
    items.fillna(items['rank'].max())
    items['rank'] = items['rank'].rank(method = 'max')

    #將商品資訊合併進評論資訊，取得依據客戶曾購買的商品，推薦商品資訊裡的「也購買」項目清單
    ratings = ratings_trainings[['asin', 'reviewerID', 'overall']].merge(items, how = 'left', on = 'asin').fillna('')
    expanded_buy = ratings['also_buy'].apply(pd.Series).fillna('')
    buy = pd.concat([ratings[['reviewerID','asin', 'overall','rank']],expanded_buy], axis = 1)
    buy = pd.melt(buy, id_vars = ['reviewerID','asin'], value_vars=list(range(expanded_buy.shape[1])), value_name = 'also_buy')
    buy.drop(columns = 'variable',inplace=True)

    #計算同一項「也購買」商品總共被推薦給客戶幾次，取得初步個人化「買過這個商品的人也購買...」推薦清單
    count_alsobuy = buy[['reviewerID','also_buy']].value_counts()
    count_alsobuy = count_alsobuy.to_frame().reset_index().rename(columns = {0:'count_alsobuy'}).replace('',np.nan).dropna(subset = ['also_buy'])

    #計算商品銷量PR值，將資訊整合進個人化「買過這個商品的人也購買...」推薦清單，並刪除不屬於美妝類的商品
    items['PR'] = (len(items['rank'])-items['rank'])/len(items['rank'])*100
    items_buy = items[['asin','PR']].rename(columns = {"asin": "also_buy"})
    count_alsobuy = count_alsobuy.merge(items_buy, how = 'left', on ='also_buy')
    count_alsobuy.dropna(subset = ['PR'], inplace = True)

    #計算商品評價，將資訊整合進個人化「買過這個商品的人也購買...」推薦清單
    rate = ratings_trainings.groupby('asin')['overall'].mean().to_frame().reset_index().rename(columns = {"asin": "also_buy"})
    count_alsobuy = count_alsobuy.merge(rate[['also_buy','overall']], how = 'left', on = 'also_buy')

    #依照 推薦次數、銷量PR、商品評價 排序「買過這個商品的人也購買...」推薦清單，每位客戶只取排序前5名商品
    count_alsobuy = count_alsobuy.sort_values(by = ['reviewerID','count_alsobuy','PR','overall'], ascending = False)
    count_alsobuy = count_alsobuy[['reviewerID','also_buy']].groupby('reviewerID').head(5)
    count_alsobuy = count_alsobuy.groupby('reviewerID').agg(list)

    
    ###推薦規則二： 商品指標分數（補滿k項）
    
    #計算商品支持度
    score = (ratings_trainings.groupby('asin')['overall'].size()/
         ratings_trainings.shape[0]).to_frame().reset_index().rename(columns = {"asin": "also_buy","overall": "support"})
    
    #整合商品銷量與評價資訊
    score = score.merge(rate[['also_buy','overall']], how = 'left', on ='also_buy')
    score = score.merge(items_buy, how = 'left', on='also_buy')
    
    #排序：支持度>銷量>評價
    score = score.sort_values(by = ['support','PR','overall'], ascending = False)
    
    #規則一若有可用推薦品，則推薦之，若無，則推薦規則二清單至k項
    for user in users:
        if user in count_alsobuy.index:
            add = k - len(count_alsobuy['also_buy'][user])
            count_alsobuy['also_buy'][user].extend(score['also_buy'][:add])
            recommendations[user] = count_alsobuy['also_buy'][user]
        else:
            recommendations[user] = score['also_buy'][:k].tolist()
    
    return recommendations


ratings_by_user = recommender(ratings_trainings, users)

## 結果評估

In [12]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.08305084745762711