<a href="https://colab.research.google.com/github/rujunz/data-course-sample/blob/main/A2_%E5%AF%A6%E4%BD%9C%E3%80%8Crule_based%E3%80%8D%E7%9A%84%E6%8E%A8%E8%96%A6%E7%B3%BB%E7%B5%B1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample Code

## 匯入資料

In [None]:
import pandas as pd
import gzip, json
from datetime import datetime, timedelta

def parse(path):
    """Yield one decoded JSON value per line of a gzip-compressed file.

    Args:
        path: Location of a gzip file containing one JSON document per line.

    Yields:
        The result of ``json.loads`` for each line, in file order.
    """
    # The context manager closes the file handle once the generator is
    # exhausted, closed, or garbage-collected, so the handle is not leaked.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)

def getDF(path):
    """Collect every record produced by ``parse(path)`` into a DataFrame.

    Each record is keyed by its 0-based position in the stream; with
    ``orient='index'`` those positions become the row labels.

    Args:
        path: File path forwarded unchanged to ``parse``.

    Returns:
        A ``pandas.DataFrame`` with one row per parsed record.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

## 載入資料

In [None]:
# Download the All_Beauty ratings CSV and the gzip-compressed product metadata.
# `wget` is given no output option, so both files land in the current working directory.
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

In [None]:
# Read both files relative to the working directory, which is where the
# `wget` commands save them. The previous absolute `/content/...` paths only
# matched the download location when the notebook ran on Colab.
metadata = getDF('meta_All_Beauty.json.gz')

# The CSV is read without a header row (`header=None`), so the column names
# are supplied explicitly.
ratings = pd.read_csv(
    'All_Beauty.csv',
    names=['asin', 'reviewerID', 'overall', 'unixReviewTime'],
    header=None,
)

## 整理資料格式

In [None]:
# Interpret `unixReviewTime` as Unix epoch seconds and expose it as datetimes.
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

# Keep the first whitespace-separated token of `rank`, strip its commas and
# convert it to a number. After the first run the column is numeric and no
# longer supports the `.str` accessor, so the guard makes this cell safe to
# re-run instead of raising.
if not pd.api.types.is_numeric_dtype(metadata['rank']):
    metadata['rank'] = pd.to_numeric(
        metadata['rank'].str.split(expand=True)[0].str.replace(',', '')
    )

## 資料切分

In [None]:
# Chronological split: everything before 2018-09-01 is training data, and
# ratings dated 2018-09-01 through 2018-09-30 (both bounds inclusive) are
# held out for testing.
ratings_trainings = ratings.loc[ratings['DATE'] < '2018-09-01']
ratings_testings = ratings.loc[ratings['DATE'].between('2018-09-01', '2018-09-30')]

# Map each test-period reviewer to the list of items they rated.
ratings_testings_by_user = (
    ratings_testings
    .groupby('reviewerID')['asin']
    .agg(list)
    .to_dict()
)

# The users to generate recommendations for: every reviewer in the test set.
users = list(ratings_testings_by_user)

## 結果評估

In [None]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    """Score recommendations by the share of held-out ratings they hit.

    Args:
        ratings_testings_by_user: Mapping of user -> list of items that user
            rated in the test period.
        ratings_by_user: Mapping of user -> list of recommended items.
        method: Unused; kept so existing calls remain valid.

    Returns:
        The number of distinct recommended items that also appear in the same
        user's test items, summed over all test users, divided by the total
        number of test items across all users. Returns 0.0 when there are no
        test items at all.
    """
    hits = 0
    n_test_ratings = 0
    for user, tested_items in ratings_testings_by_user.items():
        # The denominator is derived from the argument itself rather than
        # from a notebook-level DataFrame, so the score depends only on the
        # inputs passed in.
        n_test_ratings += len(tested_items)
        if user in ratings_by_user:
            hits += len(set(ratings_by_user[user]) & set(tested_items))

    # Guard against dividing by zero when the test set is empty.
    if n_test_ratings == 0:
        return 0.0
    return hits / n_test_ratings

# Sanity check: with no recommendations at all the score must be 0.0.
# `ratings_by_user` is only created by the recommender cells further down, so
# referencing it here would raise NameError on a fresh kernel; an empty
# mapping is passed instead.
evaluate(ratings_testings_by_user, {})

0.0

## 實作「random-based」的推薦系統

In [None]:
def recommender(training_data, users=[], k=2, seed=None):
    """Random baseline: recommend ``k`` randomly sampled items to each user.

    Args:
        training_data: DataFrame with an ``asin`` column to sample from.
        users: Iterable of user identifiers to produce recommendations for.
        k: Number of items sampled (without replacement) per user.
        seed: Optional integer seed. When given, repeated calls with the same
            arguments return the same recommendations. When ``None`` (the
            default), sampling is unseeded and results differ between calls.

    Returns:
        Dict mapping each user to a list of ``k`` sampled ``asin`` values.

    Raises:
        ValueError: Propagated from ``Series.sample`` when ``k`` exceeds the
            number of rows in ``training_data``.
    """
    if seed is None:
        rng = None
    else:
        # One shared generator whose state advances on every draw, so users
        # receive different samples while the whole run stays reproducible.
        import numpy as np
        rng = np.random.RandomState(seed)

    candidate_items = training_data['asin']
    return {
        user: candidate_items.sample(n=k, random_state=rng).tolist()
        for user in users
    }
# Run the recommender defined above for every test user and score the result.
# No seed is supplied, so the sampled items (and therefore the score) change
# from run to run.
ratings_by_user = recommender(training_data=ratings_trainings, users=users)
evaluate(
    ratings_testings_by_user=ratings_testings_by_user,
    ratings_by_user=ratings_by_user,
)

## A2: 實作「rule-based」的推薦系統

### rank top 10

In [None]:
def recommender(training_data, users=[], k=10, item_metadata=None):
    """Rule-based: recommend the ``k`` items with the lowest ``rank`` value.

    Args:
        training_data: Unused by this rule; kept so all recommenders share
            the same call signature.
        users: Iterable of user identifiers to produce recommendations for.
        k: Number of items to recommend per user.
        item_metadata: DataFrame with ``asin`` and ``rank`` columns. Defaults
            to the notebook-level ``metadata`` frame.

    Returns:
        Dict mapping each user to the same list of ``asin`` values, ordered by
        ascending ``rank`` (rows with a missing rank sort last).
    """
    if item_metadata is None:
        item_metadata = metadata

    # The ranking is the same for every user, so sort once instead of once
    # per user.
    top_items = item_metadata.sort_values(by=['rank']).head(k)['asin'].tolist()

    # Each user gets an independent copy of the list.
    return {user: list(top_items) for user in users}

# Run the recommender defined above for every test user and score the result.
ratings_by_user = recommender(training_data=ratings_trainings, users=users)
evaluate(
    ratings_testings_by_user=ratings_testings_by_user,
    ratings_by_user=ratings_by_user,
)

0.005084745762711864

### review count top 10

In [None]:
def recommender(training_data, users=[], k=10):
    """Rule-based: recommend the ``k`` most frequently rated items.

    Args:
        training_data: DataFrame with an ``asin`` column, one row per rating.
        users: Iterable of user identifiers to produce recommendations for.
        k: Number of items to recommend per user.

    Returns:
        Dict mapping each user to the same list of ``asin`` values, ordered
        from most to least rated in ``training_data``.
    """
    # Count ratings per item in the frame that was passed in (not a
    # notebook-level global), and do it once rather than once per user.
    top_items = training_data['asin'].value_counts().head(k).index.tolist()

    # Each user gets an independent copy of the list.
    return {user: list(top_items) for user in users}

# Run the recommender defined above for every test user and score the result.
ratings_by_user = recommender(training_data=ratings_trainings, users=users)
evaluate(
    ratings_testings_by_user=ratings_testings_by_user,
    ratings_by_user=ratings_by_user,
)

0.08305084745762711

### within 6 months review count top 10

In [None]:
def recommender(training_data, users=[], k=10, end_date="2018-09-01", window_days=180):
    """Rule-based: recommend the ``k`` most-rated items of a recent window.

    Only ratings whose ``DATE`` lies strictly less than ``window_days`` days
    before ``end_date`` are counted. No upper bound is applied, so rows dated
    on or after ``end_date`` are counted too if present.

    Args:
        training_data: DataFrame with ``asin`` and datetime ``DATE`` columns,
            one row per rating.
        users: Iterable of user identifiers to produce recommendations for.
        k: Number of items to recommend per user.
        end_date: Reference date the window is measured back from.
        window_days: Window length in days. NOTE(review): the original
            compared against an undefined variable ``d``; 180 days is assumed
            from the "within 6 months" section title -- confirm.

    Returns:
        Dict mapping each user to the same list of ``asin`` values, ordered
        from most to least rated within the window.
    """
    age = pd.Timestamp(end_date) - training_data['DATE']
    recent = training_data[age < pd.Timedelta(days=window_days)]

    # The counts are the same for every user, so compute them once.
    top_items = recent['asin'].value_counts().head(k).index.tolist()

    # Each user gets an independent copy of the list.
    return {user: list(top_items) for user in users}

# Run the recommender defined above for every test user and score the result.
ratings_by_user = recommender(training_data=ratings_trainings, users=users)
evaluate(
    ratings_testings_by_user=ratings_testings_by_user,
    ratings_by_user=ratings_by_user,
)

0.09661016949152543