## Chapter 5
### 5-1 推薦系統簡介-評測方法
### 5-2 協同過濾推薦法
### 5-3 內容過濾推薦法
### 5-4 電影推薦作業(LFW)

### 5-1 RSME and MAE

In [7]:
import math
import numpy as np

def RSME(target, pred):
    total = 0
    for t, p in zip(target, pred):
        total += math.pow(t-p, 2)
    rsme = total / len(target)
    rsme = math.sqrt(rsme)
    return rsme

def MAE(target, pred):
    total = 0
    for t, p in zip(target, pred):
        total += abs(t-p)
    mae = total / len(target)
    return mae

In [8]:
target = np.random.randint(0, 11, 100) # 原始分數
print("target:{}".format(target))
pred = np.random.randint(0, 11, 100) # 預測分數
print("pred:{}".format(pred))
print("-----------------------------------------")

rsme = RSME(target, pred)
print("RSME:{}".format(rsme))
mae = MAE(target, pred)
print("MAE:{}".format(mae))

target:[ 9  6  7  6  0  2 10  2  4  8  7  9  8  8  5  5  8  1  0  2  0  9  5  8
  2  0  6  1  9  8  7 10 10 10  9  3  6  4  6  0  7  1  8  3  6  2  5  5
  7  2  7  1  3  5  1  6  2 10  8  9  5  2  1  0 10  7  8  3  0  5 10  4
  5  8  6  5 10  1  8  5  0  3  5 10  7 10  2  2  0  9  7  6  8  3  3  0
  6  8  6  0]
pred:[ 6  5  4  1 10  2  4  2  4  5  7  7  3 10  2 10  5  2  3  2  8  5  5  2
  3  1  1  0  5  6  5  4  1  4  2  1  8  2  0  3  3  7 10  1  6  8  7  7
  4  0  4  0  2  3  9  0  4  8  0  7  4  4  1  7  8  3  8 10  9  0  9  3
  2  9  4  3  0  7  2  1  7  1  3  6 10  6  1  6  0  4  3  2  8  8  3 10
  3  1  3  6]
-----------------------------------------
RSME:4.29767378938886
MAE:3.43


In [28]:
import random

def Top_N_Score(target, pred):
    intersection = []
    for p in pred:
        if p in target:
            intersection.append(p)
    precision = len(intersection) / len(pred)
    recall = len(intersection) / len(target)
    try: 
        f1 = 2 * (precision * recall) / (precision + recall)
        return precision, recall, f1
    except ZeroDivisionError:
        return precision, recall , 0

In [32]:
product_number = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
target = random.sample(product_number, 5) # 原始分數
print("target:{}".format(target))
pred = random.sample(product_number, 5) # 預測分數
print("pred:{}".format(pred))
print("-----------------------------------------")

precision, recall, f1 = Top_N_Score(target, pred)
print("Precision:{}".format(precision))
print("Recall:{}".format(recall))
print("F1 score:{}".format(f1))

target:[15, 11, 3, 5, 9]
pred:[10, 4, 15, 8, 6]
-----------------------------------------
Precision:0.2
Recall:0.2
F1 score:0.20000000000000004


### 5-2 協同過濾推薦法

<p>dataset: <a href="https://grouplens.org/datasets/movielens/20m/">MovieLens 20M Dataset</a></p>

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
from sklearn.model_selection import train_test_split

movies = pd.read_csv('./ml-20m/movies.csv', sep=',', encoding='latin-1', usecols=['movieId', 'title', 'genres'])
ratings = pd.read_csv('./ml-20m/ratings.csv', sep=',', encoding='latin-1', usecols=['userId', 'movieId', 'rating'])

# Fill NaN values in user_id and movie_id column with 0
ratings['userId'] = ratings['userId'].fillna(0)
ratings['movieId'] = ratings['movieId'].fillna(0)

# Replace NaN values in rating column with average of all values
ratings['rating'] = ratings['rating'].fillna(ratings['rating'].mean())

# Randomly sample 2% of the ratings dataset
small_data = ratings.sample(frac=0.02)

#### User-User Collaborative Filtering

In [2]:
train_data, test_data = train_test_split(small_data, test_size=0.2)
user_movies_df = train_data.pivot( index='userId', columns='movieId', values = "rating" ).reset_index(drop=True)
user_movies_df.fillna( 0, inplace = True )

user_sim = 1 - pairwise_distances( user_movies_df.values, metric="correlation" )
user_sim_df = pd.DataFrame( user_sim )

In [3]:
def get_user_similar_movies( user1, user2 ):
    common_movies = ratings[ratings.userId == user1].merge(
      ratings[ratings.userId == user2],
      on = "movieId",
      how = "outer" )

    return common_movies.merge( movies, on = 'movieId' )
get_user_similar_movies( 310, 247 )

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title,genres
0,310.0,17,5.0,,,Sense and Sensibility (1995),Drama|Romance
1,310.0,22,1.0,247.0,4.0,Copycat (1995),Crime|Drama|Horror|Mystery|Thriller
2,310.0,32,3.0,247.0,3.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,310.0,34,4.0,,,Babe (1995),Children|Drama
4,310.0,35,4.0,,,Carrington (1995),Drama|Romance
...,...,...,...,...,...,...,...
668,,5459,,247.0,4.0,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi
669,,5478,,247.0,3.0,Eight Legged Freaks (2002),Action|Comedy|Horror|Sci-Fi
670,,5486,,247.0,3.0,Who Is Cletis Tout? (2001),Comedy
671,,5608,,247.0,3.0,"Das Experiment (Experiment, The) (2001)",Drama|Thriller


#### Item-Item Collaborative Filtering

In [4]:
ratings = ratings[:400000]
rating_mat = ratings.pivot(index='movieId', columns='userId', values = "rating" ).reset_index(drop=True)
rating_mat.fillna( 0, inplace = True )

In [5]:
movie_sim = 1 - pairwise_distances( rating_mat.values, metric="correlation" )
movie_sim_df = pd.DataFrame( movie_sim )

In [6]:
def get_similar_movies( movieid, topN = 5 ):
    movies['similarity'] = movie_sim_df.iloc[movieid-1]
    movies.columns = ['movie_id', 'title','genres', 'similarity']
    top_n = movies.sort_values( ["similarity"], ascending = False )[1:topN]
    print( "Similar Movies to: ", )
    return top_n
get_similar_movies( 10 )

Similar Movies to: 


Unnamed: 0,movie_id,title,genres,similarity
146,148,"Awfully Big Adventure, An (1995)",Drama,0.499782
158,160,Congo (1995),Action|Adventure|Mystery|Sci-Fi,0.499418
371,375,Safe Passage (1994),Drama,0.484646
201,203,"To Wong Foo, Thanks for Everything! Julie Newm...",Comedy,0.456313


### 5-3 內容過濾推薦法

In [65]:
# TF*IDF
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#users = pd.read_csv("./ml-20m/users.csv", sep='\t', encoding='latin-1', usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])
movies = pd.read_csv('./ml-20m/movies.csv', sep=',', encoding='latin-1', usecols=['movieId', 'title', 'genres'])
# Break up the big genre string into a string array
movies['genres'] = movies['genres'].str.split('|')
# Convert genres to string value
movies['genres'] = movies['genres'].fillna("").astype('str')

In [66]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape

(27278, 189)

In [67]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim[:4, :4]

array([[1.        , 0.3170459 , 0.06282188, 0.05665841],
       [0.3170459 , 1.        , 0.        , 0.        ],
       [0.06282188, 0.        , 1.        , 0.35995626],
       [0.05665841, 0.        , 0.35995626, 1.        ]])

In [68]:
# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

# Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [69]:
genre_recommendations('Good Will Hunting (1997)').head(10)

24                            Leaving Las Vegas (1995)
27                                   Persuasion (1995)
34                                   Carrington (1995)
45                How to Make an American Quilt (1995)
48                        When Night Is Falling (1995)
73                                 Bed of Roses (1996)
82     Once Upon a Time... When We Were Colored (1995)
84                           Angels and Insects (1995)
103              Bridges of Madison County, The (1995)
129                           Frankie Starlight (1995)
Name: title, dtype: object

#### 5-4社群網路文字分析作業引導
##### 實作Latent Factor Model(LFM)
<p>dataset: <a href="https://grouplens.org/datasets/movielens/20m/">MovieLens 20M Dataset</a></p>

In [157]:
import numpy as np
import pandas as pd
from math import exp
import time
import math

class LFM:
    def __init__(self, latenClass, iters, alpha, lamda, ratio, trainData):
        self.lclass = latenClass
        self.iters = iters
        self.alpha = alpha
        self.lamba = lamda
        self.ratio = ratio
        self.trainData = trainData
        
        self.userId = list(set(self.trainData['userId'].values))
        #print(self.trainData['userId'].values)
        self.movieId = list(set(self.trainData['movieId'].values))
        itemCount = [len(self.trainData[self.trainData['movieId'] == movieId]) for movieId in self.movieId]
        self.itemLen = pd.Series(itemCount, index=self.movieId).sort_values(ascending=False)
        
    def getUserPositiveItem(self, userId):
        # 生成正樣本
        series = self.trainData[self.trainData['userId'] == userId]['movieId']
        positive = list(series.values)
        return positive
    
    def getUserNagativeItem(self, userId):
        userItemList =  list(set(self.trainData[self.trainData['userId'] == userId]['movieId']))
        negative = []
        count = self.ratio * len(userItemList)
        for key, values in self.itemLen.iteritems():
            if count == 0:
                break
            if key in userItemList:
                continue
            negative.append(key)
            count = count -  1
        return negative
    
    def initUserItem(self, userId):
        positive = self.getUserPositiveItem(userId)
        negative = self.getUserNagativeItem(userId)
        itemDict = {}
        for item in positive:
            itemDict[item] = 1
        for item in negative:
            itemDict[item] = 0
        return itemDict
    
    def initModel(self):
        
        # 初始化矩陣
        
        arrayp = np.random.rand(len(self.userId), self.lclass) # p is user-classes
        arrayq = np.random.rand(self.lclass, len(self.movieId)) # q is classes-item
        df_p = pd.DataFrame(arrayp, columns=range(0, self.lclass), index=self.userId)
        df_q = pd.DataFrame(arrayq, columns=self.movieId, index=range(0, self.lclass))
        
        userItem = []
        for userId in self.userId:            
            itemDict = self.initUserItem(userId)
            userItem.append({userId:itemDict})
        return df_p, df_q, userItem
    
    def sigmoid(self, x):
        y = 1.0 / (1 + exp(-x))
        return y
    
    def lfmPredict(self, p, q, userId, movieId):
        
        p = np.mat(p.iloc[userId-1].values)
        q = np.mat(q[movieId].values).T
        r = (p * q).sum()
        r = self.sigmoid(r)
        return r
    
    def latenFactorModel(self):
        alpha = self.alpha
        p, q, userItem = self.initModel()
        
        for step in range(0, self.iters):
            for user in userItem:
                for userId, samples in user.items():
                    for movieId, rui in samples.items():
                        eui = rui - self.lfmPredict(p, q, userId, movieId)
                        for f in range(0, self.lclass):
                            p[f][userId] += alpha * (eui * q[movieId][f] - self.lamba * p[f][userId])
                            q[movieId][f] += alpha * (eui * p[f][userId] - self.lamba * p[f][userId])
            alpha *= 0.9
        return p, q
    
    def recommend(self, userId, p, q, topK=10):
        
        predictList = [self.lfmPredict(p, q, userId, movieId) for movieId in self.movieId]
        series = pd.DataFrame({"movieId": self.movieId, "love level": predictList})
        series = series.sort_values(['love level'], ascending=False)[:topK]
        return series

In [154]:
df_ratings = pd.read_csv('./ml-20m/ratings.csv', sep=',', encoding='latin-1', usecols=['userId', 'movieId', 'rating'])
df_ratings = df_ratings[:40000]
trainData = df_ratings[['userId', 'movieId']]

In [155]:
lfm = LFM(latenClass=20, iters=2, alpha=0.02, lamda=0.01, ratio=1, trainData=trainData)
p, q = lfm.latenFactorModel()
preitem = lfm.recommend(1, p, q)
preitem

Unnamed: 0,movieId,love level
530,588,0.72213
970,1196,0.708477
284,318,0.694367
130,150,0.671326
319,356,0.670661
31,32,0.669706
295,329,0.667997
1061,1291,0.660407
330,367,0.657535
282,316,0.653276


In [156]:
list_movie = preitem['movieId'].tolist() 
movies = pd.read_csv('./ml-20m/movies.csv', sep=',', encoding='latin-1', usecols=['movieId', 'title'])
df = {'movieId':list_movie, 'title':[]}
for l in list_movie:
    df['title'].append(movies.loc[movies['movieId'] == l]['title'].values[0]) 
df = pd.DataFrame(df)
df

Unnamed: 0,movieId,title
0,588,Aladdin (1992)
1,1196,Star Wars: Episode V - The Empire Strikes Back...
2,318,"Shawshank Redemption, The (1994)"
3,150,Apollo 13 (1995)
4,356,Forrest Gump (1994)
5,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
6,329,Star Trek: Generations (1994)
7,1291,Indiana Jones and the Last Crusade (1989)
8,367,"Mask, The (1994)"
9,316,Stargate (1994)
