# JeWook

## Import

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install lightfm
!pip install scrapbook
!pip install recommenders

In [None]:
import sys
import os

import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scrapbook as sb

import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation

# Import LightFM's evaluation metrics
from lightfm.evaluation import precision_at_k as lightfm_prec_at_k
from lightfm.evaluation import recall_at_k as lightfm_recall_at_k
from lightfm.evaluation import auc_score

# Import repo's evaluation metrics
from recommenders.evaluation.python_evaluation import precision_at_k, recall_at_k

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.models.lightfm.lightfm_utils import (
    track_model_metrics, prepare_test_df, prepare_all_predictions,
    compare_metric, similar_users, similar_items)

# Report the interpreter and LightFM versions this notebook was run with.
for template, value in (
    ("System version: {}", sys.version),
    ("LightFM version: {}", lightfm.__version__),
):
    print(template.format(value))


System version: 3.7.15 (default, Oct 12 2022, 19:14:55) 
[GCC 7.5.0]
LightFM version: 1.16


In [None]:
# Folder on the mounted Drive that holds the MovieLens CSV files.
# (Named DATA_DIR rather than `dir` so the builtin dir() is not shadowed.)
DATA_DIR = "/content/drive/MyDrive/GH x RippleAI/Dataset/movielens/fastcampus-movielens"

# all.csv: one row per (user, movie) rating plus the tag_* feature columns.
# Rows without tag features are dropped.
df = pd.read_csv(os.path.join(DATA_DIR, "all.csv"))
df = df[df['tag_0'].notnull()]

movies_df = pd.read_csv(os.path.join(DATA_DIR, "movies.csv"))
ratings_df = pd.read_csv(os.path.join(DATA_DIR, "ratings.csv"))
tags_df = pd.read_csv(os.path.join(DATA_DIR, "tags.csv"))

# Attach the movie title to every rating row (it becomes the last column of df).
df = df.merge(movies_df[['movieId', 'title']], how='inner', on = 'movieId')

# Rename by column name rather than by position so a change in the CSV column
# order cannot silently mislabel the columns.
movies_df = movies_df.rename(columns={'movieId': 'itemID', 'title': 'item_label'})

print(movies_df.columns)
print(ratings_df.columns)
print(tags_df.columns)

Index(['itemID', 'item_label', 'genres'], dtype='object')
Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')
Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')


In [None]:
df.shape

(47318, 312)

In [None]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,user_total_watch,movie_total_watch,user_nth_watch,user_rating_mean,movie_rating_mean,user_rating_std,...,tag_291,tag_292,tag_293,tag_294,tag_295,tag_296,tag_297,tag_298,tag_299,title
0,1,1210,5.0,964980499,232,196,2,4.366379,4.137755,0.800048,...,0.047567,-0.124491,-0.0059,-0.006938,-0.083903,-0.005452,0.004089,-0.007629,0.081299,Star Wars: Episode VI - Return of the Jedi (1983)
1,7,1210,4.0,1106635965,152,196,45,3.230263,4.137755,1.329594,...,0.047567,-0.124491,-0.0059,-0.006938,-0.083903,-0.005452,0.004089,-0.007629,0.081299,Star Wars: Episode VI - Return of the Jedi (1983)
2,11,1210,4.0,902154266,64,196,12,3.78125,4.137755,0.933822,...,0.047567,-0.124491,-0.0059,-0.006938,-0.083903,-0.005452,0.004089,-0.007629,0.081299,Star Wars: Episode VI - Return of the Jedi (1983)
3,15,1210,5.0,1510572653,135,196,84,3.448148,4.137755,1.133404,...,0.047567,-0.124491,-0.0059,-0.006938,-0.083903,-0.005452,0.004089,-0.007629,0.081299,Star Wars: Episode VI - Return of the Jedi (1983)
4,17,1210,5.0,1305696490,105,196,28,4.209524,4.137755,0.50849,...,0.047567,-0.124491,-0.0059,-0.006938,-0.083903,-0.005452,0.004089,-0.007629,0.081299,Star Wars: Episode VI - Return of the Jedi (1983)


## Hyperparameters

In [None]:
# column position in df where the tag feature columns (tag_0, tag_1, ...) begin
TAG_COL = 11

# default number of recommendations
K = 10
# percentage of data used for testing
TEST_PERCENTAGE = 0.25
# model learning rate
LEARNING_RATE = 0.25
# no of latent factors
NO_COMPONENTS = 20
# no of epochs to fit model
NO_EPOCHS = 20
# no of threads used when evaluating the model
NO_THREADS = 32
# regularisation for both user and item features
ITEM_ALPHA = 1e-6
USER_ALPHA = 1e-6

# seed for pseudo-random number generation
SEED = 42

## PreProcessing

In [None]:
# Interaction table used by LightFM: one row per (user, item) pair.
# rename/assign build a new, independent DataFrame, so writing the `act`
# column no longer triggers pandas' SettingWithCopyWarning and `df` is untouched.
data = (
    df[['userId', 'movieId', 'rating', 'title']]
    .rename(columns={'userId': 'userID', 'movieId': 'itemID',
                     'rating': 'act', 'title': 'item_label'})
    .assign(act=1)  # to implicit, watch or not
)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,userID,itemID,act,item_label
0,1,1210,1,Star Wars: Episode VI - Return of the Jedi (1983)
1,7,1210,1,Star Wars: Episode VI - Return of the Jedi (1983)
2,11,1210,1,Star Wars: Episode VI - Return of the Jedi (1983)
3,15,1210,1,Star Wars: Episode VI - Return of the Jedi (1983)
4,17,1210,1,Star Wars: Episode VI - Return of the Jedi (1983)
...,...,...,...,...
47313,599,176419,1,Mother! (2017)
47314,567,117877,1,The Rabbi's Cat (Le chat du rabbin) (2011)
47315,594,7023,1,"Wedding Banquet, The (Xi yan) (1993)"
47316,606,6107,1,Night of the Shooting Stars (Notte di San Lore...


In [None]:
# tag_pool: names of the tag feature columns, i.e. every column of df from
# position TAG_COL up to (but excluding) the last column.
tag_pool = df.columns[TAG_COL:-1]
tag_pool

Index(['tag_0', 'tag_1', 'tag_2', 'tag_3', 'tag_4', 'tag_5', 'tag_6', 'tag_7',
       'tag_8', 'tag_9',
       ...
       'tag_290', 'tag_291', 'tag_292', 'tag_293', 'tag_294', 'tag_295',
       'tag_296', 'tag_297', 'tag_298', 'tag_299'],
      dtype='object', length=300)

In [None]:
# Register all user ids, item ids and item-feature names with the LightFM
# Dataset; the internal indices used by the matrices built later are
# constructed during this fit call.
dataset = Dataset()
dataset.fit(users = data['userID'],
            items = data['itemID'],
            item_features = tag_pool)

In [None]:
"""
Build an item features matrix out of an iterable of the form (item id, [list of feature names]) or (item id, {feature name: feature weight}).

Parameters
data (iterable of the form) – (item id, [list of feature names]) or 
(item id, {feature name: feature weight}). Item and feature ids will be translated to internal indices constructed during the fit call.

normalize (bool, optional) – If true, will ensure that feature weights sum to 1 in every row.

Returns
feature matrix – Matrix of item features.

Return type
CSR matrix (num items, num features)

"""

# `df` has one row per (user, movie) interaction, so a movie's tag vector is
# repeated once for every rating it received. Passing every row to
# build_item_features would add the same weights to the matrix once per
# interaction, making a movie's tag weights grow with its popularity.
# Each movie is therefore emitted exactly once.
item_tags = df.drop_duplicates(subset='movieId')

# to_dict('records') turns all tag rows into {feature name: weight} dicts in
# one pass instead of one df.iloc lookup per row.
item_features = dataset.build_item_features(
    zip(item_tags['movieId'], item_tags.iloc[:, TAG_COL:-1].to_dict('records'))
)

del item_tags # for memory

In [None]:
# (userID, itemID, act) triples -> sparse interaction and weight matrices.
interaction_triples = data[['userID', 'itemID', 'act']].to_numpy()
interactions, weights = dataset.build_interactions(interaction_triples)

# Seeded random split of the interactions into train and test parts.
split_rng = np.random.RandomState(SEED)
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions, test_percentage = TEST_PERCENTAGE, random_state = split_rng
)

## Training

In [None]:
# LightFM model using the 'warp' loss. Every setting comes from the
# hyperparameter cell, and the random state is seeded with SEED.
model = LightFM(loss = 'warp', no_components = NO_COMPONENTS,
                learning_rate = LEARNING_RATE,
                item_alpha = ITEM_ALPHA, 
                user_alpha = USER_ALPHA,
                random_state = np.random.RandomState(SEED)
                )

In [None]:
# Fit on the training split only, for NO_EPOCHS epochs, with the item-feature
# matrix. num_threads is not passed here, so NO_THREADS only affects evaluation.
model.fit(interactions = train_interactions,
          item_features = item_features,
          epochs = NO_EPOCHS)

<lightfm.lightfm.LightFM at 0x7fdc4183eb90>

In [None]:
# Keyword arguments shared by every evaluation call below.
eval_kwargs = dict(item_features = item_features, num_threads = NO_THREADS)

# Metrics on the interactions the model was fitted on.
train_precision = lightfm_prec_at_k(
    model, test_interactions = train_interactions, k = K, **eval_kwargs
).mean()
train_auc = auc_score(
    model, test_interactions = train_interactions, **eval_kwargs
).mean()

# Metrics on the held-out interactions; the training interactions are passed
# as well so the evaluation can account for already-known positives.
test_precision = lightfm_prec_at_k(
    model,
    test_interactions = test_interactions,
    train_interactions = train_interactions,
    k = K,
    **eval_kwargs
).mean()
test_auc = auc_score(
    model,
    test_interactions = test_interactions,
    train_interactions = train_interactions,
    **eval_kwargs
).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.37, test 0.17.
AUC: train 0.88, test 0.83.


## Recommendation

In [None]:
# Unpack the four mappings held by the Dataset: user ids, user features,
# item ids and item features (external id / name -> internal index).
uid_map, uf_map, iid_map, if_map = dataset.mapping()

# Sanity check: the interaction matrix has one row per user and one column per item.
assert (len(uid_map), len(iid_map)) == train_interactions.shape

In [None]:
len(uid_map) # real userId -> internal id (sparse matrix row)
# Number of Users

610

In [None]:
uid_map # real userID -> internal id (sparse matrix row)

In [None]:
len(iid_map) # Number of Items

1505

In [None]:
len(if_map) # Number of Movies + features (tag 300 Dim)

1805

In [None]:
# Item lookup table whose DataFrame index is LightFM's internal item index.
items = (
    pd.merge(data['itemID'], movies_df[['itemID', 'item_label']],
             how = 'inner', on = 'itemID')
    .assign(iid = lambda frame: frame['itemID'].map(iid_map.__getitem__))
    .set_index('iid')
    .drop_duplicates()
)


In [None]:
items.head()

Unnamed: 0_level_0,itemID,item_label
iid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1210,Star Wars: Episode VI - Return of the Jedi (1983)
1,2628,Star Wars: Episode I - The Phantom Menace (1999)
2,3578,Gladiator (2000)
3,101,Bottle Rocket (1996)
4,235,Ed Wood (1994)


In [None]:
def sample_recommendation(model, 
                          interactions,
                          data, 
                          items,
                          dataset,
                          real_user_ids,
                          item_features = None,
                          user_features = None,
                          top_n = 3):
  """ 
    Print each user's known positives and the model's top-ranked items.

    Parameters
      model -- fitted LightFM Instance
      interactions -- (#users, #items) sparse matrix
      data -- unused; kept so that existing calls keep working
      items -- DataFrame indexed by internal item id, with an 'item_label' column
      dataset -- lightfm.data.Dataset Instance (supplies the user id mapping)
      real_user_ids -- iterable of real user ids to recommend for
      item_features -- optional item feature matrix forwarded to model.predict.
                       Pass the matrix the model was fitted with; when omitted,
                       predict is called without item features.
      user_features -- optional user feature matrix forwarded to model.predict
      top_n -- number of recommendations printed per user (default 3)

    Returns
      None; the results are printed.
  """
  n_users, n_items = interactions.shape
  uid_map = dataset.mapping()[0]  # real user id -> internal row index

  # Loop-invariant work: convert once and reuse for every user.
  interactions_csr = interactions.tocsr()
  all_item_ids = np.arange(n_items)
  labels = items['item_label']

  for real_user_id in real_user_ids:
    user_id = uid_map[real_user_id]

    # Column indices stored in the user's row are the internal item ids.
    known_positives = labels.loc[interactions_csr[user_id].indices]

    scores = model.predict(user_id, all_item_ids,
                           item_features = item_features,
                           user_features = user_features)
    # Highest score first; keep only the requested number of items.
    top_items = labels.loc[np.argsort(-scores)[:top_n]]

    print("User %s" % real_user_id)
    print("     Known positives:")

    for label in known_positives:
      print("        %s" % label)

    print("     Recommended:")

    for label in top_items:
      print("        %s" % label)


In [None]:
# Real (dataset) user ids to print sample recommendations for.
user_ids = [609]

# The full interaction matrix is passed (not just the training split), so
# "Known positives" lists every interaction recorded for the user.
# NOTE(review): the model was fitted with item_features, but no item features
# are supplied for scoring in this call — confirm that is intended.
sample_recommendation(model = model,
                      interactions = interactions, 
                      data = data,
                      items = items,
                      dataset = dataset,
                      real_user_ids = user_ids )

User 609
     Known positives:
        Forrest Gump (1994)
        Fugitive, The (1993)
        Braveheart (1995)
        Batman (1989)
        Jurassic Park (1993)
        Dances with Wolves (1990)
        Toy Story (1995)
        Pulp Fiction (1994)
        Shawshank Redemption, The (1994)
        Twelfth Night (1996)
        Apollo 13 (1995)
        Terminator 2: Judgment Day (1991)
        Star Trek: Generations (1994)
        Crimson Tide (1995)
        While You Were Sleeping (1995)
        Net, The (1995)
        Natural Born Killers (1994)
        Firm, The (1993)
        William Shakespeare's Romeo + Juliet (1996)
        Anne Frank Remembered (1995)
     Recommended:
        Clerks (1994)
        Room with a View, A (1986)
        Mrs. Doubtfire (1993)


In [None]:
data.groupby('userID').count()

Unnamed: 0_level_0,itemID,act,item_label
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,112,112,112
2,20,20,20
3,14,14,14
4,119,119,119
5,39,39,39
...,...,...,...
606,434,434,434
607,102,102,102
608,336,336,336
609,20,20,20
