# Movie Recommendation System with LightFM

In [None]:
!pip install lightfm

In [2]:
import pandas as pd
import numpy as np
from scipy import sparse
from lightfm import LightFM
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Mount Google Drive inside the Colab runtime at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

# Change the working directory so that the relative 'ml-latest-small/...'
# paths used by the read_csv calls in this notebook resolve.
%cd /content/drive/My Drive/Colab Notebooks/

Mounted at /content/drive/
/content/drive/My Drive/Colab Notebooks


Посмотрим на табличные данные.

In [4]:
# Load the ratings table (path is relative to the current working directory)
# and preview its first rows.
ratings = pd.read_csv('ml-latest-small/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Load the movie metadata table (path is relative to the current working
# directory) and preview its first rows.
movies = pd.read_csv('ml-latest-small/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Создадим матрицу User-Item.

In [6]:
def create_interaction_matrix(df, user_col, item_col, rating_col, norm=False, threshold=None):
    """Pivot a long ratings table into a dense user x item interaction matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with one row per (user, item) interaction.
    user_col, item_col, rating_col : str
        Names of the user-id, item-id and rating columns in ``df``.
    norm : bool, default False
        When True, binarise the matrix: 1 where the (summed) rating is
        strictly greater than ``threshold``, otherwise 0.
    threshold : number or numeric string, optional
        Cut-off used for binarisation. Required when ``norm`` is True and
        ignored otherwise.

    Returns
    -------
    pandas.DataFrame
        Indexed by user id, one column per item id. Ratings of duplicate
        (user, item) pairs are summed; pairs without a rating are 0.

    Raises
    ------
    ValueError
        If ``norm`` is True and ``threshold`` is None or is a string that
        cannot be parsed as a number.
    """
    interactions = (
        df.groupby([user_col, item_col])[rating_col]
        .sum()
        .unstack()
        .fillna(0)
    )
    if norm:
        if threshold is None:
            raise ValueError("threshold must be provided when norm=True")
        # Coerce once so that numeric strings such as '3' are accepted instead
        # of failing on the first element-wise comparison.
        cutoff = float(threshold)
        # Vectorised comparison replaces the deprecated element-wise applymap.
        interactions = (interactions > cutoff).astype('int64')
    return interactions

In [7]:
# Build the raw (non-binarised) user x movie rating matrix.
# `threshold` is only used together with norm=True, so it is inert here; it is
# passed as a number rather than the string '3' so that switching norm on later
# does not fail on a float-vs-str comparison.
interactions = create_interaction_matrix(df=ratings,
                                         user_col='userId',
                                         item_col='movieId',
                                         rating_col='rating',
                                         threshold=3)
interactions.shape

(610, 9724)

Создадим модель.

In [10]:
def runMF(interactions, n_components=30, loss='warp', k=15, epoch=30, n_jobs=4, random_state=None):
    """Fit a LightFM matrix-factorisation model on a user x item matrix.

    Parameters
    ----------
    interactions : pandas.DataFrame
        User x item matrix; its ``.values`` are converted to a CSR matrix.
    n_components : int, default 30
        Dimensionality of the latent embeddings (LightFM ``no_components``).
    loss : str, default 'warp'
        Loss name forwarded to LightFM.
    k : int, default 15
        Forwarded to LightFM as ``k``. Per the LightFM documentation this is
        the k-OS training parameter (the k-th positive example is selected
        from the sampled positives), so it presumably has no effect unless
        ``loss='warp-kos'`` -- confirm against the installed version.
    epoch : int, default 30
        Number of training epochs.
    n_jobs : int, default 4
        Number of threads used while fitting (LightFM ``num_threads``).
    random_state : int, numpy.random.RandomState or None, default None
        Seed forwarded to LightFM so that a run can be repeated. Note that
        multi-threaded fitting may still introduce run-to-run variation.

    Returns
    -------
    lightfm.LightFM
        The fitted model.
    """
    x = sparse.csr_matrix(interactions.values)
    model = LightFM(no_components=n_components, loss=loss, k=k,
                    random_state=random_state)
    model.fit(x, epochs=epoch, num_threads=n_jobs)
    return model

In [11]:
# Train the matrix-factorisation model on the user x movie matrix.
mf_model = runMF(
    interactions=interactions,
    n_components=30,
    loss='warp',
    k=15,
    epoch=30,
    n_jobs=4,
)

Создадим словари.

In [12]:
def create_user_dict(interactions):
    """Map every user id in ``interactions.index`` to its row position.

    The positions are 0-based and follow the order of the index, so they can
    be used as row numbers of the matrix built from ``interactions``.
    """
    return {user_id: row for row, user_id in enumerate(interactions.index)}

def create_item_dict(df, id_col, name_col):
    """Build an ``{item id: item name}`` lookup from two columns of ``df``.

    Rows are paired positionally; if an id occurs more than once, the name
    from the last occurrence is kept.
    """
    ids = df[id_col]
    names = df[name_col]
    return {item_id: item_name for item_id, item_name in zip(ids, names)}

def create_item_emdedding_distance_matrix(model,interactions):
    """Build an item x item cosine-similarity table from the item embeddings.

    ``model.item_embeddings`` is wrapped in a CSR matrix and passed to
    ``cosine_similarity``; the result is labelled on both axes with
    ``interactions.columns``.

    Note: despite the word "distance" in the name, the values are cosine
    similarities, so larger means more alike.
    """
    item_labels = interactions.columns
    embeddings = sparse.csr_matrix(model.item_embeddings)
    return pd.DataFrame(cosine_similarity(embeddings),
                        index=item_labels,
                        columns=item_labels)

In [13]:
# Lookup tables used by the recommendation helpers.
user_dict = create_user_dict(interactions=interactions)

movies_dict = create_item_dict(
    df=movies,
    id_col='movieId',
    name_col='title',
)

# Item x item cosine-similarity table derived from the fitted model.
item_item_dist = create_item_emdedding_distance_matrix(
    model=mf_model,
    interactions=interactions,
)

Функции:

In [14]:
def sample_recommendation_user(model, interactions, user_id, user_dict,
                               item_dict, threshold=0, nrec_items=10, show=True):
    """Recommend items the user has not interacted with yet.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(user_index, item_indices)``.
    interactions : pandas.DataFrame
        User x item matrix (index: user ids, columns: item ids).
    user_id : hashable
        Id of the user, as found in ``interactions.index``.
    user_dict : dict
        Maps user id -> row position passed to ``model.predict``.
    item_dict : dict
        Maps item id -> display name; only consulted when ``show`` is truthy.
    threshold : number, default 0
        An item counts as "known" when the user's value for it in
        ``interactions`` is strictly greater than this.
    nrec_items : int, default 10
        Maximum number of recommendations to return.
    show : bool, default True
        Print up to ten known items and the recommendations.

    Returns
    -------
    list
        Recommended item ids, best first, excluding the user's known items.

    Raises
    ------
    KeyError
        If ``user_id`` is missing from ``user_dict`` / ``interactions``, or a
        printed item id is missing from ``item_dict``.
    """
    n_items = interactions.shape[1]
    user_row = user_dict[user_id]

    # Score every item for this user and rank item ids best-first.
    predicted = pd.Series(model.predict(user_row, np.arange(n_items)),
                          index=interactions.columns)
    ranked_items = predicted.sort_values(ascending=False).index.tolist()

    # Items the user already rated above the threshold, highest id first.
    user_ratings = interactions.loc[user_id, :]
    known_items = sorted(user_ratings[user_ratings > threshold].index, reverse=True)

    # Set membership keeps the filter linear in the number of items.
    known_lookup = set(known_items)
    return_score_list = [item for item in ranked_items
                         if item not in known_lookup][:nrec_items]

    if show:
        print("Known Likes:")
        for rank, item in enumerate(known_items[:10], start=1):
            print(str(rank) + '- ' + item_dict[item])

        print("\n Recommended Items:")
        for rank, item in enumerate(return_score_list, start=1):
            print(str(rank) + '- ' + item_dict[item])
    return return_score_list

def sample_recommendation_item(model, interactions, item_id, user_dict, item_dict, number_of_user):
    """Return the users with the highest predicted score for one item.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(user_indices, item_indices)``.
    interactions : pandas.DataFrame
        User x item matrix (index: user ids, columns: item ids). Column
        labels are assumed to be unique.
    item_id : hashable
        Id of the item, as found in ``interactions.columns``.
    user_dict, item_dict : dict
        Unused; kept so existing callers keep working.
    number_of_user : int
        How many user ids to return.

    Returns
    -------
    list
        User ids ordered by descending predicted score.

    Raises
    ------
    KeyError
        If ``item_id`` is not a column of ``interactions``.
    """
    n_users = interactions.shape[0]
    # Exact label lookup: an unknown item raises KeyError instead of silently
    # resolving to a neighbouring column, and the columns need not be sorted.
    item_idx = interactions.columns.get_loc(item_id)
    scores = pd.Series(model.predict(np.arange(n_users), np.repeat(item_idx, n_users)))
    top_rows = scores.sort_values(ascending=False).head(number_of_user).index
    return list(interactions.index[top_rows])

def item_item_recommendation(item_emdedding_distance_matrix, item_id,
                             item_dict, n_items=10, show=True):
    """Return the items most similar to ``item_id``.

    Parameters
    ----------
    item_emdedding_distance_matrix : pandas.DataFrame
        Item x item table of similarity scores (larger = more similar),
        labelled with item ids on both axes.
    item_id : hashable
        Id of the query item.
    item_dict : dict
        Maps item id -> display name; only consulted when ``show`` is truthy.
    n_items : int, default 10
        Number of similar items to return.
    show : bool, default True
        Print the query item and the similar items.

    Returns
    -------
    list
        Ids of the most similar items, best first, never including
        ``item_id`` itself.

    Raises
    ------
    KeyError
        If ``item_id`` is not in the matrix index, or a printed id is missing
        from ``item_dict``.
    """
    similarities = item_emdedding_distance_matrix.loc[item_id, :]
    # Remove the query item explicitly instead of assuming it is ranked first:
    # with tied scores or rounding error another item may sort ahead of it.
    recommended_items = (
        similarities.drop(labels=item_id, errors='ignore')
        .sort_values(ascending=False)
        .head(n_items)
        .index.tolist()
    )
    if show:
        print("Item of interest :{0}".format(item_dict[item_id]))
        print("Item similar to the above item:")
        for rank, item in enumerate(recommended_items, start=1):
            print(str(rank) + '- ' + item_dict[item])
    return recommended_items

Предсказания:

In [15]:
# Top-10 recommendations for user 1; ratings above 4 count as known likes.
rec_list = sample_recommendation_user(
    model=mf_model,
    interactions=interactions,
    user_id=1,
    user_dict=user_dict,
    item_dict=movies_dict,
    threshold=4,
    nrec_items=10,
)

Known Likes:
1- M*A*S*H (a.k.a. MASH) (1970)
2- X-Men (2000)
3- Shaft (1971)
4- Road Warrior, The (Mad Max 2) (1981)
5- Mad Max (1979)
6- Blazing Saddles (1974)
7- Gladiator (2000)
8- Grumpy Old Men (1993)
9- Good Morning, Vietnam (1987)
10- Red Dawn (1984)

 Recommended Items:
1- Godfather, The (1972)
2- Jurassic Park (1993)
3- Men in Black (a.k.a. MIB) (1997)
4- Ferris Bueller's Day Off (1986)
5- Independence Day (a.k.a. ID4) (1996)
6- Die Hard (1988)
7- Airplane! (1980)
8- Terminator 2: Judgment Day (1991)
9- Alien (1979)
10- RoboCop (1987)


In [16]:
# The 15 users with the highest predicted score for movie 1.
sample_recommendation_item(
    model=mf_model,
    interactions=interactions,
    item_id=1,
    user_dict=user_dict,
    item_dict=movies_dict,
    number_of_user=15,
)

[557, 399, 81, 173, 379, 126, 347, 423, 569, 507, 189, 8, 194, 77, 37]

In [17]:
# The 10 movies most similar to movie 1 according to the item embeddings.
rec_list = item_item_recommendation(
    item_emdedding_distance_matrix=item_item_dist,
    item_id=1,
    item_dict=movies_dict,
    n_items=10,
)

Item of interest :Toy Story (1995)
Item similar to the above item:
1- Forrest Gump (1994)
2- Jurassic Park (1993)
3- Home Alone (1990)
4- Groundhog Day (1993)
5- Independence Day (a.k.a. ID4) (1996)
6- Lion King, The (1994)
7- Mrs. Doubtfire (1993)
8- Silence of the Lambs, The (1991)
9- Star Wars: Episode IV - A New Hope (1977)
10- Ghost (1990)
