In [1]:
import numpy as np
import pandas as pd

In [2]:
!curl -O "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
!unzip -o "ml-1m.zip"

# read-in movie names
movies = pd.read_table("ml-1m/movies.dat", sep="::", engine="python", header=None, encoding='latin-1')
movies.columns = ['movie_id', 'title', 'genre']

# read-in our ratings data-set and merge with it movie names
R = pd.read_table("ml-1m/ratings.dat", sep="::", engine="python", header=None, encoding='latin-1')
R.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
R = pd.merge(movies, R, on='movie_id')

# pivot to be tabular; n=6000+ users by m=3700+ movies
R = pd.pivot_table(R, index="user_id", columns="title", values="rating")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 5778k  100 5778k    0     0  6782k      0 --:--:-- --:--:-- --:--:-- 6782k
Archive:  ml-1m.zip
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [3]:
# Display the pivoted user-by-title ratings table; unrated cells are NaN.
R

title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kjærlighetens kjøtere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,3.0,,,,,2.0,4.0,,,...,,3.0,,,,,,,,2.0
6037,,,,,,,,,,4.0,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,3.0,,,,,,,,


In [10]:
from numpy.linalg import solve

def als(R, k, lambda_x=0.4, lambda_y=0.4, epochs=30, seed=None):
    """Factorise a ratings matrix with Alternating Least Squares (ALS).

    Finds ``X`` (n x k) and ``Y`` (m x k) such that ``X @ Y.T`` approximates
    ``R``. Each epoch solves a ridge-regularised least-squares problem for
    ``X`` with ``Y`` held fixed, then for ``Y`` with the new ``X`` held fixed.

    Missing entries (NaN) are replaced by 0 before fitting, so unrated cells
    are fitted as if they were observed zeros; they are not masked out.

    Parameters
    ----------
    R : array-like of shape (n, m)
        Ratings, one row per user and one column per item. Anything accepted
        by ``pd.DataFrame`` and castable to float.
    k : int
        Number of latent factors.
    lambda_x, lambda_y : float
        Ridge penalties added to the diagonal of the normal equations for the
        user and the item factors respectively.
    epochs : int
        Number of alternating update rounds.
    seed : int or None
        Seed for ``np.random.RandomState``; pass an int for repeatable factors.

    Returns
    -------
    X : ndarray of shape (n, k)
        User factors.
    Y : ndarray of shape (m, k)
        Item factors.

    Notes
    -----
    After every epoch the Frobenius norm of ``R - X @ Y.T`` is printed as
    "Error". It is computed over all cells (including the zero-filled ones)
    and is a root of the *sum* of squares, not a root-mean-square error.
    """
    import warnings

    ratings = pd.DataFrame(R).astype(float).fillna(0).to_numpy()
    n, m = ratings.shape

    # X is drawn before Y so a given seed yields the same starting point.
    rng = np.random.RandomState(seed)
    X = rng.uniform(size=(n, k))
    Y = rng.uniform(size=(m, k))
    identity = np.eye(k)

    for epoch in range(epochs):
        try:
            # Every user shares the same k x k system matrix, so all users are
            # solved in one call with one right-hand-side column per user.
            user_system = Y.T @ Y + lambda_x * identity
            X[...] = np.linalg.solve(user_system, (ratings @ Y).T).T

            # Same for the items, using the freshly updated user factors.
            item_system = X.T @ X + lambda_y * identity
            Y[...] = np.linalg.solve(item_system, (ratings.T @ X).T).T

        except np.linalg.LinAlgError:
            # A singular system leaves the failing factor untouched, so every
            # later epoch would see the same inputs and fail the same way.
            # Stop here and say so instead of skipping epochs silently.
            warnings.warn(
                "als: singular normal equations at epoch {}; stopping early. "
                "Use positive lambda_x / lambda_y to keep the systems "
                "invertible.".format(epoch + 1),
                RuntimeWarning,
            )
            break

        residual_norm = np.sqrt(np.sum(np.square(ratings - X.dot(Y.T))))
        print("Iteration {} / {} - Error: {:,}".format(epoch+1, epochs, residual_norm))
    return X, Y

In [11]:
# Fit 30 latent factors for 20 epochs. `als` starts from random factors, so a
# fixed seed is passed to make the factors and the logged errors repeatable
# when the notebook is re-run from a fresh kernel.
X, Y = als(R, k=30, epochs=20, seed=42)

Iteration 1 / 20 - Error: 3,057.6370254947133
Iteration 2 / 20 - Error: 2,805.9211248187803
Iteration 3 / 20 - Error: 2,774.476853088114
Iteration 4 / 20 - Error: 2,765.3149647430346
Iteration 5 / 20 - Error: 2,761.8792552968716
Iteration 6 / 20 - Error: 2,760.393021572816
Iteration 7 / 20 - Error: 2,759.627024811279
Iteration 8 / 20 - Error: 2,759.161414006881
Iteration 9 / 20 - Error: 2,758.8417077777844
Iteration 10 / 20 - Error: 2,758.602685795284
Iteration 11 / 20 - Error: 2,758.412811613975
Iteration 12 / 20 - Error: 2,758.2554172558303
Iteration 13 / 20 - Error: 2,758.1214147892083
Iteration 14 / 20 - Error: 2,758.00589979055
Iteration 15 / 20 - Error: 2,757.9062340499136
Iteration 16 / 20 - Error: 2,757.820844678424
Iteration 17 / 20 - Error: 2,757.748501050274
Iteration 18 / 20 - Error: 2,757.687957640642
Iteration 19 / 20 - Error: 2,757.6378530511315
Iteration 20 / 20 - Error: 2,757.5967509839948


In [13]:
import numpy as np
import pandas as pd

def pairwise_cosine(ndarray):
    """Cosine similarity between every pair of columns of a 2-D array.

    Adapted from
    https://stackoverflow.com/questions/41905029/create-cosine-similarity-matrix-numpy

    Parameters
    ----------
    ndarray : array-like of shape (d, n)
        ``n`` vectors of dimension ``d``, stored one per column.

    Returns
    -------
    ndarray of shape (n, n)
        Entry ``[i, j]`` is the cosine of the angle between columns ``i`` and
        ``j``. Any pair involving an all-zero column gets similarity 0 (its
        direction is undefined) rather than NaN.
    """
    vectors = np.asarray(ndarray)
    dot_products = np.dot(vectors.T, vectors)
    norms = np.sqrt((vectors * vectors).sum(axis=0))

    # A zero-norm column has zero dot product with everything, so dividing by
    # 1 instead of 0 gives 0 for it without a 0/0 NaN or a RuntimeWarning.
    safe_norms = np.where(norms == 0, 1.0, norms)
    return dot_products / safe_norms[np.newaxis, :] / safe_norms[:, np.newaxis]

# Item-to-item similarity table, labelled by movie title on both axes.
# `pairwise_cosine` compares columns, so the item factors are transposed to
# put one movie per column.
sim = pd.DataFrame(
    pairwise_cosine(Y.T),
    index=R.columns,
    columns=R.columns,
)


In [14]:
def get_similar_movies(movie_title, n_similar_titles=10):
    """Display the titles most similar to ``movie_title``.

    Looks up the movie's column in the module-level ``sim`` table, leaves out
    the movie itself, and renders the ``n_similar_titles`` highest-scoring
    titles as a one-column DataFrame. Nothing is returned.
    """
    scores = sim[movie_title]
    closest = scores.drop(movie_title).nlargest(n_similar_titles)
    display(closest.to_frame())

In [15]:
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

# Input controls: which movie to look up and how many similar titles to list.
movies_widget = widgets.Dropdown(description="Movie:", options=list(R.columns))
n_similar_widget = widgets.BoundedIntText(description="# Similar:", min=1, value=10)

# Bind the controls to `get_similar_movies`. The resulting widget is the
# cell's last expression, so the notebook renders it as the cell output.
interactive(
    get_similar_movies,
    movie_title=movies_widget,
    n_similar_titles=n_similar_widget,
)

interactive(children=(Dropdown(description='Movie:', options=('$1,000,000 Duck (1971)', "'Night Mother (1986)"…