In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from surprise import Dataset, Reader, KNNBasic, SVD
from surprise.model_selection import cross_validate, train_test_split as surprise_train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
data = pd.read_csv('resources/netflix/Netflix_User_Ratings.csv', usecols=[0, 1, 3])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100480507 entries, 0 to 100480506
Data columns (total 3 columns):
 #   Column   Dtype
---  ------   -----
 0   CustId   int64
 1   Rating   int64
 2   MovieId  int64
dtypes: int64(3)
memory usage: 2.2 GB


In [3]:
data

Unnamed: 0,CustId,Rating,MovieId
0,1488844,3,1
1,822109,5,1
2,885013,4,1
3,30878,4,1
4,823519,3,1
...,...,...,...
100480502,1790158,4,17770
100480503,1608708,3,17770
100480504,234275,1,17770
100480505,255278,4,17770


In [4]:
# count the number of ratings per user
data.CustId.value_counts()

CustId
305344     17653
387418     17436
2439493    16565
1664010    15813
2118461    14831
           ...  
1839823        1
2400165        1
2404631        1
454275         1
1808649        1
Name: count, Length: 480189, dtype: int64

In [5]:
# filter out rows with less than 2000 ratings per user
nx = data[data.CustId.isin(data.CustId.value_counts()[data.CustId.value_counts() >= 2_000].index)].copy()
nx.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3311344 entries, 0 to 100480502
Data columns (total 3 columns):
 #   Column   Dtype
---  ------   -----
 0   CustId   int64
 1   Rating   int64
 2   MovieId  int64
dtypes: int64(3)
memory usage: 101.1 MB


In [6]:
nx.CustId.value_counts()

CustId
305344     17653
387418     17436
2439493    16565
1664010    15813
2118461    14831
           ...  
977932      2002
1441086     2002
840763      2001
2045846     2000
79380       2000
Name: count, Length: 1214, dtype: int64

In [7]:
# sort on CustId
nx.sort_values('CustId', inplace=True)
nx.head()

Unnamed: 0,CustId,Rating,MovieId
52762701,1333,4,9608
1499499,1333,3,312
77896920,1333,4,14173
18093543,1333,3,3439
8789496,1333,2,1754


In [8]:
# Create a dictionary to map unique customer IDs to new IDs
id_mapping = {old_id: new_id for new_id, old_id in enumerate(nx['CustId'].unique(), start=1)}

In [9]:
# Map the customer IDs using the dictionary
nx['CustId'] = nx['CustId'].map(id_mapping)
nx.head()

Unnamed: 0,CustId,Rating,MovieId
52762701,1,4,9608
1499499,1,3,312
77896920,1,4,14173
18093543,1,3,3439
8789496,1,2,1754


In [10]:
print(f'Number of users: {nx.CustId.nunique()} | Number of movies: {nx.MovieId.nunique()}')

Number of users: 1214 | Number of movies: 17768


In [11]:
nx.CustId.value_counts()

CustId
135     17653
183     17436
1107    16565
781     15813
971     14831
        ...  
458      2002
422      2002
392      2001
946      2000
38       2000
Name: count, Length: 1214, dtype: int64

In [12]:
nx.MovieId.value_counts()

MovieId
14691    1186
4306     1183
12918    1182
2862     1180
14410    1178
         ... 
16875       1
12959       1
16013       1
2537        1
15480       1
Name: count, Length: 17768, dtype: int64

In [13]:
movies = pd.read_csv('resources/netflix/movies.csv')
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MovieId      17770 non-null  int64  
 1   ReleaseYear  17763 non-null  float64
 2   MovieTitle   17770 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 416.6+ KB


In [14]:
# figure out which MovieIds are not in movies dataframe
movies[~movies.MovieId.isin(nx.MovieId)]

Unnamed: 0,MovieId,ReleaseYear,MovieTitle
11147,11148,1996.0,The Land Before Time IV: Journey Through the M...
13754,13755,2005.0,Mobsters and Mormons


In [15]:
# filter out rows with MovieIds that are not in movies dataframe
movies = movies[movies.MovieId.isin(nx.MovieId)]
movies

Unnamed: 0,MovieId,ReleaseYear,MovieTitle
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [16]:
# save movies dataframe to csv
movies.to_csv('resources/netflix/adjusted_movies.csv', index=False)
nx.to_csv('resources/netflix/adjusted_ratings.csv', index=False)

In [36]:
# map MovieIds to new indices
# id_mapping = {old_id: new_id for new_id, old_id in enumerate(movies.MovieId.unique(), start=1)}
# nx.MovieId = nx.MovieId.map(id_mapping)
# movies.MovieId = movies.index + 1
# nx.MovieId.value_counts()

MovieId
14689    1186
4306     1183
12917    1182
2862     1180
14408    1178
         ... 
12958       1
16873       1
16011       1
15478       1
2537        1
Name: count, Length: 17768, dtype: int64

In [17]:
X_train, X_test, y_train, y_test = train_test_split(nx, nx.Rating, test_size=0.25, stratify=nx.Rating, random_state=42)

In [18]:
X_train.CustId.value_counts()

CustId
135     13340
183     13059
1107    12492
781     11928
971     11141
        ...  
458      1475
670      1475
43       1473
9        1465
797      1433
Name: count, Length: 1214, dtype: int64

In [19]:
# pivot ratings to user-item matrix
ratings = X_train.pivot(index='CustId', columns='MovieId', values='Rating')
mean_ratings = ratings.mean(axis=1)
print(ratings.shape)
ratings.head()

(1214, 17766)


MovieId,1,2,3,4,5,6,7,8,9,10,...,17761,17762,17763,17764,17765,17766,17767,17768,17769,17770
CustId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,,,,,3.0,,,...,,3.0,,4.0,,,4.0,,,
2,,,,,4.0,,,1.0,,,...,,,,5.0,,3.0,,2.0,,
3,,,,,,,,,,,...,5.0,,,,,,,,,
4,,,,,,,,,,,...,,5.0,,5.0,,,,,,
5,,,,,,,,,,,...,,,,4.0,,,,,,3.0


In [23]:
# compute all recommendations for all movies given a user and model
def get_all_recommendations(user_id: int, model: NearestNeighbors, use_means: bool = True) -> pd.Series:
    distances, knn = model.kneighbors(ratings.fillna(0))  # knn are the nearest neighbors of the user
    knn = pd.DataFrame(knn + 1, index=ratings.index)  # don't forget to increase the nearest neighbor indices by 1
    sim = pd.DataFrame(1 - distances, index=ratings.index)  # 1 - distance because we want to invert the distance
    neighbors = knn.loc[user_id, 1:]
    similarities = sim.loc[user_id, 1:]
    # select from ratings only the columns of the neighbors, then set that index to the index of the similarities
    similarities.index = ratings.loc[neighbors].index
    
    if use_means:
        return pd.Series(
            mean_ratings.loc[user_id] + ratings.loc[neighbors].subtract(mean_ratings.loc[neighbors], axis='index').mul(
                similarities, axis='index').sum(axis='index') / similarities.sum(), name='recommendation')
    else:
        return pd.Series(ratings.loc[neighbors].mul(similarities, axis='index').sum(axis='index') / similarities.sum(),
                         name='recommendation')

In [21]:
# compute a single recommendation for a given user, movie and model
def get_recommendation(user_id: int, movie_id: int, model: NearestNeighbors, use_means: bool = True) -> float:
    if movie_id not in ratings.columns:
        return 2.5
    recommendations = get_all_recommendations(user_id, model, use_means=use_means)
    return recommendations.loc[movie_id]

In [22]:
model = NearestNeighbors(n_neighbors=40, metric='cosine')
model.fit(ratings.fillna(0))

In [24]:
def get_RMSE(test, model, use_means=True):
    group = test[['MovieId', 'Rating']].groupby(test.CustId)
    mse = pd.DataFrame(columns=['MovieId', 'Rating', 'recommendation'])
    i = 0
    for key in group.groups:
        predictions = get_all_recommendations(key, model=model, use_means=use_means)
        rated_movies = group.get_group(key).set_index('MovieId')
        df = rated_movies.join(predictions).dropna().reset_index()
        mse = pd.concat([mse, df]).reset_index(drop=True)
        if i % 100 == 0:
            score = np.sqrt(mean_squared_error(mse.Rating, mse.recommendation))
            print(f'{i}/{test.CustId.nunique()} - RMSE: {score:.4f}')
        i += 1

    score = np.sqrt(mean_squared_error(mse.Rating, mse.recommendation))
    print(f'{test.CustId.nunique()}/{test.CustId.nunique()} - RMSE: {score:.4f}')

In [78]:
def get_top_10_recommendations(user_id: int, model: NearestNeighbors, use_means: bool = True) -> pd.DataFrame:
    recommendations = get_all_recommendations(user_id, model, use_means=use_means)
    return movies[movies.MovieId.isin(recommendations.sort_values(ascending=False).head(10).index)]

In [84]:
def get_top_rated_movies(user_id: int) -> pd.DataFrame:
    return movies[movies.MovieId.isin(nx[(nx.CustId == user_id) & (nx.Rating == 5)].MovieId)]

In [25]:
get_RMSE(X_test, model)

  mse = pd.concat([mse, df]).reset_index(drop=True)


0/1214 - RMSE: 0.8840
100/1214 - RMSE: 0.9466
200/1214 - RMSE: 0.9616
300/1214 - RMSE: 0.9664
400/1214 - RMSE: 0.9544
500/1214 - RMSE: 0.9495
600/1214 - RMSE: 0.9475
700/1214 - RMSE: 0.9452
800/1214 - RMSE: 0.9388
900/1214 - RMSE: 0.9389
1000/1214 - RMSE: 0.9372
1100/1214 - RMSE: 0.9395
1200/1214 - RMSE: 0.9379
1214/1214 - RMSE: 0.9383


In [27]:
X_test

Unnamed: 0,CustId,Rating,MovieId
8255023,164,1,1648
36104959,734,4,6418
4052309,378,4,785
10982218,609,4,2135
58327686,64,4,10721
...,...,...,...
47282620,847,4,8524
32377852,515,3,5894
96603303,568,4,17136
77664880,562,3,14132


In [76]:
rec = get_all_recommendations(164, model)
rec

MovieId
1        2.470061
2        2.405583
3        2.439388
4        2.339170
5        2.374992
           ...   
17766    2.473624
17767    2.405583
17768    2.205812
17769    2.310033
17770    2.320588
Name: recommendation, Length: 17766, dtype: float64

In [85]:
get_top_rated_movies(164)

Unnamed: 0,MovieId,ReleaseYear,MovieTitle
570,571,1999.0,American Beauty
797,798,1975.0,Jaws
899,900,1994.0,Eat Drink Man Woman
1494,1495,2001.0,Alias: Season 1
1999,2000,1994.0,Four Weddings and a Funeral
2451,2452,2001.0,Lord of the Rings: The Fellowship of the Ring
2659,2660,1989.0,When Harry Met Sally
2781,2782,1995.0,Braveheart
2802,2803,1995.0,Pride and Prejudice
3319,3320,2002.0,About a Boy


In [79]:
get_top_10_recommendations(164, model)

Unnamed: 0,MovieId,ReleaseYear,MovieTitle
1904,1905,2003.0,Pirates of the Caribbean: The Curse of the Bla...
3289,3290,1974.0,The Godfather Part II
5581,5582,1980.0,Star Wars: Episode V: The Empire Strikes Back
7229,7230,2001.0,The Lord of the Rings: The Fellowship of the R...
9627,9628,1983.0,Star Wars: Episode VI: Return of the Jedi
10041,10042,1981.0,Raiders of the Lost Ark
11520,11521,2002.0,Lord of the Rings: The Two Towers
14239,14240,2003.0,Lord of the Rings: The Return of the King
14960,14961,2003.0,Lord of the Rings: The Return of the King: Ext...
17156,17157,1998.0,Saving Private Ryan


In [86]:
get_top_rated_movies(734)

Unnamed: 0,MovieId,ReleaseYear,MovieTitle
36,37,1973.0,Zatoichi's Conspiracy
105,106,2004.0,Stevie Ray Vaughan and Double Trouble: Live at...
117,118,1985.0,Rambo: First Blood Part II
164,165,1982.0,Richard Pryor: Live on the Sunset Strip
165,166,1980.0,Fame
...,...,...,...
17471,17472,1973.0,Magnum Force
17473,17474,1964.0,My Fair Lady: Special Edition
17498,17499,1998.0,Mulan: Special Edition
17525,17526,1959.0,Some Like It Hot


In [80]:
get_top_10_recommendations(734, model)

Unnamed: 0,MovieId,ReleaseYear,MovieTitle
1904,1905,2003.0,Pirates of the Caribbean: The Curse of the Bla...
3961,3962,2003.0,Finding Nemo (Widescreen)
9627,9628,1983.0,Star Wars: Episode VI: Return of the Jedi
10041,10042,1981.0,Raiders of the Lost Ark
11282,11283,1994.0,Forrest Gump
11780,11781,1984.0,Indiana Jones and the Temple of Doom
14239,14240,2003.0,Lord of the Rings: The Return of the King
16264,16265,1977.0,Star Wars: Episode IV: A New Hope
16953,16954,1989.0,Indiana Jones and the Last Crusade
17156,17157,1998.0,Saving Private Ryan


In [29]:
r = get_recommendation(164, 91, model)
r


2.4413177914624926

In [69]:
reader = Reader(rating_scale=(1, 5))
movie_ratings = Dataset.load_from_df(nx[['CustId', 'MovieId', 'Rating']], reader)
training_set, testing_set = surprise_train_test_split(movie_ratings, test_size=0.25, random_state=42)

In [70]:
algo = KNNBasic()
algo.fit(training_set)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x367da29e0>

In [56]:
cross_validate(algo, movie_ratings, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9392  0.9392  0.9396  0.9399  0.9388  0.9393  0.0004  
MAE (testset)     0.7464  0.7453  0.7462  0.7460  0.7453  0.7458  0.0005  
Fit time          13.24   13.44   14.58   13.36   13.45   13.61   0.49    
Test time         114.70  100.78  112.28  105.41  110.59  108.75  5.02    


{'test_rmse': array([0.93919493, 0.93921015, 0.93962957, 0.93985546, 0.93880016]),
 'test_mae': array([0.74636425, 0.74530602, 0.74622381, 0.74603044, 0.74530675]),
 'fit_time': (13.24197506904602,
  13.436630010604858,
  14.580430030822754,
  13.356144189834595,
  13.445959091186523),
 'test_time': (114.69929909706116,
  100.77734398841858,
  112.27520108222961,
  105.4133529663086,
  110.5937750339508)}

In [71]:
algo.predict(164, 91)

Prediction(uid=164, iid=91, r_ui=None, est=2.494495763660051, details={'actual_k': 34, 'was_impossible': False})

In [72]:
algo.test(testing_set)

[Prediction(uid=799, iid=12145, r_ui=2.0, est=3.647444452418009, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=972, iid=4827, r_ui=3.0, est=2.7686362369159374, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1090, iid=629, r_ui=4.0, est=3.271236923112836, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=695, iid=17560, r_ui=5.0, est=3.5667144369192725, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1108, iid=833, r_ui=3.0, est=3.358349226150812, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1195, iid=7522, r_ui=2.0, est=1.6320089581365922, details={'actual_k': 7, 'was_impossible': False}),
 Prediction(uid=951, iid=17075, r_ui=2.0, est=3.862930196756261, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=712, iid=10928, r_ui=2.0, est=2.832447920050216, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=291, iid=10280, r_ui=4.0, est=3.854388714164064, 

In [0]:
algo = SVD()
algo.fit(training_set)

In [58]:
cross_validate(algo, movie_ratings, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8265  0.8260  0.8249  0.8255  0.8276  0.8261  0.0009  
MAE (testset)     0.6337  0.6331  0.6322  0.6333  0.6348  0.6334  0.0008  
Fit time          16.01   16.09   16.47   17.04   17.15   16.55   0.47    
Test time         4.59    3.92    4.25    4.52    3.95    4.24    0.28    


{'test_rmse': array([0.82651701, 0.82601483, 0.82488032, 0.82552365, 0.82757901]),
 'test_mae': array([0.63369483, 0.6331331 , 0.6322071 , 0.63331327, 0.63476768]),
 'fit_time': (16.010166883468628,
  16.09313678741455,
  16.471209287643433,
  17.035928964614868,
  17.146260023117065),
 'test_time': (4.590903997421265,
  3.9168550968170166,
  4.2499918937683105,
  4.5183117389678955,
  3.94820499420166)}

In [None]:
algo.predict(164, 91)

In [None]:
algo.test(testing_set)

In [30]:
def get_cosine_recommendation(user_id: int, movie_id: int, ratings):
    similarity_matrix = cosine_similarity(ratings.fillna(0), ratings.fillna(0))
    similarity_matrix_df = pd.DataFrame(similarity_matrix, index=ratings.index, columns=ratings.index)
    
    cosine_scores = similarity_matrix_df[user_id]
    ratings_scores = ratings[movie_id]
    ratings_scores.dropna().dot(cosine_scores[~ratings_scores.isna()]) / cosine_scores[~ratings_scores.isna()].sum()
    return np.dot(ratings_scores.dropna(), cosine_scores[~ratings_scores.isna()]) / cosine_scores[~ratings_scores.isna()].sum()

In [31]:
get_cosine_recommendation(164, 91, ratings)

2.9026884286071177