In [1]:
import pandas as pd
import numpy as np
movies=pd.read_csv('ml-latest-small/movies.csv')
links=pd.read_csv('ml-latest-small/links.csv')
ratings=pd.read_csv('ml-latest-small/ratings.csv')
tags=pd.read_csv('ml-latest-small/tags.csv')

In [2]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
def strrep(genre):
    return genre.replace('|',' ')

In [4]:
movies['genres']=movies.genres.apply(strrep)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
genresvect=vectorizer.fit_transform(movies['genres'])

genresvect.shape

(9742, 23)

In [7]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(genresvect, genresvect)
cosine_sim

array([[1.        , 0.81357774, 0.15276924, ..., 0.        , 0.4210373 ,
        0.26758648],
       [0.81357774, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.15276924, 0.        , 1.        , ..., 0.        , 0.        ,
        0.57091541],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.4210373 , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.26758648, 0.        , 0.57091541, ..., 0.        , 0.        ,
        1.        ]])

In [8]:
indices=pd.Series(movies.index,index=movies['title']).drop_duplicates()
indices

title
Toy Story (1995)                                0
Jumanji (1995)                                  1
Grumpier Old Men (1995)                         2
Waiting to Exhale (1995)                        3
Father of the Bride Part II (1995)              4
                                             ... 
Black Butler: Book of the Atlantic (2017)    9737
No Game No Life: Zero (2017)                 9738
Flint (2017)                                 9739
Bungo Stray Dogs: Dead Apple (2018)          9740
Andrew Dice Clay: Dice Rules (1991)          9741
Length: 9742, dtype: int64

In [9]:
def get_recommendation(title,cosine_sim=cosine_sim):
    idx=indices[title]
    sim_scores=list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:11]
    print(sim_scores)

    movie_indices = [i[0] for i in sim_scores]

    return movies['title'].iloc[movie_indices]

In [10]:
get_recommendation('Toy Story (1995)')

[(1706, 0.9999999999999998), (2355, 0.9999999999999998), (2809, 0.9999999999999998), (3000, 0.9999999999999998), (3568, 0.9999999999999998), (6194, 0.9999999999999998), (6486, 0.9999999999999998), (6948, 0.9999999999999998), (7760, 0.9999999999999998), (8219, 0.9999999999999998)]


1706                                          Antz (1998)
2355                                   Toy Story 2 (1999)
2809       Adventures of Rocky and Bullwinkle, The (2000)
3000                     Emperor's New Groove, The (2000)
3568                                Monsters, Inc. (2001)
6194                                     Wild, The (2006)
6486                               Shrek the Third (2007)
6948                       Tale of Despereaux, The (2008)
7760    Asterix and the Vikings (Astérix et les Viking...
8219                                         Turbo (2013)
Name: title, dtype: object

In [5]:
Rating=ratings.pivot(index='userId',columns="movieId",values='rating').fillna(0)

In [6]:
Rating

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
R=Rating.as_matrix()
user_mean=np.mean(R,axis=1)
rating_demeaned=R-user_mean.reshape(-1,1)

  """Entry point for launching an IPython kernel.


In [8]:
user_mean

array([0.10417524, 0.01177499, 0.00976964, 0.07897984, 0.01645413,
       0.11281366, 0.05049362, 0.01727684, 0.01542575, 0.0472028 ,
       0.02488688, 0.01444879, 0.01162073, 0.01676265, 0.04787125,
       0.03753599, 0.04545455, 0.19266763, 0.18850267, 0.08936652,
       0.14854998, 0.03146853, 0.04540313, 0.04128959, 0.01285479,
       0.00699301, 0.04925956, 0.1770362 , 0.03450226, 0.01655697,
       0.02015631, 0.03938708, 0.06077746, 0.03023447, 0.0096668 ,
       0.01624846, 0.00894694, 0.02581242, 0.04113534, 0.03990128,
       0.07260387, 0.16135335, 0.0533731 , 0.01655697, 0.15903949,
       0.01727684, 0.04396339, 0.0136775 , 0.00920403, 0.08864665,
       0.13939737, 0.05985191, 0.01028383, 0.01028383, 0.00730152,
       0.01799671, 0.16608392, 0.04494035, 0.04792267, 0.00843274,
       0.01624846, 0.15364048, 0.10119292, 0.2003805 , 0.01408885,
       0.14263677, 0.01470588, 0.41901481, 0.02067051, 0.02756067,
       0.01295763, 0.01923077, 0.08016248, 0.07774578, 0.02293

In [10]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(rating_demeaned, k = 50)
sigma=np.diag(sigma)

In [11]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_mean.reshape(-1, 1)
all_user_predicted_ratings

array([[ 2.16732840e+00,  4.02750508e-01,  8.40183552e-01, ...,
        -2.34533753e-02, -2.34533753e-02, -5.87318552e-02],
       [ 2.11459069e-01,  6.65755884e-03,  3.34547997e-02, ...,
         1.94980595e-02,  1.94980595e-02,  3.22813825e-02],
       [ 3.58844848e-03,  3.05175179e-02,  4.63929239e-02, ...,
         5.90929301e-03,  5.90929301e-03,  8.00411072e-03],
       ...,
       [ 2.16136388e+00,  2.67091989e+00,  2.12845971e+00, ...,
        -4.40029476e-02, -4.40029476e-02,  7.18717825e-02],
       [ 7.80205947e-01,  5.33648654e-01,  9.64537701e-02, ...,
         4.35514249e-03,  4.35514249e-03, -1.34622131e-03],
       [ 5.36398127e+00, -3.40945139e-01, -1.75163291e-01, ...,
        -2.63577616e-02, -2.63577616e-02,  5.15415792e-02]])

In [35]:
preds=pd.DataFrame(all_user_predicted_ratings,columns=Rating.columns)
preds

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,2.167328,0.402751,0.840184,-0.076281,-0.551337,2.504091,-0.890114,-0.026443,0.196974,1.593259,...,-0.023453,-0.019967,-0.026939,-0.026939,-0.023453,-0.026939,-0.023453,-0.023453,-0.023453,-0.058732
1,0.211459,0.006658,0.033455,0.017419,0.183430,-0.062473,0.083037,0.024158,0.049330,-0.152530,...,0.019498,0.016777,0.022219,0.022219,0.019498,0.022219,0.019498,0.019498,0.019498,0.032281
2,0.003588,0.030518,0.046393,0.008176,-0.006247,0.107328,-0.012416,0.003779,0.007297,-0.059362,...,0.005909,0.006209,0.005610,0.005610,0.005909,0.005610,0.005909,0.005909,0.005909,0.008004
3,2.051549,-0.387104,-0.252199,0.087562,0.130465,0.270210,0.477835,0.040313,0.025858,-0.017365,...,0.004836,0.004172,0.005500,0.005500,0.004836,0.005500,0.004836,0.004836,0.004836,-0.023311
4,1.344738,0.778511,0.065749,0.111744,0.273144,0.584426,0.254930,0.128788,-0.085541,1.023455,...,-0.008042,-0.007419,-0.008664,-0.008664,-0.008042,-0.008664,-0.008042,-0.008042,-0.008042,-0.010127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2.501444,-0.139015,-0.082080,0.079300,-0.158770,-0.587275,2.368039,-0.014790,-0.093695,-0.182211,...,-0.038935,-0.032500,-0.045369,-0.045369,-0.038935,-0.045369,-0.038935,-0.038935,-0.038935,-0.045416
606,2.849138,1.368651,0.341869,0.000534,-0.272603,1.529573,-0.078889,-0.013913,0.075251,1.398731,...,0.006789,0.006108,0.007471,0.007471,0.006789,0.007471,0.006789,0.006789,0.006789,-0.030371
607,2.161364,2.670920,2.128460,0.036007,0.128314,3.684387,-0.028717,0.195936,0.073591,2.406330,...,-0.044003,-0.041123,-0.046882,-0.046882,-0.044003,-0.046882,-0.044003,-0.044003,-0.044003,0.071872
608,0.780206,0.533649,0.096454,0.029945,0.087756,0.209604,0.078704,0.061626,-0.064337,1.242835,...,0.004355,0.004268,0.004442,0.004442,0.004355,0.004442,0.004355,0.004355,0.004355,-0.001346


In [120]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    user_row=userID-1
    sorted_movs=preds.iloc[user_row].sort_values(ascending=False)
    user_data = original_ratings[original_ratings.userId == (userID)]
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False)
                 )

    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))
    recommendations = (movies[~movies['movieId'].isin(user_full['movieId'])].
         merge(pd.DataFrame(sorted_movs).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :]
                      )

    return user_full, recommendations
    
#     movie_id=list(sorted_movs.index)
#     movie_name=movies['title'].iloc(movie_id)
    
    
    

In [121]:
old,predictions=recommend_movies(preds,50,movies,ratings,20)


User 50 has already rated 310 movies.
Recommending highest 20 predicted ratings movies not already rated.


In [122]:
old

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
21,50,924,4.5,1514238077,2001: A Space Odyssey (1968),Adventure Drama Sci-Fi
33,50,1204,4.5,1526329645,Lawrence of Arabia (1962),Adventure Drama War
35,50,1208,4.5,1527106159,Apocalypse Now (1979),Action Drama War
40,50,1251,4.5,1534178801,8 1/2 (8½) (1963),Drama Fantasy
121,50,7327,4.0,1525359178,Persona (1966),Drama
...,...,...,...,...,...,...
261,50,136305,1.0,1518168422,Sharknado 3: Oh Hell No! (2015),Horror Sci-Fi
280,50,156607,1.0,1514240000,The Huntsman Winter's War (2016),Action Adventure Drama Fantasy
222,50,103171,1.0,1518168356,Schlussmacher (2013),Comedy
298,50,175485,0.5,1514240073,Death Note (2017),Horror Thriller


In [138]:
# predictions[predictions['movieId']==457]
predictions

Unnamed: 0,movieId,title,genres,Predictions
866,1193,One Flew Over the Cuckoo's Nest (1975),Drama,3.180062
3524,4973,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",Comedy Romance,2.150116
4787,7361,Eternal Sunshine of the Spotless Mind (2004),Drama Romance Sci-Fi,1.82788
884,1219,Psycho (1960),Crime Horror,1.826857
919,1265,Groundhog Day (1993),Comedy Fantasy Romance,1.805202
906,1247,"Graduate, The (1967)",Comedy Drama Romance,1.772405
675,912,Casablanca (1942),Drama Romance,1.76394
6857,68954,Up (2009),Adventure Animation Children Drama,1.752619
924,1270,Back to the Future (1985),Adventure Comedy Sci-Fi,1.583552
3540,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure Fantasy,1.572656


In [158]:
# pd.DataFrame(preds.iloc[5].sort_values(ascending=False).iloc[:5, :-1])


In [133]:
predictions

Unnamed: 0,movieId,title,genres,Predictions
866,1193,One Flew Over the Cuckoo's Nest (1975),Drama,3.180062
3524,4973,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",Comedy Romance,2.150116
4787,7361,Eternal Sunshine of the Spotless Mind (2004),Drama Romance Sci-Fi,1.82788
884,1219,Psycho (1960),Crime Horror,1.826857
919,1265,Groundhog Day (1993),Comedy Fantasy Romance,1.805202
906,1247,"Graduate, The (1967)",Comedy Drama Romance,1.772405
675,912,Casablanca (1942),Drama Romance,1.76394
6857,68954,Up (2009),Adventure Animation Children Drama,1.752619
924,1270,Back to the Future (1985),Adventure Comedy Sci-Fi,1.583552
3540,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure Fantasy,1.572656


In [159]:
np.dot(U, sigma)

array([[ 1.16501057e+00,  2.22964281e-02, -8.24604451e-01, ...,
         6.20837069e-01,  1.43661755e+01, -2.82808906e+01],
       [ 3.40482926e-01, -7.08361538e-02, -1.03899483e+00, ...,
        -2.28194305e-01, -4.08673730e+00, -2.97005723e+00],
       [-4.42343185e-02,  3.67083563e-01,  4.48137644e-01, ...,
         1.24850804e-01,  4.70348952e-01, -3.08034375e-01],
       ...,
       [ 1.09381613e+01,  9.12229219e+00,  3.83601989e+00, ...,
        -2.70724237e+00,  2.83940333e+00, -5.62166205e+01],
       [-6.06052642e-01, -4.44869588e-01,  3.70963258e-01, ...,
        -7.57546106e+00,  3.23740339e+00, -4.06260044e+00],
       [-7.05134377e-01, -6.00207210e-01, -4.37202805e+00, ...,
         1.14310811e+01, -4.69764803e+01, -5.75856230e+01]])

In [160]:
np.matmul(U, sigma)

array([[ 1.16501057e+00,  2.22964281e-02, -8.24604451e-01, ...,
         6.20837069e-01,  1.43661755e+01, -2.82808906e+01],
       [ 3.40482926e-01, -7.08361538e-02, -1.03899483e+00, ...,
        -2.28194305e-01, -4.08673730e+00, -2.97005723e+00],
       [-4.42343185e-02,  3.67083563e-01,  4.48137644e-01, ...,
         1.24850804e-01,  4.70348952e-01, -3.08034375e-01],
       ...,
       [ 1.09381613e+01,  9.12229219e+00,  3.83601989e+00, ...,
        -2.70724237e+00,  2.83940333e+00, -5.62166205e+01],
       [-6.06052642e-01, -4.44869588e-01,  3.70963258e-01, ...,
        -7.57546106e+00,  3.23740339e+00, -4.06260044e+00],
       [-7.05134377e-01, -6.00207210e-01, -4.37202805e+00, ...,
         1.14310811e+01, -4.69764803e+01, -5.75856230e+01]])

In [12]:
np.dot(np.dot(U, sigma), Vt)

array([[ 2.06315317e+00,  2.98575271e-01,  7.36008315e-01, ...,
        -1.27628612e-01, -1.27628612e-01, -1.62907092e-01],
       [ 1.99684079e-01, -5.11743088e-03,  2.16798100e-02, ...,
         7.72306977e-03,  7.72306977e-03,  2.05063928e-02],
       [-6.18119364e-03,  2.07478758e-02,  3.66232818e-02, ...,
        -3.86034911e-03, -3.86034911e-03, -1.76553140e-03],
       ...,
       [ 1.89352142e+00,  2.40307743e+00,  1.86061726e+00, ...,
        -3.11845399e-01, -3.11845399e-01, -1.95970669e-01],
       [ 7.67762508e-01,  5.21205215e-01,  8.40103312e-02, ...,
        -8.08829642e-03, -8.08829642e-03, -1.37896602e-02],
       [ 4.87010015e+00, -8.34826258e-01, -6.69044410e-01, ...,
        -5.20238880e-01, -5.20238880e-01, -4.42339540e-01]])

In [13]:
R

array([[4. , 0. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])