In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
movies = pd.read_csv(os.getcwd() + "/movies/movies.csv")
user_ratings = pd.read_csv(os.getcwd() + "/movies/ratings.csv")
user_ratings.drop(['timestamp'], axis=1, inplace=True)
user_ratings = user_ratings.sort_values('userId').reset_index(drop = True)

In [3]:
movielens = pd.merge(user_ratings, movies, on = 'movieId')
movielens.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [120]:
def genre_array(str):
    return str.split('|')

movielens['genres'] = movielens['genres'].apply(genre_array)

    using ratings

In [4]:
movie_rating = movielens[['userId', 'movieId', 'title', 'rating']]
movie_ratingCount = (movie_rating.groupby(by = ['title'])['rating'].count().
                     reset_index().rename(columns={'rating':'ratingCount'})
                     [['title', 'ratingCount']]
                    )
movie_ratingCount.head()

Unnamed: 0,title,ratingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [5]:
movie_ratingCount_withRatings = movie_rating.merge(movie_ratingCount, left_on='title', right_on='title', how='left')
movie_ratingCount_withRatings.head()

Unnamed: 0,userId,movieId,title,rating,ratingCount
0,1,1,Toy Story (1995),4.0,215
1,5,1,Toy Story (1995),4.0,215
2,7,1,Toy Story (1995),4.5,215
3,15,1,Toy Story (1995),2.5,215
4,17,1,Toy Story (1995),4.5,215


In [6]:
print (movie_ratingCount['ratingCount'].quantile(np.arange(0.9,1,0.01)))

0.90     27.00
0.91     30.00
0.92     33.56
0.93     38.00
0.94     42.00
0.95     47.00
0.96     55.00
0.97     64.46
0.98     83.00
0.99    114.64
Name: ratingCount, dtype: float64


In [7]:
popularity_threshold = 50
movie_rating_popular = movie_ratingCount_withRatings.query('ratingCount >= @popularity_threshold')
movie_rating_popular = movie_rating_popular[movie_rating_popular.movieId != 64997]
movie_rating_popular.head()

Unnamed: 0,userId,movieId,title,rating,ratingCount
0,1,1,Toy Story (1995),4.0,215
1,5,1,Toy Story (1995),4.0,215
2,7,1,Toy Story (1995),4.5,215
3,15,1,Toy Story (1995),2.5,215
4,17,1,Toy Story (1995),4.5,215


In [32]:
from scipy.sparse import csr_matrix
movie_rating_pivot = movie_rating_popular.pivot(index='title', columns='userId', values='rating').fillna(0)

In [33]:
test_size = int(0.2*movie_rating_pivot.shape[0])
xtest = movie_rating_pivot.sample(n=test_size)
xtrain = movie_rating_pivot[~movie_rating_pivot.index.isin(xtest.index)]

In [34]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(xtrain)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [35]:
query_index = np.random.choice(xtest.shape[0])
distances, indices = model_knn.kneighbors(xtest.iloc[query_index, :].values.reshape(1,-1), n_neighbors = 6)

In [36]:
distances[0], indices[0]

(array([0.45851888, 0.45901988, 0.46413688, 0.46792656, 0.48542058,
        0.49398333]), array([313,  12, 319, 327, 314, 340]))

In [147]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(movie_rating_pivot.index[query_index]))
    
    print('{0}: {1}, with distance of {2}:'.format(i, movie_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Bourne Supremacy, The (2004):

0: Predator (1987), with distance of 0.4585188804845023:
1: Airplane! (1980), with distance of 0.45901988039276276:
2: Quiz Show (1994), with distance of 0.4641368800447624:
3: Robin Hood: Men in Tights (1993), with distance of 0.46792656028553226:
4: Prestige, The (2006), with distance of 0.48542058398655386:
5: Scary Movie (2000), with distance of 0.493983328445033:


## Evaluation

In [60]:
movie_rating_pivot.to_csv('data.csv', sep=';')

In [61]:
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
import os

file_path = os.path.expanduser('data.csv')
reader = Reader(line_format='user item rating timestamp', sep=';')
data = Dataset.load_from_file(file_path, reader=reader)

In [142]:
from surprise import evaluate, print_perf

##########--------MSD------User Based Collaborative Filtering algorithm
print('')
print('MSD----User Based Collaborative Filtering algorithm result')
data.split(n_folds=3)
algo = KNNBasic(sim_options = {'name':'MSD','user_based': True})
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf)


MSD----User Based Collaborative Filtering algorithm result
Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.1203
MAE:  1.0662
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.1409
MAE:  1.0700
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.1030
MAE:  1.0533
------------
------------
Mean RMSE: 1.1214
Mean MAE : 1.0632
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    1.1203  1.1409  1.1030  1.1214  
MAE     1.0662  1.0700  1.0533  1.0632  




In [100]:
books = pd.read_csv(os.getcwd() + "/books/booksummaries.txt", sep='\t', names=['Wikipedia_ID', 'Freebase_ID', 'Title', 'Author', 'Pub_Date', 'Genre', 'Plot'])
books.dropna(subset=['Genre'], inplace = True)
del books['Freebase_ID']
del books['Plot']
import ast
def getGenre(str):
    return list(ast.literal_eval(str).values())

books['Genre'] = books['Genre'].apply(getGenre)
books.reset_index(inplace = True, drop = True)
books.head()

Unnamed: 0,Wikipedia_ID,Title,Author,Pub_Date,Genre
0,620,Animal Farm,George Orwell,1945-08-17,"[Roman à clef, Satire, Children's literature, ..."
1,843,A Clockwork Orange,Anthony Burgess,1962,"[Science Fiction, Novella, Speculative fiction..."
2,986,The Plague,Albert Camus,1947,"[Existentialism, Fiction, Absurdist fiction, N..."
3,2080,A Fire Upon the Deep,Vernor Vinge,,"[Hard science fiction, Science Fiction, Specul..."
4,2152,All Quiet on the Western Front,Erich Maria Remarque,1929-01-29,"[War novel, Roman à clef]"


In [115]:
from SentenceSimilarity import ss

In [125]:
import itertools

movies_tag = list(set(itertools.chain.from_iterable(movielens['genres'])))
books_tag = list(set(itertools.chain.from_iterable(books['Genre'])))

In [141]:
sim_matrix = []
for i in movies_tag:
    for j in books_tag:
        sim_matrix.append([i,j,ss(i,j)])
sim_matrix = pd.DataFrame(sim_matrix, columns=['movie_tag','book_tag', 'sim'])
sim_matrix = sim_matrix.pivot(index='movie_tag', columns='book_tag', values='sim').fillna(0)
sim_matrix = pd.DataFrame({'movie_tag':sim_matrix.index.values, 'book_tag':sim_matrix.idxmax(axis=1).values})
sim_matrix

  return 1.0 - (np.linalg.norm(r1 - r2) / np.linalg.norm(r1 + r2))


Unnamed: 0,movie_tag,book_tag
0,(no genres listed),Modernism
1,Action,War novel
2,Adventure,Adventure novel
3,Animation,Regency romance
4,Children,Juvenile fantasy
5,Comedy,Comedy of manners
6,Crime,Crime Fiction
7,Documentary,Picture book
8,Drama,Comedy of manners
9,Fantasy,Fantasy of manners
