'''
Source codes for Python Machine Learning By Example 3rd Edition (Packt Publishing)
Chapter 2 Building A Movie Recommendation Engine with Naive Bayes
Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)
'''

In [26]:
import numpy as np
from collections import defaultdict


def load_rating_data(data_path, n_users, n_movies):
    """
    Load rating data from a comma-separated file and also return the number of ratings for each movie
    and the movie_id index mapping.

    The first line of the file is treated as a header and skipped; blank lines are ignored. Every other
    line must hold exactly four comma-separated fields: user_id, movie_id, rating and a fourth field
    that is not used. Ratings are truncated to whole numbers (e.g. "3.5" -> 3, "0.5" -> 0). A stored
    value of 0 means "no rating", so a rating that truncates to 0 is not counted in movie_n_rating.

    @param data_path: path of the rating data file
    @param n_users: number of users (a rating by user_id is stored in row user_id - 1)
    @param n_movies: number of distinct movies that have ratings
    @return: rating data in the numpy array of [user, movie]; movie_n_rating, {movie_id: number of ratings};
             movie_id_mapping, {movie_id: column index in rating data}
    @raise ValueError: if a non-blank data line does not have exactly four fields, or user_id / rating
             is not numeric
    @raise IndexError: if a user_id is larger than n_users or the file holds more than n_movies
             distinct movie ids
    """
    data = np.zeros([n_users, n_movies], dtype=np.float32)
    movie_id_mapping = {}
    movie_n_rating = defaultdict(int)
    with open(data_path, 'r') as file:
        # Skip the header row; the default keeps an empty file from raising StopIteration.
        next(file, None)
        # Iterate lazily instead of materialising the whole file with readlines().
        for line in file:
            line = line.strip()
            if not line:
                # A blank (often trailing) line would otherwise break the 4-way unpacking below.
                continue
            user_id, movie_id, rating, _ = line.split(",")
            row = int(user_id) - 1
            # Movie ids stay strings; columns are handed out in order of first appearance.
            col = movie_id_mapping.setdefault(movie_id, len(movie_id_mapping))
            # float() accepts both "4" and "4.0"; int() then truncates towards zero.
            rating = int(float(rating))
            data[row, col] = rating
            if rating > 0:
                movie_n_rating[movie_id] += 1
    return data, movie_n_rating, movie_id_mapping

In [27]:
def display_distribution(data):
    """
    Print how often each distinct value occurs in the given rating array
    @param data: numpy array of ratings (any shape; np.unique flattens it)
    @return: None; one line per distinct value is written to stdout, in ascending value order
    """
    distinct_ratings, occurrences = np.unique(data, return_counts=True)
    # Lazily format one report line per (rating, frequency) pair and emit it.
    report_lines = (
        f'Number of rating {int(rating)}: {frequency}'
        for rating, frequency in zip(distinct_ratings, occurrences)
    )
    for report_line in report_lines:
        print(report_line)

In [35]:
# Path to the ratings CSV that load_rating_data parses (header row + user,movie,rating,extra columns).
data_path = 'ml-latest-small/ml-latest-small/ratings.csv'
# Dimensions of the rating matrix. These are hard-coded; they match the (610, 9724) printed by the
# len(set(users)), len(set(movies)) cell further down.
n_users = 610
n_movies = 9724
# data: [n_users, n_movies] float32 matrix; movie_n_rating: {movie_id: count of non-zero ratings};
# movie_id_mapping: {movie_id: column index in data}.
data, movie_n_rating, movie_id_mapping = load_rating_data(data_path, n_users, n_movies)

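The code shown below was used to get the number of users and movies. For users, max(users) works because load_rating_data stores user u in row u - 1. For movies, use the number of distinct IDs, len(set(movies)), rather than max(movies), because columns are assigned in order of first appearance and not by movie ID.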

In [30]:
# Collect every user id and movie id that occurs in the ratings file so they can be counted below.
with open(data_path) as infile:
    lines = infile.readlines()
# Drop the header row and strip the trailing newline from each data row.
lines = [line.strip() for line in lines[1:]]
users = []
movies = []
for line in lines:
    # Column 0 is the user id, column 1 the movie id; the remaining columns are not needed here.
    aList = line.split(",")
    users.append(int(aList[0]))
    movies.append(int(aList[1]))
    

In [36]:
# Number of distinct user ids and distinct movie ids found in the ratings file.
len(set(users)), len(set(movies))

(610, 9724)

In [41]:
 # Print how often each rating value occurs in the full matrix; 0 marks user/movie pairs without a rating.
 # NOTE(review): this statement has a stray leading space. The notebook ran it (output is shown below),
 # but a plain .py script would reject it with an IndentationError.
 display_distribution(data)

Number of rating 0: 5832174
Number of rating 1: 4602
Number of rating 2: 13101
Number of rating 3: 33183
Number of rating 4: 35369
Number of rating 5: 13211


In [37]:
# Show the rating matrix (rows: users, columns: movies in movie_id_mapping order).
data

array([[4., 4., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [2., 2., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.],
       [5., 0., 5., ..., 3., 3., 3.]], dtype=float32)

In [38]:
# Show the per-movie count of non-zero ratings, keyed by the movie id string read from the file.
movie_n_rating

defaultdict(int,
            {'1': 214,
             '3': 51,
             '6': 102,
             '47': 201,
             '50': 204,
             '70': 55,
             '101': 23,
             '110': 236,
             '151': 44,
             '157': 11,
             '163': 66,
             '216': 49,
             '223': 104,
             '231': 132,
             '235': 70,
             '260': 250,
             '296': 305,
             '316': 140,
             '333': 50,
             '349': 110,
             '356': 328,
             '362': 34,
             '367': 156,
             '423': 10,
             '441': 42,
             '457': 189,
             '480': 237,
             '500': 143,
             '527': 218,
             '543': 40,
             '552': 61,
             '553': 65,
             '590': 164,
             '592': 189,
             '593': 276,
             '596': 58,
             '608': 181,
             '648': 161,
             '661': 49,
             '673': 50,
          

In [39]:
# Show the movie id -> column index mapping; indices follow the order in which ids first appear in the file.
movie_id_mapping

{'1': 0,
 '3': 1,
 '6': 2,
 '47': 3,
 '50': 4,
 '70': 5,
 '101': 6,
 '110': 7,
 '151': 8,
 '157': 9,
 '163': 10,
 '216': 11,
 '223': 12,
 '231': 13,
 '235': 14,
 '260': 15,
 '296': 16,
 '316': 17,
 '333': 18,
 '349': 19,
 '356': 20,
 '362': 21,
 '367': 22,
 '423': 23,
 '441': 24,
 '457': 25,
 '480': 26,
 '500': 27,
 '527': 28,
 '543': 29,
 '552': 30,
 '553': 31,
 '590': 32,
 '592': 33,
 '593': 34,
 '596': 35,
 '608': 36,
 '648': 37,
 '661': 38,
 '673': 39,
 '733': 40,
 '736': 41,
 '780': 42,
 '804': 43,
 '919': 44,
 '923': 45,
 '940': 46,
 '943': 47,
 '954': 48,
 '1009': 49,
 '1023': 50,
 '1024': 51,
 '1025': 52,
 '1029': 53,
 '1030': 54,
 '1031': 55,
 '1032': 56,
 '1042': 57,
 '1049': 58,
 '1060': 59,
 '1073': 60,
 '1080': 61,
 '1089': 62,
 '1090': 63,
 '1092': 64,
 '1097': 65,
 '1127': 66,
 '1136': 67,
 '1196': 68,
 '1197': 69,
 '1198': 70,
 '1206': 71,
 '1208': 72,
 '1210': 73,
 '1213': 74,
 '1214': 75,
 '1219': 76,
 '1220': 77,
 '1222': 78,
 '1224': 79,
 '1226': 80,
 '1240': 81,
 '

In [40]:
# Pick the movie with the largest number of ratings; it becomes the prediction target below.
# max() scans the items once instead of sorting the whole dictionary just to take the first entry.
# On ties it returns the first maximal item in dict order, as the stable reverse sort did.
movie_id_most, n_rating_most = max(movie_n_rating.items(), key=lambda d: d[1])
print(f'Movie ID {movie_id_most} has {n_rating_most} ratings.')

Movie ID 356 has 328 ratings.


In [42]:
# Split the rating matrix into features and target: the column of the most-rated movie is removed
# from the features (X_raw) and kept on its own as the values to predict (Y_raw).
X_raw = np.delete(data, movie_id_mapping[movie_id_most], axis=1)
Y_raw = data[:, movie_id_mapping[movie_id_most]]

In [43]:
# Keep only the users who actually rated the target movie (a stored 0 means "no rating").
X = X_raw[Y_raw > 0]
Y = Y_raw[Y_raw > 0]

In [44]:
# Sanity check: X should have one row per retained user and one column fewer than the full matrix.
print('Shape of X:', X.shape)
print('Shape of Y:', Y.shape)

Shape of X: (328, 9723)
Shape of Y: (328,)


In [45]:
# Print how the target movie's ratings are distributed among the retained users.
display_distribution(Y)

Number of rating 1: 1
Number of rating 2: 12
Number of rating 3: 66
Number of rating 4: 133
Number of rating 5: 116


In [46]:
# Turn the target into a binary label: ratings above the threshold become 1 (liked / recommend),
# ratings at or below it become 0.
# NOTE: Y is overwritten in place, so this cell is not idempotent. After one run Y only holds 0 and 1,
# and running it again would map every label to 0. Re-create Y from Y_raw before re-running.
recommend = 3
Y[Y <= recommend] = 0
Y[Y > recommend] = 1

# Report the class balance of the binary target.
n_pos = (Y == 1).sum()
n_neg = (Y == 0).sum()
print(f'{n_pos} positive samples and {n_neg} negative samples.')

249 positive samples and 79 negative samples.


In [47]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Number of training and testing samples.
print(len(Y_train), len(Y_test))

262 66


In [48]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with additive smoothing factor alpha=1.0 and class priors
# learned from the training labels (fit_prior=True).
clf = MultinomialNB(alpha=1.0, fit_prior=True)
clf.fit(X_train, Y_train)

# Per-sample class probabilities for the test set; print the first ten rows
# (one column per class, in the order of clf.classes_).
prediction_prob = clf.predict_proba(X_test)
print(prediction_prob[0:10])

[[5.90883584e-07 9.99999409e-01]
 [3.56680973e-10 1.00000000e+00]
 [1.18217609e-14 1.00000000e+00]
 [1.71056048e-18 1.00000000e+00]
 [1.19045802e-14 1.00000000e+00]
 [5.94207942e-24 1.00000000e+00]
 [5.29839752e-85 1.00000000e+00]
 [3.01733914e-61 1.00000000e+00]
 [8.74149339e-01 1.25850661e-01]
 [1.00000000e+00 1.12506774e-29]]


In [49]:
# Mean classification accuracy on the held-out test set, shown as a percentage.
accuracy = clf.score(X_test, Y_test)
print(f'The accuracy is: {accuracy*100:.1f}%')


The accuracy is: 69.7%


In [51]:
from sklearn.metrics import confusion_matrix
# Hard class predictions for the test set.
prediction = clf.predict(X_test)
# Rows are true labels and columns are predicted labels, both in the order given by labels=[0, 1].
print(confusion_matrix(Y_test, prediction, labels=[0, 1]))


[[ 3 13]
 [ 7 43]]


In [52]:
from sklearn.metrics import classification_report
# Per-class precision, recall, F1 score and support for the test-set predictions.
report = classification_report(Y_test, prediction)
print(report)

              precision    recall  f1-score   support

         0.0       0.30      0.19      0.23        16
         1.0       0.77      0.86      0.81        50

    accuracy                           0.70        66
   macro avg       0.53      0.52      0.52        66
weighted avg       0.65      0.70      0.67        66

