In [None]:
#Importing necessary libraries
import pandas as pd
import numpy as np

In [None]:
# Read the raw movie catalogue and preview its first rows.
movie_df = pd.read_csv(filepath_or_buffer='movies.csv')
movie_df.head(n=5)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Read the raw ratings table and preview its first rows.
rating_df = pd.read_csv(filepath_or_buffer='ratings.csv')
rating_df.head(n=5)

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Re-read the movies with explicit dtypes ('MovieID' as int32, 'Title' as str)
# and discard the 'Genres' column, as we would not need it.
movie_df = (
    pd.read_csv('movies.csv', dtype={'MovieID': 'int32', 'Title': 'str'})
    .drop(columns=['Genres'])
)

movie_df.head()


Unnamed: 0,MovieID,Title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [None]:
# Re-read the ratings with explicit dtypes ('UserID' and 'MovieID' as int32,
# 'Rating' as float32) and discard the 'Timestamp' column.
rating_df = (
    pd.read_csv('ratings.csv', dtype={'UserID': 'int32', 'MovieID': 'int32', 'Rating': 'float32'})
    .drop(columns=['Timestamp'])
)

rating_df.head()

Unnamed: 0,UserID,MovieID,Rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [None]:
# Merge the movie titles onto the ratings using the shared 'MovieID' key.
data = pd.merge(movie_df, rating_df, on='MovieID')

# Convert the whole column with one vectorized call instead of invoking
# pd.to_numeric once per element through Series.apply.
data['MovieID'] = pd.to_numeric(data['MovieID'])
data.head()

Unnamed: 0,MovieID,Title,UserID,Rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [None]:
# Reshape the long ratings table into a user-item matrix: one row per UserID,
# one column per MovieID, and the Rating as the cell value.
rating_matrix = rating_df.pivot(
    values='Rating',
    columns='MovieID',
    index='UserID',
)
rating_matrix.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In the above cell, the rows represent users (UserID), the columns represent movies (MovieID), and the values in the matrix represent ratings (Rating).

Each cell in the matrix represents a user’s rating for a particular movie. If a user has not rated a movie, the value is NaN.

To handle the NaN values, let us calculate the column-wise mean (the average rating of each movie). If a user has not rated a movie, the missing rating is filled with the average rating that movie received from the other users.

In [None]:
def _fill_with_column_mean(movie_ratings):
    """Replace the missing entries of one movie column with that column's mean."""
    return movie_ratings.fillna(movie_ratings.mean())


# Impute column by column (axis=0): each movie's missing ratings become that movie's mean rating.
rating_matrix = rating_matrix.apply(_fill_with_column_mean, axis=0)
rating_matrix.head()

# Next we can normalize this matrix so that SVD performs better
# by scaling the ratings to a small, standard range.

MovieID,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,3.431818,4.0,2.357143,3.071429,4.0,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
2,3.92093,3.431818,3.259615,2.357143,3.071429,3.946079,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
3,3.92093,3.431818,3.259615,2.357143,3.071429,3.946079,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
4,3.92093,3.431818,3.259615,2.357143,3.071429,3.946079,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
5,4.0,3.431818,3.259615,2.357143,3.071429,3.946079,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0


In [None]:
def _center_on_row_mean(user_ratings):
    """Subtract a user's mean rating from every rating in that user's row."""
    return user_ratings - user_ratings.mean()


# Center each user's row (axis=1) on that user's mean rating; negative values
# then indicate below-average preferences.
rating_matrix_normalized = rating_matrix.apply(_center_on_row_mean, axis=1)
rating_matrix_normalized.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.718247,0.150065,0.718247,-0.92461,-0.210325,0.718247,-0.096568,-0.406753,-0.156753,0.214459,...,0.218247,-0.281753,0.718247,0.718247,0.218247,0.718247,0.218247,0.218247,0.218247,0.718247
2,0.658467,0.169355,-0.002848,-0.905321,-0.191035,0.683615,-0.077278,-0.387464,-0.137464,0.233748,...,0.237536,-0.262464,0.737536,0.737536,0.237536,0.737536,0.237536,0.237536,0.237536,0.737536
3,0.663091,0.173979,0.001776,-0.900696,-0.186411,0.688239,-0.072654,-0.382839,-0.132839,0.238373,...,0.242161,-0.257839,0.742161,0.742161,0.242161,0.742161,0.242161,0.242161,0.242161,0.742161
4,0.662843,0.173731,0.001528,-0.900944,-0.186659,0.687991,-0.072902,-0.383087,-0.133087,0.238125,...,0.241913,-0.258087,0.741913,0.741913,0.241913,0.741913,0.241913,0.241913,0.241913,0.741913
5,0.737895,0.169713,-0.00249,-0.904963,-0.190677,0.683973,-0.07692,-0.387105,-0.137105,0.234107,...,0.237895,-0.262105,0.737895,0.737895,0.237895,0.737895,0.237895,0.237895,0.237895,0.737895


Singular Value Decomposition (SVD) is a matrix factorization technique. It decomposes a matrix into three other matrices, allowing us to express the original matrix as the product of these three matrices. The SVD of a matrix
X is represented as:

X=UΣV^T

where:

U is an orthogonal matrix representing the left singular vectors.
Σ is a diagonal matrix containing the singular values.
V^T is the transpose of the orthogonal matrix V, representing the right singular vectors.

In [None]:
# Convert the normalized user-item DataFrame to a NumPy array.
X = rating_matrix_normalized.values

# Candidate numbers of components: 1, 11, 21, ..., 91.
k_values = list(range(1, 101, 10))

# The decomposition does not depend on k, so factorize once and only
# truncate inside the loop (the original recomputed the full SVD per k).
U, Sigma, Vt = np.linalg.svd(X, full_matrices=False)

# Best K found so far, its RMSE, and the matching reconstruction.
best_k, best_score = 0, float('inf')
X_reconstructed = None

# NOTE: the RMSE below is a reconstruction error on the same matrix that was
# factorized. It cannot increase as k grows, so this search selects the
# largest candidate; choosing k for prediction quality would need held-out data.
for k in k_values:
    # Keep only the first k singular values and their singular vectors.
    U_k = U[:, :k]
    Sigma_k = np.diag(Sigma[:k])
    Vt_k = Vt[:k, :]

    # Rank-k approximation of X.
    X_k = np.dot(U_k, np.dot(Sigma_k, Vt_k))

    # Root-mean-square difference between X and its rank-k approximation.
    rmse = np.sqrt(np.mean((X - X_k) ** 2))

    # Remember the best candidate together with its reconstruction, so that
    # X_reconstructed always corresponds to best_k.
    if rmse < best_score:
        best_k, best_score = k, rmse
        X_reconstructed = X_k

# Print the best K value
print(f'The optimal number of components is {best_k} with a RMSE of {best_score}')


The optimal number of components is 91 with a RMSE of 0.07038237899541855


Truncating to the first k components in SVD is a way to approximate the original matrix X with a lower-rank matrix. The intuition is to retain the most important information and discard the less important or noise components.

The idea behind choosing k is to retain only the first
k singular values (the diagonal elements of the matrix
Σ) and their corresponding singular vectors (matrices
U and V^T in SVD contain the left singular vectors and right singular vectors). Truncating to the first k components results in a lower-rank approximation of the original matrix, reducing dimensionality.

Noise Reduction : The singular values are sorted in descending order and the smaller values contribute less to the overall structure of the matrix. Truncating to the first k components helps in removing noise.

In [None]:
# Range of the rating scale used to normalize the error.
# NOTE(review): the ratings previewed earlier include half-star values such as
# 2.5 and 4.5, so the true minimum may be lower than 1 - confirm against the dataset.
min_rating = 1
max_rating = 5

# Normalize the RMSE of the selected model (best_score, reported for best_k)
# rather than `rmse`, which only holds the value of the last loop iteration.
normalized_rmse = best_score / (max_rating - min_rating)
print(f'Normalized RMSE: {normalized_rmse}')


Normalized RMSE: 0.017595594748854637


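The normalized RMSE is very close to 0 (where 0 would be a perfect reconstruction) and far from 1, so the rank-k approximation with k=91 reproduces the normalized rating matrix closely. Note that this is a reconstruction error measured on the same matrix the SVD was computed from; it does not by itself tell us how well the model predicts ratings it has not seen.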

In [None]:
# Largest UserID and MovieID present in the ratings table.
n_users = int(rating_df['UserID'].max())
n_items = int(rating_df['MovieID'].max())

In [None]:
# Split the ratings into a training set (70%) and a test set (30%).
from sklearn.model_selection import train_test_split

# Fix the seed so that the split, and every metric computed from it, is the
# same on each Restart & Run All.
X_train, X_test = train_test_split(rating_df, test_size=0.3, random_state=42)

In [None]:
def _build_rating_matrix(ratings, n_rows, n_cols):
    """Return a dense (n_rows, n_cols) array filled with the given ratings.

    The rating of user ``u`` for movie ``m`` is stored at ``[u - 1, m - 1]``;
    every other cell stays 0. `ratings` must provide 'UserID', 'MovieID' and
    'Rating' columns.
    """
    matrix = np.zeros((n_rows, n_cols))
    rows = ratings['UserID'].to_numpy() - 1
    cols = ratings['MovieID'].to_numpy() - 1
    # One vectorized assignment replaces the row-by-row itertuples loop.
    matrix[rows, cols] = ratings['Rating'].to_numpy()
    return matrix


# Create train and test data matrices
train_data_matrix = _build_rating_matrix(X_train, n_users, n_items)
test_data_matrix = _build_rating_matrix(X_test, n_users, n_items)

In [None]:
# Reuse the rank-k reconstruction left in X_reconstructed by the SVD
# component search as the predicted ratings (no new computation happens here).
# NOTE(review): X_reconstructed was built from the matrix of all ratings, not
# only from the training split - confirm this is intended for test evaluation.
users_predicted_ratings_test = X_reconstructed

In [None]:
# Wrap the reconstructed matrix in a DataFrame whose columns are labelled like
# those of the normalized rating matrix.
predictions_df = pd.DataFrame(
    data=users_predicted_ratings_test,
    columns=rating_matrix_normalized.columns,
)


In [None]:
from scipy.sparse.linalg import svds
import numpy as np
from scipy.sparse import csr_matrix

# Store the normalized rating matrix in SciPy's CSR sparse format before
# passing it to svds. (After the mean imputation above, the matrix itself
# has no missing entries.)
rating_matrix_normalized_sparse = csr_matrix(rating_matrix_normalized)

# Truncated SVD keeping only best_k singular values and their singular vectors.
U, sigma, Vt = svds(rating_matrix_normalized_sparse, k=best_k)

# svds returns the singular values as a 1-D array; np.diag turns it into the
# 2-D diagonal matrix needed for the product U * sigma * Vt.
sigma = np.diag(sigma)


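The predicted ratings matrix is obtained by taking the product of the U, sigma and Vt matrices generated above and then adding back each user's mean rating (the users_mean_rating vector defined below), which undoes the mean-centering applied before the SVD.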

In [None]:
# Mean rating of each user, one value per row of rating_matrix.
users_mean_rating = rating_matrix.mean(axis=1).to_numpy()

# Low-rank product U * sigma * Vt, shifted back by each user's mean rating
# (the mean is turned into a column so it is added along every row).
predicted_ratings = (U @ sigma) @ Vt + users_mean_rating[:, np.newaxis]
print(predicted_ratings)

[[3.3431551 3.4591687 3.854848  ... 3.500327  3.500327  3.9999206]
 [3.9107976 3.4362347 3.2393892 ... 3.4997714 3.4997714 3.999914 ]
 [4.2928386 3.49127   3.3187377 ... 3.501267  3.501267  4.0014396]
 ...
 [2.4251127 1.9091015 1.9360336 ... 3.5003097 3.5003097 4.0002227]
 [3.7626991 3.3796704 3.2258077 ... 3.4992833 3.4992833 3.9992218]
 [5.198341  3.569571  3.1885502 ... 3.5000157 3.5000157 3.999805 ]]


In [None]:
# Wrap the prediction matrix in a DataFrame with the same column labels as
# rating_matrix; the rows keep the default 0-based index.
predictions_df = pd.DataFrame(data=predicted_ratings, columns=rating_matrix.columns)
predictions_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,3.343155,3.459169,3.854848,2.356136,3.029069,4.27973,2.854415,2.769876,3.166842,3.478853,...,3.500327,3.000734,3.999921,3.999921,3.500327,3.999921,3.500327,3.500327,3.500327,3.999921
1,3.910798,3.436235,3.239389,2.352886,3.089917,3.958582,3.183248,2.870884,3.117515,3.514419,...,3.499771,2.999629,3.999914,3.999914,3.499771,3.999914,3.499771,3.499771,3.499771,3.999914
2,4.292839,3.49127,3.318738,2.344606,3.255173,3.947667,3.228472,2.952866,3.016036,3.747823,...,3.501267,3.001095,4.00144,4.00144,3.501267,4.00144,3.501267,3.501267,3.501267,4.00144
3,4.054424,3.413001,3.161343,2.382853,3.064132,3.695491,3.159775,2.929183,3.069046,3.695372,...,3.498654,2.998737,3.99857,3.99857,3.498654,3.99857,3.498654,3.498654,3.498654,3.99857
4,3.987528,3.369135,3.208396,2.346795,3.105495,3.9603,3.18224,2.850817,3.10727,3.466077,...,3.500019,3.000066,3.999971,3.999971,3.500019,3.999971,3.500019,3.500019,3.500019,3.999971


In [None]:
# Collect the "UserID" and "MovieID" of every row of the test set, in row order.
# Each column is read once instead of doing one .iloc lookup per row in a
# Python loop.
user_ids = list(X_test["UserID"].to_numpy())
movie_ids = list(X_test["MovieID"].to_numpy())

# Result: Lists 'user_ids' and 'movie_ids' now contain all "UserID" and "MovieID" values from 'X_test', respectively.


In [None]:
# Retrieve predicted ratings from 'predictions_df' based on 'user_ids' and 'movie_ids'

# Rows of predictions_df are looked up by the label "user id - 1" and columns
# by the movie id, exactly as the per-row lookup did, but resolved for all
# test pairs at once.
row_positions = predictions_df.index.get_indexer(np.asarray(user_ids) - 1)
col_positions = predictions_df.columns.get_indexer(movie_ids)

# get_indexer marks unknown labels with -1; fail loudly instead of silently
# reading the last row/column.
if (row_positions < 0).any() or (col_positions < 0).any():
    raise KeyError('A test user or movie has no entry in predictions_df')

# NOTE: this rebinds 'predicted_ratings' (previously the full prediction
# matrix) to a flat list with one value per test row.
predicted_ratings = list(predictions_df.to_numpy()[row_positions, col_positions])

# Result: List 'predicted_ratings' now contains the predicted ratings corresponding to the 'user_ids' and 'movie_ids'.


In [None]:
# Create a binary list based on whether predicted ratings are less than 3 or not

# Resolve every test (user, movie) pair to a position in predictions_df:
# rows by the label "user id - 1", columns by the movie id.
row_positions = predictions_df.index.get_indexer(np.asarray(user_ids) - 1)
col_positions = predictions_df.columns.get_indexer(movie_ids)

# get_indexer marks unknown labels with -1; fail loudly instead of silently
# reading the last row/column.
if (row_positions < 0).any() or (col_positions < 0).any():
    raise KeyError('A test user or movie has no entry in predictions_df')

test_predictions = predictions_df.to_numpy()[row_positions, col_positions]

# 0 if the predicted rating is less than 3, otherwise 1.
binary_predictions = np.where(test_predictions < 3, 0, 1).tolist()

# Result: List 'binary_predictions' now contains binary values (0 or 1) based on whether predicted ratings are less than 3 or not.


In [None]:
# Create a binary list based on whether actual ratings are less than 3 or 3 or greater

# Classify the whole "Rating" column at once: 0 if the actual rating is less
# than 3, otherwise 1 (replaces one .iloc lookup per row in a Python loop).
binary_actual_ratings = np.where(X_test["Rating"].to_numpy() < 3, 0, 1).tolist()

# Result: List 'binary_actual_ratings' now contains binary values (0 or 1) based on whether actual ratings are less than 3 or 3 or greater.


In [None]:
# Number of test rows whose binary prediction equals the binary actual rating.
correct_predictions_count = sum(
    1
    for predicted_label, actual_label in zip(binary_predictions, binary_actual_ratings)
    if predicted_label == actual_label
)

# Accuracy is the share of correct predictions among all test rows.
accuracy = correct_predictions_count / len(binary_actual_ratings)

# Report the accuracy as a percentage.
print(f'Accuracy of the model: {accuracy * 100}')


Accuracy of the model: 90.24825625599154


Movie Recommendations

In [None]:
def movie_recommendation(predictions_df, userID, movies_df, ratings_df, n_recommendations=10):
    """Recommend the unrated movies with the highest predicted rating for one user.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Predicted ratings. The row at position ``userID - 1`` is read as the
        user's predictions, and its column labels are joined against
        ``movies_df['MovieID']`` (the column axis must be named 'MovieID').
    userID : int
        1-based user identifier.
    movies_df : pandas.DataFrame
        Movie catalogue with a 'MovieID' column.
    ratings_df : pandas.DataFrame
        Ratings with 'UserID', 'MovieID' and 'Rating' columns.
    n_recommendations : int, default 10
        Maximum number of movies to recommend.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``user_full_data``: the user's ratings merged with the movie
        information, sorted by 'Rating' in descending order.
        ``recommendations``: up to ``n_recommendations`` rows of ``movies_df``
        for movies the user has not rated, ordered by predicted rating
        (highest first); the prediction column itself is not returned.
    """
    # Rows of predictions_df are addressed by position, user ids start at 1.
    user_row = userID - 1

    # Name the Series explicitly so the merged column is always called
    # 'Predictions', whatever the row label of predictions_df happens to be.
    user_predictions = (
        predictions_df.iloc[user_row]
        .sort_values(ascending=False)
        .rename('Predictions')
    )

    # Filter the ratings_df argument itself; the original indexed it with a
    # mask built from the notebook-level `rating_df` global.
    user_data = ratings_df[ratings_df['UserID'] == userID]
    user_full_data = (
        user_data.merge(movies_df, how='left', on='MovieID')
        .sort_values(['Rating'], ascending=False)
    )

    print('Recommending the highest {0} predicted ratings movies that are not rated.'.format(n_recommendations))

    # Keep only movies the user has not rated, attach their predicted rating,
    # rank by it, and drop the trailing 'Predictions' column from the result.
    unrated_movies = movies_df[~movies_df['MovieID'].isin(user_full_data['MovieID'])]
    recommendations = (
        unrated_movies
        .merge(user_predictions.reset_index(), how='left', on='MovieID')
        .sort_values('Predictions', ascending=False)
        .iloc[:n_recommendations, :-1]
    )

    return user_full_data, recommendations

# Recommend 10 movies for user 20 and also keep the movies that user already rated.
previous_rated, recommended_movies = movie_recommendation(
    predictions_df=predictions_df,
    userID=20,
    movies_df=movie_df,
    ratings_df=rating_df,
    n_recommendations=10,
)

Recommending the highest 10 predicted ratings movies that are not rated.


In [None]:
# The 10 highest-rated movies among those the user has already rated
previous_rated.head(10)

Unnamed: 0,UserID,MovieID,Rating,Title
87,20,2300,5.0,"Producers, The (1968)"
36,20,1025,5.0,"Sword in the Stone, The (1963)"
231,20,5991,5.0,Chicago (2002)
43,20,1073,5.0,Willy Wonka & the Chocolate Factory (1971)
45,20,1097,5.0,E.T. the Extra-Terrestrial (1982)
46,20,1148,5.0,Wallace & Gromit: The Wrong Trousers (1993)
97,20,2709,5.0,Muppets From Space (1999)
50,20,1380,5.0,Grease (1978)
199,20,4995,5.0,"Beautiful Mind, A (2001)"
229,20,5952,5.0,"Lord of the Rings: The Two Towers, The (2002)"


In [None]:
# The movies recommended to the user, as returned by movie_recommendation
recommended_movies

Unnamed: 0,MovieID,Title
82,99,Heidi Fleiss: Hollywood Madam (1995)
43,53,Lamerica (1994)
4433,6983,Jane Eyre (1944)
8800,141718,Deathgasm (2015)
8804,141816,12 Chairs (1976)
7832,99636,English Vinglish (2012)
8475,126088,A Flintstones Christmas Carol (1994)
9125,162414,Moonlight
9123,162344,Tom Segura: Mostly Stories (2016)
6557,60737,Watching the Detectives (2007)
