In [1]:
import numpy as np
import pandas as pd
import sklearn
#import surprise
from surprise import Dataset, Reader, SVD
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error


In [27]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error

# Load the ratings matrix
file = "ratingmatrix_BDA.xlsx"
matrix = pd.read_excel(file, header=0, index_col=0)

# Create a mask before filling NaN values
known_ratings_mask = ~matrix.isna()

# Fill NaNs with 0 for SVD computation
ratings_matrix = matrix.fillna(0).to_numpy()

# Perform SVD
svd = TruncatedSVD(n_components=3)
U = svd.fit_transform(ratings_matrix)
sigma = np.diag(svd.singular_values_)
V = svd.components_

# Approximate the ratings matrix
approx_ratingMatrix = np.dot(U, np.dot(sigma, V))

# Convert to DataFrame
approx_df = pd.DataFrame(approx_ratingMatrix, index=matrix.index, columns=matrix.columns)

# Define styling function
def style_cell(val, is_known):
    if is_known:
        return 'font-weight: bold; color: blue;'  # Original ratings are bold and blue
    else:
        return 'font-style: italic; color: red;'  # Predicted values are italic and red

# Apply the style to each cell based on the mask
def style_row(row):
    mask_row = known_ratings_mask.loc[row.name]  # Ensure we use the original mask
    return [style_cell(val, is_known) for val, is_known in zip(row, mask_row)]

# Apply styling
styled_df = approx_df.style.apply(style_row, axis=1)

# Display the styled DataFrame
styled_df


Unnamed: 0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10
U1,11.88509,25.815675,-2.759305,28.475939,-2.317761,26.269333,18.048712,1.643756,18.507085,7.218951
U2,30.831917,40.372071,11.0863,16.47953,1.971115,4.484373,37.373161,8.035474,22.997372,24.871252
U3,27.626049,34.305664,16.732149,19.201407,10.594848,14.753024,31.322594,16.077087,19.587343,31.976433
U4,24.10979,6.639231,51.067342,-2.081125,46.936333,13.567501,13.48713,49.30924,-2.075684,67.667248
U5,1.00322,11.0896,-1.673483,28.799812,4.130475,36.42143,3.417906,6.167669,10.566954,6.391591
U6,7.892117,8.969196,12.961555,15.374057,14.683011,23.206363,6.582302,16.592657,5.832826,22.102126
U7,8.91324,15.301732,3.88741,17.467279,4.234854,18.731697,11.230798,6.761042,10.487175,11.676584
U8,4.969187,5.655549,9.372284,11.565313,11.066079,17.98535,3.816373,12.34572,3.843145,15.956766
U9,48.480575,53.598164,30.325816,13.970647,15.22901,0.248909,53.592016,23.581065,27.685263,51.653129
U10,14.169191,10.357069,17.032714,-0.419065,12.831072,-0.147998,12.545654,14.71416,3.70609,23.923631


In [33]:
# Q7
# No vacant cells only entries with values
val = ratings_matrix > 0
print(val)
# Calculate the RMSE considering only known ratings and no vacant cells
rmse = np.sqrt(mean_squared_error(ratings_matrix[val], approx_ratingMatrix[val]))
print("\nRoot Mean Squared Error (RMSE) between original and approximate matrix: ", rmse)

[[False  True False  True False  True False False  True False]
 [ True  True  True False  True False  True False  True False]
 [ True  True False  True False False  True  True  True  True]
 [False False  True False  True False False  True False  True]
 [False False False  True False  True  True False False False]
 [ True False False False  True  True False False False  True]
 [False  True False  True False  True False  True  True False]
 [False False False False False  True False False False  True]
 [ True  True  True False False False  True False False  True]
 [ True False  True False False False False  True False  True]]

Root Mean Squared Error (RMSE) between original and approximate matrix:  27.945541589566446


In [None]:
# Q8
observed = matrix.stack().reset_index()
observed.columns = ['user', 'item', 'rating']

# Create a Reader object (adjust the rating_scale as needed)
min_rating = observed['rating'].min()
max_rating = observed['rating'].max()
reader = Reader(rating_scale=(min_rating, max_rating))

# Load data for Surprise
data = Dataset.load_from_df(observed[['user', 'item', 'rating']], reader)
trainset = data.build_full_trainset()

# Train the SVD algorithm with 3 latent factors
algo = SVD(n_factors=3)
algo.fit(trainset)

# Prepare a DataFrame to store predictions for all user-item pairs
users = matrix.index
items = matrix.columns
predicted_matrix = pd.DataFrame(index=users, columns=items, dtype=float)

# Predict ratings for every user-item pair
for u in users:
    for i in items:
        predicted_matrix.loc[u, i] = algo.predict(u, i).est

# Round predictions for display clarity
predicted_matrix = predicted_matrix.round(2)

styled_df = predicted_matrix.style.apply(style_row, axis=1)

# Display the styled DataFrame
styled_df

Unnamed: 0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10
U1,2.71,2.87,2.76,2.9,2.73,2.82,2.87,2.82,2.78,2.89
U2,2.66,2.83,2.74,2.87,2.67,2.79,2.83,2.82,2.73,2.83
U3,2.48,2.64,2.49,2.64,2.48,2.67,2.6,2.55,2.46,2.58
U4,3.46,3.62,3.51,3.64,3.49,3.59,3.63,3.55,3.51,3.63
U5,2.83,2.97,2.74,2.95,2.82,3.01,2.88,2.78,2.81,2.94
U6,2.64,2.8,2.68,2.81,2.65,2.82,2.78,2.75,2.64,2.75
U7,2.35,2.51,2.37,2.5,2.37,2.54,2.5,2.41,2.32,2.44
U8,2.76,2.93,2.82,2.95,2.78,2.92,2.92,2.89,2.8,2.91
U9,3.24,3.38,3.21,3.38,3.26,3.37,3.36,3.23,3.26,3.39
U10,2.5,2.65,2.5,2.64,2.51,2.7,2.62,2.54,2.45,2.58


In [37]:
# Q9

# Calculate the RMSE considering only known ratings and no vacant cells
rmse = np.sqrt(mean_squared_error(ratings_matrix, predicted_matrix))
print("\nRoot Mean Squared Error (RMSE) between original and approximate matrix: ", rmse)


Root Mean Squared Error (RMSE) between original and approximate matrix:  2.3054385699905344


# Q10
In Q6, the SVD reconstruction was applied directly to a matrix with missing values filled as zeros, resulting in a high RMSE of 27. In contrast, Q8 used the Surprise package's SVD, which is optimized for collaborative filtering on only the known ratings, achieving an RMSE of 2.305. This significant RMSE reduction suggests that Q8’s predictions for originally blank cells are much more accurate. In summary, the collaborative filtering approach in Q8 offers a more reliable estimation of missing ratings.

In [39]:
# Collect recommendations in a list
recommendations_list = []

# For each user, select the top two movies with the highest predicted ratings among the originally blank cells
for user in predicted_matrix.index:
    # Only consider movies that were originally not rated
    unrated = predicted_matrix.loc[user][~known_ratings_mask.loc[user]]
    top_two = unrated.sort_values(ascending=False).head(2)
    for movie, rating in top_two.items():
        recommendations_list.append({"User": user, "Movie": movie, "Predicted Rating": rating})

# Convert list to DataFrame
recommendations = pd.DataFrame(recommendations_list)
print(recommendations)


   User Movie  Predicted Rating
0    U1   M10              2.89
1    U1    M7              2.87
2    U2    M4              2.87
3    U2   M10              2.83
4    U3    M6              2.67
5    U3    M3              2.49
6    U4    M4              3.64
7    U4    M7              3.63
8    U5    M2              2.97
9    U5   M10              2.94
10   U6    M4              2.81
11   U6    M2              2.80
12   U7    M7              2.50
13   U7   M10              2.44
14   U8    M4              2.95
15   U8    M2              2.93
16   U9    M4              3.38
17   U9    M6              3.37
18  U10    M6              2.70
19  U10    M2              2.65


These recommendations are reasonable because they come from a model with a low RMSE (2.305), indicating accurate predictions. The model is tailored to fit the known ratings, so the highest predicted values for unrated movies are likely to reflect the user's true preferences. However, practical recommendations may also consider other factors like genre or diversity.

# Q12


In [40]:
# Collect recommendations for each movie in a list
movie_recommendations = []

# For each movie, select the top two users (that originally did not rate the movie)
for movie in predicted_matrix.columns:
    # Only consider users that did not originally rate the movie
    unrated_users = predicted_matrix[movie][~known_ratings_mask[movie]]
    # Sort by predicted rating (highest first) and select the top two
    top_two_users = unrated_users.sort_values(ascending=False).head(2)
    for user, rating in top_two_users.items():
        movie_recommendations.append({
            "Movie": movie,
            "User": user,
            "Predicted Rating": rating
        })

# Convert the list to a DataFrame and display it
movie_recommendations_df = pd.DataFrame(movie_recommendations)
print(movie_recommendations_df)


   Movie User  Predicted Rating
0     M1   U4              3.46
1     M1   U5              2.83
2     M2   U4              3.62
3     M2   U5              2.97
4     M3   U8              2.82
5     M3   U1              2.76
6     M4   U4              3.64
7     M4   U9              3.38
8     M5   U9              3.26
9     M5   U5              2.82
10    M6   U4              3.59
11    M6   U9              3.37
12    M7   U4              3.63
13    M7   U8              2.92
14    M8   U9              3.23
15    M8   U8              2.89
16    M9   U4              3.51
17    M9   U9              3.26
18   M10   U5              2.94
19   M10   U1              2.89


If the predicted ratings for the recommended users are high, this indicates that the model expects these users to have a strong interest in the movie, making the recommendations reasonable. The model’s low RMSE (2.305) supports that the predictions are reliable. However, practical recommendations might also consider additional factors such as genre preferences or viewing history for even better personalization.

Q13:
a. The data would be split across each of the servers, as there is likely not enough space for each server to hold the data individually.
Ideally, each server would only hold the piece of the data it intends to predict to minimize how often servers communicate.
b. The P matrix would be divided between each server, in such a way that every multiplication within P and Q is possible within a server.
Duplicate parts of the P and Q matricies can exist across servers to ensure this.
(For example, with four servers, two servers would have the first half of P and two would have the second half of P. One server each from
those groups would have the first and second halves of Q, so each server can calculate 1/4th of the total data).
c. The Q matrix would be stored similarly to the P matrix, as described above. The result is that the full matrix of predicted ratings
can be computed across each of the servers.
d. After finding the error given by each prediction, each server would record how the P and Q matrix would change as a result of each prediction. 
Once every server completes this process, the changes would be communicated and combined with each other to create a consistent P and Q matrix across all servers.
e. The mapper function would run SVD on each server's data, get a list of predicted ratings, use them to calculate error compared to the rating
matrix R (using whichever loss function is chosen), and calculate how each weight within P and Q would be changed as a result. (For this output, 
the key would be the index of the weight, and the value would be how that weight's value is updated as a result of a single prediction.)
f. The reducer function would collect each weight's update and combine them, along with the weight's current value, to output the updated weight for the matrix.