In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
import time
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import minmax_scale

In [4]:
ratings_df = pd.read_csv("./ratings.csv")
movies_df  = pd.read_csv("./item.csv")
users_df  = pd.read_csv("./user.csv")


In [8]:
users_df.head(5)

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [9]:
ratings_df.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [10]:
movies_df.head(5)

Unnamed: 0,movie_id,movie_name,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


###  Splitting the Dataset

In [14]:
X = ratings_df.drop(columns='rating')
y = ratings_df["rating"].values  # The movie ratings are the target variables we want to predict

# Preparing train, validation and test datasets.
# I have chosen a split ratio of 70%, 15%, 15%, because I want a somewhat large training set at the cost of a
# smaller validation and test set. I do not think that a smaller validation (or test) dataset will negatively
# impact the generalization ability of the chosen models, because I am only using rather simple ML models
# with few hyperparamaters.
X_train, X_val_and_test, y_train, y_val_and_test = train_test_split(X, y, test_size=0.3, random_state=101)
X_val, X_test, y_val, y_test = train_test_split(X_val_and_test, y_val_and_test, test_size=0.5, random_state=101)

# Creating a complete training dataset with X_train and y_train
train_df = X_train.copy()
train_df["rating"] = y_train

train_df

Unnamed: 0,user_id,movie_id,unix_timestamp,rating
64304,648,28,884628437,5
78145,805,559,881695347,3
19755,222,1057,881061370,4
78591,811,678,886377686,5
42070,409,1449,881107817,5
...,...,...,...,...
5695,59,655,888204642,5
73542,752,316,891208329,3
83281,870,461,875680099,4
83467,871,510,888193335,3


### The Baseline Model

In [15]:
# Creating a new dataframe with the average rating for each movie. 
# The "prediction" for the baseline "model" will actually just be these averages for each movie.
baseline_y_pred = pd.DataFrame(train_df.groupby('movie_id')['rating'].mean())

# The baseline model has not yet calculated an average for the movies (if any) with no ratings. 
# We therefore add these movies to the model with a predicted rating equal to the average rating 
# for all movies in the training dataset.
# ADD SOME CODE HERE!!

# Creating a dataframe for the validation data (y_true) in order to calculate RMSE of the baseline model
val_movies_dict = {'movie_id': X_val["movie_id"], 'Actual rating': y_val}  
val_movies_df = pd.DataFrame(val_movies_dict)

# Merging the training and validation datasets on the movie ID
y_pred_and_y_true = pd.merge(baseline_y_pred, val_movies_df, on='movie_id')
baseline_y_pred_vs_y_true = y_pred_and_y_true.rename(columns={"rating": "Predicted rating"})

baseline_y_pred_vs_y_true

Unnamed: 0,movie_id,Predicted rating,Actual rating
0,1,3.857143,5
1,1,3.857143,4
2,1,3.857143,4
3,1,3.857143,5
4,1,3.857143,5
...,...,...,...
14966,1610,3.000000,2
14967,1610,3.000000,1
14968,1615,2.833333,3
14969,1615,2.833333,4


In [16]:
print("RMSE baseline model: ", sqrt(mean_squared_error(baseline_y_pred_vs_y_true["Predicted rating"], 
                                                       baseline_y_pred_vs_y_true["Actual rating"])))

RMSE baseline model:  1.0198186504959492


### Content Based Filtering

In [17]:
# Adding the movie features (genre, release year) to the training dataset
content_train_df = pd.merge(train_df, movies_df, on='movie_id')
content_train_df.drop(columns=['unix_timestamp', 'movie_id', 'movie_name','release date','video release date','IMDb URL','unknown'], inplace=True)  # Remove useless features

content_train_df


Unnamed: 0,user_id,rating,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,648,5,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
1,292,4,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
2,210,4,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
3,757,3,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
4,163,3,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,405,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
69996,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
69997,181,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0
69998,405,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
# Creating a list of lists with the target attribute (movie ratings), grouped by userID
y_grouped_by_user = content_train_df.groupby(["user_id"])
y_train_listed = []

for i, j in y_grouped_by_user:
    y_train_listed.append(j["rating"].values)  # Extract target feature (ratings) from sorted data
    
# Target attributes for the first user
y_train_listed[0]

array([4, 5, 5, 4, 3, 4, 5, 2, 3, 4, 3, 5, 4, 1, 4, 4, 4, 5, 3, 3, 5, 1,
       5, 5, 4, 3, 4, 4, 5, 3, 1, 5, 4, 5, 2, 3, 4, 5, 4, 5, 4, 3, 5, 1,
       2, 4, 4, 5, 3, 4, 3, 4, 5, 4, 5, 3, 4, 3, 5, 4, 2, 3, 3, 2, 4, 3,
       3, 5, 5, 5, 4, 5, 5, 5, 4, 5, 1, 5, 5, 5, 4, 4, 3, 2, 3, 2, 3, 5,
       4, 4, 5, 3, 5, 4, 3, 4, 2, 1, 3, 5, 1, 4, 4, 3, 5, 5, 4, 5, 5, 3,
       5, 4, 3, 4, 2, 5, 5, 2, 4, 4, 5, 4, 4, 2, 4, 4, 3, 3, 3, 4, 3, 4,
       5, 3, 2, 4, 4, 4, 1, 3, 5, 3, 3, 4, 2, 5, 5, 4, 2, 5, 1, 4, 5, 4,
       1, 4, 5, 3, 3, 3, 2, 5, 1, 4, 4, 2, 2, 4, 4, 4, 4, 3, 3, 3, 1, 4,
       2, 1, 4, 1, 4, 4, 5, 4, 1, 4, 5, 1, 2, 1, 1, 5, 1], dtype=int64)

In [20]:
# Creating a list of dataframes with the feature set (movie info about genres and release year), grouped by userID
content_train_df.drop(columns='rating', inplace=True)
x_grouped_by_user = content_train_df.groupby(["user_id"])
x_train_listed = []

for user_id, group in x_grouped_by_user:
    x_train_listed.append(group.drop(columns='user_id'))
    
# The feature set for the first user
x_train_listed[0]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
31,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
918,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
1330,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
1539,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
1975,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68972,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
69048,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
69441,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
69543,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [None]:
# Creating a new dataframe for the movies data
movies_df = movies_df.drop(columns=['release date', 'video release date','IMDb URL','unknown'])


In [36]:
all_movies = movies_df.drop(columns=['movie_name', 'movie_id'])
all_movies

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [38]:
# ----------- PREPARING VALIDATION DATA ----------- #
# Creating a 2 dimensional matrix for the validation data in order to make it easier to calculate RMSE.

# Listing the user ID's in the same order as in the grouped dataframes
user_ids = []
for user_id, group in x_grouped_by_user:
    user_ids.append(user_id)
    
# Listing the movie IDs in the same order as in the movies dataset
movie_ids = movies_df["movie_id"].values

# Creating the matrix. Axis 1: User IDs, Axis 2: Movie IDs. Elements: True ratings from validation data
df_val = X_val.copy()
df_val["rating"] = y_val
validation_matrix = pd.DataFrame(index=user_ids, columns=movie_ids)  # Starting with an empty matrix
for array in df_val.to_records():  # Filling in the true ratings as elements
    user = array['user_id']
    movie = array['movie_id']
    true_rating = array['rating']
    validation_matrix.loc[user][movie] = true_rating
    
validation_matrix

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,5,,4,,,5,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,,,,4,,,,...,,,,,,,,,,
941,,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [39]:
# ----------- CREATING THE DIFFERENT CONTENT-BASED FILTERING MODELS ----------- #
# Assigning the different machine learning algorithms to be implemented in the models (incl. hyperparameters) to a dictionary
ml_algorithms = {"Linear regression": LinearRegression(), "Lasso": Lasso(alpha=1.0, max_iter=10000), 
                 "KNN_7": KNeighborsRegressor(n_neighbors=7),
                 "RFR": RandomForestRegressor(n_estimators=1000, n_jobs=3, max_features="auto", random_state=0),
                 "SVR": SVR(C=1.0)}

# Saving lists that I later use to construct a dataframe containing the performances of the models
CBF_models_listed = []
RMSE_CBF_listed = []

# For every machine learning algorithm in the dictionary:
for name, ml_alg in ml_algorithms.items():
    # Create an empty list for predictions
    CBF_predictions = []

    # For each user in the training dataset:
    for i, x in enumerate(x_train_listed):
        # Fit a machine learning model
        ml_alg.fit(x_train_listed[i], y_train_listed[i])
        # Predict all the ratings for this user for all movies
        prediction = ml_alg.predict(all_movies)
        prediction = np.clip(prediction, 1, 5)  # Predictions must be minimum 1, maximum 5
        # Append all the predictions to the predictions list
        CBF_predictions.append(prediction)

    # Create a dataframe with the predictions
    df_predict = pd.DataFrame(CBF_predictions, index=user_ids, columns=movie_ids)

    # Create a dataframe with only the predictions for the movies-user combinations that appear in the validation set
    num_actual = validation_matrix.to_numpy().flatten()[validation_matrix.notna().to_numpy().flatten()]
    num_predict = df_predict.to_numpy().flatten()[validation_matrix.notna().to_numpy().flatten()]

    # Calculate the RMSE for the content-based filtering model and add the result to the lists
    RMSE_CBF_listed.append(sqrt(mean_squared_error(num_predict, num_actual)))
    CBF_models_listed.append(name)


# Printing the results
RMSE_CBF_df = pd.DataFrame({"Model": CBF_models_listed, "RMSE": RMSE_CBF_listed})
print("RMSE of different content-based filtering models without the year of release feature:")
RMSE_CBF_df

RMSE of different content-based filtering models without the year of release feature:


Unnamed: 0,Model,RMSE
0,Linear regression,1.116877
1,Lasso,1.039847
2,KNN_7,1.061105
3,RFR,1.093032
4,SVR,1.062908


In [None]:
# ----------- CREATING THE DIFFERENT CONTENT-BASED FILTERING MODELS ----------- #
# Assigning the different machine learning algorithms to be implemented in the models (incl. hyperparameters) to a dictionary
ml_algorithms = {"Linear regression": LinearRegression(), "Lasso": Lasso(alpha=1.0, max_iter=10000), 
                 "KNN_7": KNeighborsRegressor(n_neighbors=7),
                 "RFR": RandomForestRegressor(n_estimators=1000, n_jobs=3, max_features="auto", random_state=0),
                 "SVR": SVR(C=1.0)}

# Saving lists that I later use to construct a dataframe containing the performances of the models
CBF_models_listed = []
RMSE_CBF_listed = []

# For every machine learning algorithm in the dictionary:
for name, ml_alg in ml_algorithms.items():
    # Create an empty list for predictions
    CBF_predictions = []

    # For each user in the training dataset:
    for i, x in enumerate(x_train_listed):
        # Fit a machine learning model
        ml_alg.fit(x_train_listed[i], y_train_listed[i])
        # Predict all the ratings for this user for all movies
        prediction = ml_alg.predict(all_movies)
        prediction = np.clip(prediction, 1, 5)  # Predictions must be minimum 1, maximum 5
        # Append all the predictions to the predictions list
        CBF_predictions.append(prediction)

    # Create a dataframe with the predictions
    df_predict = pd.DataFrame(CBF_predictions, index=user_ids, columns=movie_ids)

    # Create a dataframe with only the predictions for the movies-user combinations that appear in the validation set
    num_actual = validation_matrix.to_numpy().flatten()[validation_matrix.notna().to_numpy().flatten()]
    num_predict = df_predict.to_numpy().flatten()[validation_matrix.notna().to_numpy().flatten()]

    # Calculate the RMSE for the content-based filtering model and add the result to the lists
    RMSE_CBF_listed.append(sqrt(mean_squared_error(num_predict, num_actual)))
    CBF_models_listed.append(name)


# Printing the results
RMSE_CBF_df = pd.DataFrame({"Model": CBF_models_listed, "RMSE": RMSE_CBF_listed})
print("RMSE of different content-based filtering models without the year of release feature:")
RMSE_CBF_df

### Save the best result

In [40]:
# Running the best content-based filtering model so far
model = Lasso(alpha=1.0, max_iter=10000)
CBF_predictions = []

# For each user in the training dataset:
for i, j in enumerate(x_train_listed):
    model.fit(x_train_listed[i], y_train_listed[i])
    prediction = model.predict(all_movies)
    prediction = np.clip(prediction, 1, 5)
    CBF_predictions.append(prediction)

# Creating a dataframe for the predictions
CBF_model = pd.DataFrame(CBF_predictions, index=user_ids, columns=movie_ids)

### Collaborative Filtering

In [42]:

# Creating a 2D matrix (user ID vs movie ID) with the ratings as elements
user_matrix = train_df.pivot(index='user_id', columns='movie_id', values='rating')

# I subtract each user's average rating to magnify individual preferences
user_matrix = user_matrix.sub(user_matrix.mean(axis=1), axis=0)

# Replace NaN with 0.0, as this is now the "neutral" value
user_matrix = user_matrix.fillna(0.0)

In [43]:
# I calculate the Pearson Correlation between each user,
# and subtract this from 1 to get the Pearson Distance between users
user_dist_matrix = 1 - user_matrix.T.corr()
user_dist_matrix

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.978116,0.964909,0.984445,0.970848,0.920537,0.931717,0.918086,1.007981,0.993822,...,0.953670,1.000719,0.931254,0.998334,0.987828,0.876970,0.909131,0.953273,1.025258,1.027294
2,0.978116,0.000000,0.985406,0.980388,0.993759,0.930890,0.950789,0.986840,0.984649,0.884260,...,0.977414,1.031253,0.949564,0.819471,0.906434,0.938678,0.954679,0.974755,0.993737,0.956389
3,0.964909,0.985406,0.000000,1.122934,0.976819,1.026994,0.966816,0.958702,1.005466,0.978109,...,0.998783,1.001004,1.044361,0.971690,0.981523,1.000000,0.994662,0.934526,1.018069,1.000583
4,0.984445,0.980388,1.122934,0.000000,1.000000,1.000000,1.045278,0.991412,1.000000,0.996861,...,1.008273,0.997353,0.981712,0.979529,1.022358,1.000000,0.897754,1.017227,0.963526,1.018323
5,0.970848,0.993759,0.976819,1.000000,0.000000,0.983555,0.994874,0.963088,0.963671,1.037647,...,0.925794,1.096582,0.978171,0.952636,0.960973,0.954551,0.955237,0.949200,0.976845,0.917620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.876970,0.938678,1.000000,1.000000,0.954551,1.032048,0.990778,0.960310,1.000000,0.971099,...,0.998758,0.954452,0.901341,1.009258,1.012978,0.000000,1.000414,0.977714,1.006409,0.984494
940,0.909131,0.954679,0.994662,0.897754,0.955237,0.977369,0.988643,0.934923,0.990877,0.996739,...,1.027322,0.950554,1.036141,1.016649,0.986431,1.000414,0.000000,0.806063,0.978392,0.995456
941,0.953273,0.974755,0.934526,1.017227,0.949200,0.956598,1.015160,0.835026,0.996389,0.961019,...,1.043303,1.058905,0.954570,0.965438,0.981935,0.977714,0.806063,0.000000,1.010354,1.000994
942,1.025258,0.993737,1.018069,0.963526,0.976845,0.926960,0.937370,0.932510,0.969219,0.900708,...,0.946824,0.999173,1.007401,0.968212,0.975955,1.006409,0.978392,1.010354,0.000000,0.939609


In [44]:
ml_algorithms = { 'kNN-40': 40}

models_CF = []
RMSE_CF = []

# Training the models and predicting for the users and movies in the validation data
for name, num_neighbours in ml_algorithms.items():
    predictions = []

    # For every rating in the validation data
    for index, row in X_val.iterrows():
        # If the movie is in the training data
        if row["movie_id"] in X_train["movie_id"].unique():
            # Extract all user ID's for users who have rated the movie
            users_rated_movie = X_train.loc[X_train['movie_id'] == row['movie_id'], 'user_id']
            # Sort these users by similarity (Pearson distance)
            users_sorted = (user_dist_matrix.loc[row['user_id'], users_rated_movie].sort_values())
            # Select the nearest neighbours
            nearest_neighbours = users_sorted[:num_neighbours]
            # Extract the nearest neighbours' ratings data
            nn_data = train_df.loc[train_df['user_id'].isin(nearest_neighbours.index.to_list())]
            # Calculate the weighted average of the nearest neighbours' ratings
            nearest_neighbours_avg_rating = np.average(nn_data.loc[train_df['movie_id'] == row['movie_id'], 'rating'],
                                                       axis=0, weights=(1/nearest_neighbours))
        else:
            # There is a small chance that a few movies in the validation set might not appear in the training set.
            # I therefore predict that the user will rate these movies with the average rating for all movies
            nearest_neighbours_avg_rating = 4   # Must be changed!

        # Appending the prediction to the list of predictions
        if not np.isnan(nearest_neighbours_avg_rating):
            predictions.append(nearest_neighbours_avg_rating)
        else:
            predictions.append(3)

    models_CF.append(name)
    RMSE_CF.append(sqrt(mean_squared_error(y_val, predictions)))


# Displaying the results
RMSE_CF_dict = {"Model": models_CF, "RMSE": RMSE_CF}
RMSE_CF_df = pd.DataFrame(RMSE_CF_dict)
RMSE_CF_df

Unnamed: 0,Model,RMSE
0,kNN-40,1.0106


In [45]:
# Rerunning the best model so far (kNN-40) and storing the prediction results
best_CF_model = []
RMSE_best_CF = []

# Training the models and predicting for the users and movies in the validation data
CF_predictions = []

# For every movie in the validation data
for index, row in X_val.iterrows():
    # If that movie is in the training data
    if row["movie_id"] in X_train["movie_id"].unique():
        # Extract all user ID's for users who have rated the movie
        users_rated_movie = X_train.loc[X_train['movie_id'] == row['movie_id'], 'user_id']
        # Sort these users by similarity (Pearson distance)
        users_sorted = (user_dist_matrix.loc[row['user_id'], users_rated_movie].sort_values())
        # Select the nearest neighbours
        nearest_neighbours = users_sorted[:40]
        # Extract the nearest neighbours' ratings data
        nn_data = train_df.loc[train_df['user_id'].isin(nearest_neighbours.index.to_list())]
        # Calculate the weighted average of the nearest neighbours' ratings
        nearest_neighbours_avg_rating = np.average(nn_data.loc[train_df['movie_id'] == row['movie_id'], 'rating'],
                                                   axis=0, weights=(1/nearest_neighbours))
    else:
        # There is a small chance that a few movies in the validation set might not appear in the training set.
        # I therefore predict that the user will rate these movies with the average rating for all movies
        nearest_neighbours_avg_rating = 4   # Must be changed!

    # Appending the prediction to the list of predictions
    if not np.isnan(nearest_neighbours_avg_rating):
        CF_predictions.append(nearest_neighbours_avg_rating)
    else:
        CF_predictions.append(4)

In [46]:
# Extracting the validation prediction from the CBF dataframe containing all predictions
CBF_predictions = []
for index, row in X_val.iterrows():
    user_predictions = CBF_model.loc[row["user_id"], row["movie_id"]]
    CBF_predictions.append(user_predictions)
    

# Calculating the predictions for the different hybrid "models": different weighted averages of CF and CBF filtering
print("RMSE combined approach (Lasso and KNN-40):")
weighted_avgs = [(0.5, 0.5), (0.45, 0.55), (0.4, 0.6), (0.35, 0.65), (0.3, 0.7), (0.25, 0.75), (0.20, 0.80)]
  
for weight in weighted_avgs:
    combined_predictions = np.array([y_pred * weight[0] for y_pred in np.array(CBF_predictions)]) + np.array([y_pred * weight[1] for y_pred in np.array(CF_predictions)])
    print(f"RMSE for combined approach with CBF weighted {weight[0]} and CF weighted {weight[1]}: \n",
          sqrt(mean_squared_error(y_val, combined_predictions)), "\n")

RMSE combined approach (Lasso and KNN-40):
RMSE for combined approach with CBF weighted 0.5 and CF weighted 0.5: 
 0.9686672506297125 

RMSE for combined approach with CBF weighted 0.45 and CF weighted 0.55: 
 0.967702206410161 

RMSE for combined approach with CBF weighted 0.4 and CF weighted 0.6: 
 0.9679041774370927 

RMSE for combined approach with CBF weighted 0.35 and CF weighted 0.65: 
 0.9692724341844091 

RMSE for combined approach with CBF weighted 0.3 and CF weighted 0.7: 
 0.9718020504196714 

RMSE for combined approach with CBF weighted 0.25 and CF weighted 0.75: 
 0.97548399130406 

RMSE for combined approach with CBF weighted 0.2 and CF weighted 0.8: 
 0.9803052728317004 

