In [1]:
# ----------------------------------- NOTEBOOK SETUP --------------------------------------------------------- #
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
import time
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import minmax_scale

# Setting plot style
plt.style.use('ggplot')

In [2]:
mnames = ['movie_id', 'title', 'genre']
movies_df = pd.read_table('ml-1m/movies.dat', names = mnames, sep = "::", engine = 'python', encoding='ISO-8859-1')

In [3]:
# Loading the cleaned datasets
rnames = ['user_id','movie_id','rating','timestamp']
ratings_df = pd.read_table("ml-1m/ratings.dat", header =None, sep='::',names=rnames, engine= 'python')

In [4]:
uname = ['user_id','gender','age','occupation','zip']
users_df = pd.read_table("ml-1m/users.dat", sep='::', header = None, names=uname, engine='python')

In [5]:
users_df.head()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [35]:
# Splitting the ratings dataset into the feature set (X) and target labels (y)
X = ratings_df.drop(columns='rating')
y = ratings_df["rating"].values  # The movie ratings are the target variables we want to predict

# Preparing train, validation and test datasets.
# I have chosen a split ratio of 80%, 10%, 10%, because I want a somewhat large training set at the cost of a
# smaller validation and test set. I do not think that a smaller validation (or test) dataset will negatively
# impact the generalization ability of the chosen models, because I am only using rather simple ML models
# with few hyperparamaters.
X_train, X_val_and_test, y_train, y_val_and_test = train_test_split(X, y, test_size=0.2, random_state=101)
X_val, X_test, y_val, y_test = train_test_split(X_val_and_test, y_val_and_test, test_size=0.5, random_state=101)

# Creating a complete training dataset with X_train and y_train
train_df = X_train.copy()
train_df["rating"] = y_train

train_df

Unnamed: 0,user_id,movie_id,timestamp,rating
760421,4517,1005,964864666,2
909559,5501,1086,959795392,4
214673,1303,2280,974835482,3
677323,4059,2248,965490104,3
363438,2116,54,983027913,3
...,...,...,...,...
661055,3985,365,965627754,2
204614,1260,2334,976138310,4
476497,2926,517,1006659095,2
214539,1302,185,993338245,5


In [36]:
# Creating a new dataframe with the average rating for each movie. 
# The "prediction" for the baseline "model" will actually just be these averages for each movie.
baseline_y_pred = pd.DataFrame(train_df.groupby('movie_id')['rating'].mean())

# The baseline model has not yet calculated an average for the movies (if any) with no ratings. 
# We therefore add these movies to the model with a predicted rating equal to the average rating 
# for all movies in the training dataset.
# ADD SOME CODE HERE!!

# Creating a dataframe for the validation data (y_true) in order to calculate RMSE of the baseline model
val_movies_dict = {'movie_id': X_val["movie_id"], 'Actual rating': y_val}  
val_movies_df = pd.DataFrame(val_movies_dict)

# Merging the training and validation datasets on the movie ID
y_pred_and_y_true = pd.merge(baseline_y_pred, val_movies_df, on='movie_id')
baseline_y_pred_vs_y_true = y_pred_and_y_true.rename(columns={"rating": "Predicted rating"})

baseline_y_pred_vs_y_true


Unnamed: 0,movie_id,Predicted rating,Actual rating
0,1,4.153341,3
1,1,4.153341,4
2,1,4.153341,5
3,1,4.153341,1
4,1,4.153341,4
...,...,...,...
99997,3952,3.790323,4
99998,3952,3.790323,5
99999,3952,3.790323,4
100000,3952,3.790323,3


In [37]:
# calculating RMSE for the baseline model
print("RMSE baseline model: ", sqrt(mean_squared_error(baseline_y_pred_vs_y_true["Predicted rating"], 
                                                       baseline_y_pred_vs_y_true["Actual rating"])))

RMSE baseline model:  0.9811184537429131


In [38]:
movies_df = pd.read_csv("films.csv")
movies_df.head()

Unnamed: 0,movie_id,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [39]:
# ----------- PREPARING TRAINING DATA ----------- #
# Adding the movie features (genre, release year) to the training dataset
content_train_df = pd.merge(train_df, movies_df, on='movie_id')
content_train_df.drop(columns=['timestamp', 'movie_id', 'title'], inplace=True)  # Remove useless features

content_train_df

Unnamed: 0,user_id,rating,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,4517,2,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1460,2,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,6035,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1969,3,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5096,2,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800162,4169,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
800163,1470,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
800164,1291,3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
800165,3575,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [40]:
# Creating a list of lists with the target attribute (movie ratings), grouped by userID
y_grouped_by_user = content_train_df.groupby(["user_id"])
y_train_listed = []

for i, j in y_grouped_by_user:
    y_train_listed.append(j["rating"].values)  # Extract target feature (ratings) from sorted data
    
# Target attributes for the first user
y_train_listed[0]

  for i, j in y_grouped_by_user:


array([4, 5, 4, 5, 4, 4, 4, 5, 5, 4, 5, 5, 5, 4, 3, 4, 4, 4, 5, 5, 5, 5,
       4, 3, 3, 4, 5, 4, 4, 4, 4, 4, 4, 5, 4, 4, 3, 3, 4, 3, 4, 5, 4])

In [41]:
# Creating a list of dataframes with the feature set (movie info about genres and release year), grouped by userID
content_train_df.drop(columns='rating', inplace=True)
x_grouped_by_user = content_train_df.groupby(["user_id"])
x_train_listed = []

for user_id, group in x_grouped_by_user:
    x_train_listed.append(group.drop(columns='user_id'))
    
# The feature set for the first user
x_train_listed[0]

  for user_id, group in x_grouped_by_user:


Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
12868,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
22917,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
24237,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
36751,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
39487,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
50864,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
57357,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
63999,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
73485,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
78119,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [42]:
# Creating a new dataframe for the movies data
all_movies = movies_df.drop(columns=['title', 'movie_id'])
all_movies.head()

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [43]:
# ----------- PREPARING VALIDATION DATA ----------- #
# Creating a 2 dimensional matrix for the validation data in order to make it easier to calculate RMSE.

# Listing the user ID's in the same order as in the grouped dataframes
user_ids = []
for user_id, group in x_grouped_by_user:
    user_ids.append(user_id)
    
# Listing the movie IDs in the same order as in the movies dataset
movie_ids = movies_df["movie_id"].values

# Creating the matrix. Axis 1: User IDs, Axis 2: Movie IDs. Elements: True ratings from validation data
df_val = X_val.copy()
df_val["rating"] = y_val
validation_matrix = pd.DataFrame(index=user_ids, columns=movie_ids)  # Starting with an empty matrix
for array in df_val.to_records():  # Filling in the true ratings as elements
    user = array['user_id']
    movie = array['movie_id']
    true_rating = array['rating']
    validation_matrix.loc[user][movie] = true_rating
    
validation_matrix

  for user_id, group in x_grouped_by_user:


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
1,5,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,,,,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [44]:
# ----------- CREATING THE DIFFERENT CONTENT-BASED FILTERING MODELS ----------- #
# Assigning the different machine learning algorithms to be implemented in the models (incl. hyperparameters) to a dictionary
ml_algorithms = {"Linear regression": LinearRegression(), "Lasso": Lasso(alpha=1.0, max_iter=10000), 
                 "KNN_7": KNeighborsRegressor(n_neighbors=7),
                 "SVR": SVR(C=1.0)}

# Saving lists that I later use to construct a dataframe containing the performances of the models
CBF_models_listed = []
RMSE_CBF_listed = []

# For every machine learning algorithm in the dictionary:
for name, ml_alg in ml_algorithms.items():
    # Create an empty list for predictions
    print(name)
    CBF_predictions = []

    # For each user in the training dataset:
    for i, x in enumerate(x_train_listed):
        # Fit a machine learning model
        ml_alg.fit(x_train_listed[i], y_train_listed[i])
        # Predict all the ratings for this user for all movies
        prediction = ml_alg.predict(all_movies)
        prediction = np.clip(prediction, 1, 5)  # Predictions must be minimum 1, maximum 5
        # Append all the predictions to the predictions list
        CBF_predictions.append(prediction)

    # Create a dataframe with the predictions
    df_predict = pd.DataFrame(CBF_predictions, index=user_ids, columns=movie_ids)

    # Create a dataframe with only the predictions for the movies-user combinations that appear in the validation set
    num_actual = validation_matrix.to_numpy().flatten()[validation_matrix.notna().to_numpy().flatten()]
    num_predict = df_predict.to_numpy().flatten()[validation_matrix.notna().to_numpy().flatten()]

    # Calculate the RMSE for the content-based filtering model and add the result to the lists
    RMSE_CBF_listed.append(sqrt(mean_squared_error(num_predict, num_actual)))
    CBF_models_listed.append(name)


# Printing the results
RMSE_CBF_df = pd.DataFrame({"Model": CBF_models_listed, "RMSE": RMSE_CBF_listed})
print("RMSE of different content-based filtering models without the year of release feature:")
RMSE_CBF_df

Linear regression
Lasso


  model = cd_fast.enet_coordinate_descent(


KNN_7
SVR
RMSE of different content-based filtering models without the year of release feature:


Unnamed: 0,Model,RMSE
0,Linear regression,1.054922
1,Lasso,1.035853
2,KNN_7,1.05203
3,SVR,1.044284


In [45]:
# Running the best content-based filtering model so far
model = Lasso(alpha=1.0, max_iter=10000)
CBF_predictions = []

# For each user in the training dataset:
for i, j in enumerate(x_train_listed):
    model.fit(x_train_listed[i], y_train_listed[i])
    prediction = model.predict(all_movies)
    prediction = np.clip(prediction, 1, 5)
    CBF_predictions.append(prediction)

# Creating a dataframe for the predictions
CBF_model = pd.DataFrame(CBF_predictions, index=user_ids, columns=movie_ids)

  model = cd_fast.enet_coordinate_descent(


In [46]:
train_df.head()


Unnamed: 0,user_id,movie_id,timestamp,rating
760421,4517,1005,964864666,2
909559,5501,1086,959795392,4
214673,1303,2280,974835482,3
677323,4059,2248,965490104,3
363438,2116,54,983027913,3


In [47]:
# DATA PREPROCESSING: Calculating the Pearson Distance between all users in the training data
# Creating a 2D matrix (user ID vs movie ID) with the ratings as elements
user_matrix = train_df.pivot(index='user_id', columns='movie_id', values='rating')

# I subtract each user's average rating to magnify individual preferences
user_matrix = user_matrix.sub(user_matrix.mean(axis=1), axis=0)

# Replace NaN with 0.0, as this is now the "neutral" value
user_matrix = user_matrix.fillna(0.0)

In [49]:
# I calculate the Pearson Correlation between each user,
# and subtract this from 1 to get the Pearson Distance between users
user_dist_matrix = 1 - user_matrix.T.corr()
user_dist_matrix

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.953554,1.040537,1.016077,1.006932,0.969456,1.000000,0.986149,0.966970,0.990813,...,0.966997,0.948076,1.001745,1.000000,0.937986,1.022136,1.015531,1.000000,0.962772,0.989396
2,0.953554,0.000000,1.010484,1.038209,1.016726,1.010309,0.973498,0.941440,0.994384,0.982609,...,0.988552,1.015605,0.916226,1.000000,0.945344,0.930372,0.959419,0.996753,0.992242,1.029620
3,1.040537,1.010484,0.000000,0.946871,1.031079,1.006156,0.957689,1.011748,0.987033,0.941510,...,1.020780,0.989514,1.005240,1.000000,0.973374,0.949610,0.973024,0.956441,0.978812,1.029996
4,1.016077,1.038209,0.946871,0.000000,0.999498,0.959890,1.087608,0.979143,1.003410,1.045993,...,1.035208,0.998356,1.018926,1.000000,0.980234,0.957336,0.974804,1.074741,1.008310,0.987530
5,1.006932,1.016726,1.031079,0.999498,0.000000,1.027194,1.010804,0.957314,0.980020,1.012144,...,0.976724,0.983526,0.979666,0.968307,0.946191,0.967550,1.029275,1.001044,0.980991,0.959657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,1.022136,0.930372,0.949610,0.957336,0.967550,1.004744,0.988310,0.928967,0.945256,0.971527,...,0.976123,0.957018,0.972667,0.956468,0.910552,0.000000,0.969221,1.039113,0.982550,0.899452
6037,1.015531,0.959419,0.973024,0.974804,1.029275,0.978712,0.990514,0.979320,0.960687,0.978617,...,0.969316,0.933855,1.014549,0.971992,0.977838,0.969221,0.000000,1.000000,0.952848,0.927060
6038,1.000000,0.996753,0.956441,1.074741,1.001044,1.039800,1.000000,1.000000,0.980825,0.924371,...,1.049874,1.010713,0.944845,1.000000,0.972671,1.039113,1.000000,0.000000,0.961464,1.025595
6039,0.962772,0.992242,0.978812,1.008310,0.980991,1.039575,1.000000,0.982823,0.992931,0.970088,...,0.997590,1.018703,0.993700,0.924650,0.993559,0.982550,0.952848,0.961464,0.000000,0.954428


In [50]:
# MODELLING: Predicting ratings for every user with K Nearest Neighbours
# Models with a different number of neighbors
ml_algorithms = {'kNN-5': 5, 'kNN-10': 10, 'kNN-20': 20, 'kNN-30': 30, 'kNN-40': 40, "kNN-60": 60}

models_CF = []
RMSE_CF = []

# Training the models and predicting for the users and movies in the validation data
for name, num_neighbours in ml_algorithms.items():
    print(name, num_neighbours)
    predictions = []

    # For every rating in the validation data
    for index, row in tqdm(X_val.iterrows()):
        # If the movie is in the training data
        if row["movie_id"] in X_train["movie_id"].unique():
            # Extract all user ID's for users who have rated the movie
            users_rated_movie = X_train.loc[X_train['movie_id'] == row['movie_id'], 'user_id']
            # Sort these users by similarity (Pearson distance)
            users_sorted = (user_dist_matrix.loc[row['user_id'], users_rated_movie].sort_values())
            # Select the nearest neighbours
            nearest_neighbours = users_sorted[:num_neighbours]
            # Extract the nearest neighbours' ratings data
            nn_data = train_df.loc[train_df['user_id'].isin(nearest_neighbours.index.to_list())]
            # Calculate the weighted average of the nearest neighbours' ratings
            nearest_neighbours_avg_rating = np.average(nn_data.loc[train_df['movie_id'] == row['movie_id'], 'rating'],
                                                       axis=0, weights=(1/nearest_neighbours))
        else:
            # There is a small chance that a few movies in the validation set might not appear in the training set.
            # I therefore predict that the user will rate these movies with the average rating for all movies
            nearest_neighbours_avg_rating = 4   # Must be changed!

        # Appending the prediction to the list of predictions
        if not np.isnan(nearest_neighbours_avg_rating):
            predictions.append(nearest_neighbours_avg_rating)
        else:
            predictions.append(3)

    models_CF.append(name)
    RMSE_CF.append(sqrt(mean_squared_error(y_val, predictions)))


# Displaying the results
RMSE_CF_dict = {"Model": models_CF, "RMSE": RMSE_CF}
RMSE_CF_df = pd.DataFrame(RMSE_CF_dict)
RMSE_CF_df

kNN-5 5


16609it [04:11, 66.13it/s]


KeyboardInterrupt: 

In [None]:
# Visualizing how the number of neighbors effect the root mean sqaured error
fig7, ax7 = plt.subplots()
ax7.plot(RMSE_CF_df.Model, RMSE_CF_df.RMSE, label="RMSE", color='darkred', linewidth=2)
plt.xlabel("Number of nearest neighbors", labelpad=18)
plt.ylabel("Root mean squared error", labelpad=15)
plt.title("K-value effect on RMSE for collaborative filtering models")
fig7.set_figheight(10)
fig7.set_figwidth(16)
plt.show()

In [51]:
from tqdm import tqdm

In [52]:
print(X_val)

        user_id  movie_id  timestamp
922435     5570      1220  959542161
803779     4813      1230  962930148
175734     1112      1951  974948247
575864     3520       111  966947451
982108     5926      3062  957278442
...         ...       ...        ...
61539       413      2420  976287588
780334     4658      1019  963878934
167646     1068       916  975015710
778682     4651      1169  963953111
227990     1383      3060  976147411

[100021 rows x 3 columns]


In [53]:
# Rerunning the best model so far (kNN-40) and storing the prediction results
best_CF_model = []
RMSE_best_CF = []

# Training the models and predicting for the users and movies in the validation data
CF_predictions = []

# For every movie in the validation data
for index, row in tqdm(X_val.iterrows()):
    # If that movie is in the training data
    if row["movie_id"] in X_train["movie_id"].unique():
        # Extract all user ID's for users who have rated the movie
        users_rated_movie = X_train.loc[X_train['movie_id'] == row['movie_id'], 'user_id']
        # Sort these users by similarity (Pearson distance)
        users_sorted = (user_dist_matrix.loc[row['user_id'], users_rated_movie].sort_values())
        # Select the nearest neighbours
        nearest_neighbours = users_sorted[:40]
        # Extract the nearest neighbours' ratings data
        nn_data = train_df.loc[train_df['user_id'].isin(nearest_neighbours.index.to_list())]
        # Calculate the weighted average of the nearest neighbours' ratings
        nearest_neighbours_avg_rating = np.average(nn_data.loc[train_df['movie_id'] == row['movie_id'], 'rating'],
                                                   axis=0, weights=(1/nearest_neighbours))
    else:
        # There is a small chance that a few movies in the validation set might not appear in the training set.
        # I therefore predict that the user will rate these movies with the average rating for all movies
        nearest_neighbours_avg_rating = 4   # Must be changed!

    # Appending the prediction to the list of predictions
    if not np.isnan(nearest_neighbours_avg_rating):
        CF_predictions.append(nearest_neighbours_avg_rating)
    else:
        CF_predictions.append(4)

100021it [31:45, 52.48it/s]


In [54]:
# Extracting the validation prediction from the CBF dataframe containing all predictions
CBF_predictions = []
for index, row in X_val.iterrows():
    user_predictions = CBF_model.loc[row["user_id"], row["movie_id"]]
    CBF_predictions.append(user_predictions)
    

# Calculating the predictions for the different hybrid "models": different weighted averages of CF and CBF filtering
print("RMSE combined approach (Lasso and KNN-40):")
weighted_avgs = [(0.5, 0.5), (0.45, 0.55), (0.4, 0.6), (0.35, 0.65), (0.3, 0.7), (0.25, 0.75), (0.20, 0.80)]
  
for weight in weighted_avgs:
    combined_predictions = np.array([y_pred * weight[0] for y_pred in np.array(CBF_predictions)]) + np.array([y_pred * weight[1] for y_pred in np.array(CF_predictions)])
    print(f"RMSE for combined approach with CBF weighted {weight[0]} and CF weighted {weight[1]}: \n",
          sqrt(mean_squared_error(y_val, combined_predictions)), "\n")

RMSE combined approach (Lasso and KNN-40):
RMSE for combined approach with CBF weighted 0.5 and CF weighted 0.5: 
 0.9326602226205212 

RMSE for combined approach with CBF weighted 0.45 and CF weighted 0.55: 
 0.9289369662488268 

RMSE for combined approach with CBF weighted 0.4 and CF weighted 0.6: 
 0.9265127382258487 

RMSE for combined approach with CBF weighted 0.35 and CF weighted 0.65: 
 0.9253977476480684 

RMSE for combined approach with CBF weighted 0.3 and CF weighted 0.7: 
 0.9255967259209857 

RMSE for combined approach with CBF weighted 0.25 and CF weighted 0.75: 
 0.9271088270237486 

RMSE for combined approach with CBF weighted 0.2 and CF weighted 0.8: 
 0.9299276454010957 

