In [2]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [3]:
wines = pd.read_csv('wines_w2v.csv')
ratings = pd.read_csv('XWines_Slim_1K_wines_150K_ratings\XWines_Slim_150K_ratings.csv', low_memory=False)

In [4]:
#TODO: experiment with different columns only (can be done using weight 0 also)

# Select columns with w2v features
vector_columns = [
    'WineName_w2v', 'Type_w2v', 'Elaborate_w2v', 'Grapes_w2v',
    'Harmonize_w2v', 'Country_w2v', 'RegionName_w2v', 'WineryName_w2v'
]

In [5]:
def convert_w2v_string(w2v_string):
    '''
    Converts a string representation of a word2vec vector to a NumPy array.

    @param w2v_string: A string representation of a word2vec vector.
    @return: A NumPy array representation of the word2vec vector.
    '''
    # Remove the "[-" and "]" characters
    cleaned_string = w2v_string.strip("[]")
    
    # Split the cleaned string into a list of floats
    float_list = list(map(float, cleaned_string.split()))
    
    # Convert to a NumPy array
    return np.array(float_list)

In [6]:
# Apply the conversion function to each column
for column in vector_columns:
    wines[column] = wines[column].apply(convert_w2v_string)

In [7]:
print(wines.head(3))

   WineID            WineName       Type      Elaborate              Grapes  \
0  100001  Espumante Moscatel  Sparkling  Varietal 100%      Muscat Moscato   
1  100002          Ancellotta        Red  Varietal 100%          Ancellotta   
2  100003  Cabernet Sauvignon        Red  Varietal 100%  Cabernet Sauvignon   

                                       Harmonize   ABV          Body Acidity  \
0                     Pork, Rich Fish, Shellfish   7.5  Mediumbodied    High   
1  Beef, Barbecue, Codfish, Pasta, Pizza, Cheese  12.0  Mediumbodied  Medium   
2                            Beef, Lamb, Poultry  12.0    Fullbodied    High   

  Country  ...    RegionName    WineryName  \
0  Brazil  ...  Serra Gaúcha   Casa Perini   
1  Brazil  ...  Serra Gaúcha   Casa Perini   
2  Brazil  ...  Serra Gaúcha  Castellamare   

                                        WineName_w2v  \
0  [-0.3074022, -0.44844943, -0.44216424, 0.07813...   
1  [0.20456168, -0.238103, 0.13503605, -0.0984859...   
2  [-0.14

In [8]:
print(type(wines['Type_w2v'][0]))


<class 'numpy.ndarray'>


In [9]:
ratings.head(3)

Unnamed: 0,RatingID,UserID,WineID,Vintage,Rating,Date
0,143,1356810,103471,1950,4.5,2021-11-02 20:52:59
1,199,1173759,111415,1951,5.0,2015-08-20 17:46:26
2,348,1164877,111395,1952,5.0,2020-11-13 05:40:26


SVD Model (collaborative based):

In [10]:
# Create a reader object with the rating scale
reader = Reader(rating_scale=(0, 5))

data = Dataset.load_from_df(ratings[['UserID', 'WineID', 'Rating']], reader)

In [11]:
# Split the data into training and testing sets
# TODO: check if this is correct evaluation
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [12]:
# Use SVD algorithm
model_svd = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)

# Train the model
model_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x23f4e9a6cd0>

In [13]:
# Make predictions on the test set
predictions = model_svd.test(testset)

# Compute RMSE
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")

RMSE: 0.4787
RMSE: 0.4786704014186373


In [14]:
def predict_ratings(user_id, wines_df, model):
    '''
    Predicts the ratings for a user for all wines in the dataset.

    @param user_id: The ID of the user for whom to make predictions.
    @param wines_df: The DataFrame containing the wines.
    @param model: The trained model to use for making predictions.
    @return: A DataFrame containing the predicted ratings for the user.
    '''
    predictions = []
    for wine_id in wines_df['WineID'].unique():
        predicted_rating = model.predict(user_id, wine_id).est
        predictions.append((wine_id, predicted_rating))
    return pd.DataFrame(predictions, columns=['WineID', 'PredictedRating'])

In [15]:
# Example for a specific user using svd model
user_id = 1209683
unrated_wines = ratings[~ratings['UserID'].isin([user_id])]
predictions = predict_ratings(user_id, unrated_wines, model_svd)
print(predictions.head())

   WineID  PredictedRating
0  103471         3.786132
1  111415         4.585128
2  111395         4.488114
3  111433         4.158496
4  111431         4.231509


In [39]:
def get_predicted_rating(user_id, wine_id, model):
    '''
    Retrieve the predicted rating for a specific user and wine.

    @param user_id: The ID of the user.
    @param wine_id: The ID of the wine.
    @param model: The trained SVD model.
    @return: The predicted rating.
    '''
    # Predict the rating for the specific user-wine pair
    prediction = model.predict(user_id, wine_id)
    
    return prediction.est

# Example usage
user_id = 1209683
wine_id = 111422  # Replace with the actual WineID you want to test

predicted_rating = get_predicted_rating(user_id, wine_id, model_svd)
print(f"Predicted rating for user {user_id} and wine {wine_id}: {predicted_rating}")


Predicted rating for user 1209683 and wine 111422: 4.249609116675912


---------------------------------------------------------------------------------------------------------------------------------------------------------------------

KNN model (content based)

In [16]:
# Check the shapes of the entries in each vector column
for column in vector_columns:
    shapes = wines[column].apply(lambda x: x.shape).unique()
    print(f"{column} shapes: {shapes}")

WineName_w2v shapes: [(100,)]
Type_w2v shapes: [(100,)]
Elaborate_w2v shapes: [(100,)]
Grapes_w2v shapes: [(100,)]
Harmonize_w2v shapes: [(100,)]
Country_w2v shapes: [(100,)]
RegionName_w2v shapes: [(100,)]
WineryName_w2v shapes: [(100,)]


In [17]:
# TODO: test different weights for each feature

# Weights for each feature
feature_weights = {
    'WineName_w2v': 1.0,
    'Type_w2v': 1.2,
    'Elaborate_w2v': 0.8,
    'Grapes_w2v': 1.0,
    'Harmonize_w2v': 1.5,
    'Country_w2v': 1.0,
    'RegionName_w2v': 1.1,
    'WineryName_w2v': 0.9
}

# Create a new column to store weighted wine vectors
wines['Weighted_wine_vector'] = wines.apply(
    lambda row: np.concatenate([row[column] * feature_weights[column] for column in vector_columns]), 
    axis=1
)

print(f"Weighted wine vectors shape: {wines['Weighted_wine_vector'].shape}")  # Should still be (1007, 800)

Weighted wine vectors shape: (1007,)


In [18]:
'''
This KNN model predicts the ratings for all unrated wines. It runs for super long as we are predicting for all wines for each user.

No need to run this model (will not be used in the final solution)
'''

'''
# Fit the kNN model on weighted wine features
from sklearn.neighbors import NearestNeighbors

# Extract the precomputed weighted vectors for kNN fitting
wine_vectors = np.stack(wines['Weighted_wine_vector'].values)

knn_model = NearestNeighbors(n_neighbors=10, metric='cosine')
knn_model.fit(wine_vectors)

# Function to predict ratings using kNN based on unrated wines
def predict_combined_ratings(user_id, wines_df, model, knn_model, k=10):
    user_ratings = ratings[ratings['UserID'] == user_id]

    if user_ratings.empty:
        return pd.DataFrame(columns=['WineID', 'PredictedRating'])

    predictions = []

    # Iterate over unrated wines
    for _, unrated_wine in unrated_wines.iterrows():
        unrated_wine_id = unrated_wine['WineID']
        
        # Get the precomputed weighted feature vector for the unrated wine
        unrated_wine_vector = wines.loc[wines['WineID'] == unrated_wine_id, 'Weighted_wine_vector'].values[0]

        # Find k nearest neighbors for the unrated wine using kNN
        distances, indices = knn_model.kneighbors(unrated_wine_vector.reshape(1, -1), n_neighbors=k)

        # Get the WineIDs of the k nearest neighbors
        neighbor_wine_ids = wines.iloc[indices.flatten()]['WineID'].values

        # Get ratings for the k nearest wines from the user
        neighbor_ratings = user_ratings[user_ratings['WineID'].isin(neighbor_wine_ids)]

        if not neighbor_ratings.empty:
            # Predict rating as the average of neighbor ratings
            #TODO: investigate how to weight the ratings based how close they are to the unrated wines
            predicted_rating = neighbor_ratings['Rating'].mean()
            predictions.append((unrated_wine_id, predicted_rating))

    return pd.DataFrame(predictions, columns=['WineID', 'PredictedRating'])

'''

"\n# Fit the kNN model on weighted wine features\nfrom sklearn.neighbors import NearestNeighbors\n\n# Extract the precomputed weighted vectors for kNN fitting\nwine_vectors = np.stack(wines['Weighted_wine_vector'].values)\n\nknn_model = NearestNeighbors(n_neighbors=10, metric='cosine')\nknn_model.fit(wine_vectors)\n\n# Function to predict ratings using kNN based on unrated wines\ndef predict_combined_ratings(user_id, wines_df, model, knn_model, k=10):\n    user_ratings = ratings[ratings['UserID'] == user_id]\n\n    if user_ratings.empty:\n        return pd.DataFrame(columns=['WineID', 'PredictedRating'])\n\n    predictions = []\n\n    # Iterate over unrated wines\n    for _, unrated_wine in unrated_wines.iterrows():\n        unrated_wine_id = unrated_wine['WineID']\n        \n        # Get the precomputed weighted feature vector for the unrated wine\n        unrated_wine_vector = wines.loc[wines['WineID'] == unrated_wine_id, 'Weighted_wine_vector'].values[0]\n\n        # Find k nearest

In [19]:
# Fit the kNN model on weighted wine features
from sklearn.neighbors import NearestNeighbors

# Extract the precomputed weighted vectors for kNN fitting
wine_vectors = np.stack(wines['Weighted_wine_vector'].values)


#TODO: experiment with different n_neighbors and metric
knn_model = NearestNeighbors(n_neighbors=10, metric='cosine')
knn_model.fit(wine_vectors)


In [30]:
from sklearn.metrics.pairwise import cosine_similarity


#TODO: experiment with different k
def predict_combined_ratings(knn_model, user_ratings, k=10, rating_threshold=3.5):
    '''
    Method to predict ratings for a user based on the k nearest neighbors of already highly rated wines.

    @param knn_model: The trained kNN model to use for finding nearest neighbors.
    @param user_ratings: The DataFrame containing the user's ratings.
    @param k: The number of nearest neighbors to consider.
    @param rating_threshold: The minimum rating to consider a wine as highly rated.
    @return: A DataFrame containing the predicted ratings for the user.
    '''
    if user_ratings.empty:
        return pd.DataFrame(columns=['WineID', 'PredictedRating'])

    # Filter highly rated wines (above threashold)
    #TODO: experiment with different threshold, what if there are no ratings above threshold
    highly_rated_wines = user_ratings[user_ratings['Rating'] > rating_threshold]

    # Step 1: Find unique unrated wines similar to highly rated wines
    to_predict = set()

    # Iterate over each highly rated wine
    for _, rated_wine in highly_rated_wines.iterrows():
        rated_wine_id = rated_wine['WineID']

        # Get the precomputed vector for the rated wine
        rated_wine_vector = wines.loc[wines['WineID'] == rated_wine_id, 'Weighted_wine_vector'].values[0]

        # Find k nearest unrated wines using kNN
        distances, indices = knn_model.kneighbors(rated_wine_vector.reshape(1, -1), n_neighbors=k)

        # Filter for unrated wines
        unrated_wines = wines[~wines['WineID'].isin(user_ratings['WineID'])]

        # Add unrated neighbors to the to_predict list
        neighbor_wine_ids = wines.iloc[indices.flatten()]['WineID'].values
        to_predict.update(neighbor_wine_id for neighbor_wine_id in neighbor_wine_ids 
                          if neighbor_wine_id in unrated_wines['WineID'].values)
        
    print(f"Number of unique unrated wines to predict: {len(to_predict)}")
    print(f"Unrated wines to predict: {to_predict}")

    # Step 2: Predict ratings for unique wines in the to_predict list
    #TODO: experiment with different metric of calculating the predicted rating
    unrated_wine_predictions = {}

    for unrated_wine_id in to_predict:
        unrated_wine_vector = wines.loc[wines['WineID'] == unrated_wine_id, 'Weighted_wine_vector'].values[0].reshape(1, -1)

        total_weighted_rating = 0
        total_similarity = 0

        # Compute similarity with all highly rated wines
        for _, rated_wine in highly_rated_wines.iterrows():
            rated_wine_id = rated_wine['WineID']
            rated_wine_rating = rated_wine['Rating']

            rated_wine_vector = wines.loc[wines['WineID'] == rated_wine_id, 'Weighted_wine_vector'].values[0].reshape(1, -1)

            # Compute cosine similarity
            similarity_weight = cosine_similarity(rated_wine_vector, unrated_wine_vector)[0][0]

            # Ensure similarity weight is non-negative
            if similarity_weight > 0:
                total_weighted_rating += rated_wine_rating * similarity_weight
                total_similarity += similarity_weight

        if total_similarity > 0:
            predicted_rating = total_weighted_rating / total_similarity
            unrated_wine_predictions[unrated_wine_id] = predicted_rating

    # Return predictions sorted by predicted rating in descending order
    return pd.DataFrame(list(unrated_wine_predictions.items()), columns=['WineID', 'PredictedRating']).sort_values(by='PredictedRating', ascending=False)

In [43]:
# Example for a specific user
user_id = 1209683

user_ratings = ratings[ratings['UserID'] == user_id]

# Predict ratings using the modified kNN approach for the user
predictions = predict_combined_ratings(knn_model, user_ratings, k=10, rating_threshold=3)
print(predictions.to_markdown())

Number of unique unrated wines to predict: 195
Unrated wines to predict: {167425, 167429, 176135, 137224, 167431, 159758, 167439, 112664, 167449, 167450, 142875, 111645, 113695, 128544, 180260, 163876, 167460, 184870, 113193, 116266, 105004, 167470, 103471, 192560, 102448, 111667, 112695, 174651, 195646, 167487, 164927, 168002, 175682, 111687, 179805, 174177, 112229, 180330, 112237, 111729, 155249, 112243, 112756, 112251, 111740, 184452, 176262, 113288, 112777, 118927, 183952, 111763, 112790, 141462, 111773, 112809, 127658, 138409, 100013, 100014, 185009, 172209, 137905, 113332, 183479, 179386, 113344, 174274, 116418, 112834, 175814, 155334, 171003, 131787, 155339, 194769, 184532, 135893, 111834, 184540, 184541, 100067, 111845, 140010, 168170, 138479, 112879, 174323, 179958, 195831, 171256, 164598, 181495, 100092, 174333, 171266, 116995, 122630, 113421, 184600, 112923, 111395, 174372, 169260, 135982, 137010, 184116, 112948, 174901, 184631, 113462, 111417, 174906, 112950, 184122, 111421

In [44]:
from sklearn.model_selection import train_test_split

def split_user_ratings(user_id, test_size=0.2):
    '''
    Split the user's ratings into train and test sets.

    @param user_id: The ID of the user.
    @param test_size: The proportion of the dataset to include in the test split.
    @return: train_ratings, test_ratings for the specified user.
    '''
    user_ratings = ratings[ratings['UserID'] == user_id]
    if user_ratings.empty:
        return pd.DataFrame(), pd.DataFrame()
    
    # Split the user's ratings
    train_ratings, test_ratings = train_test_split(user_ratings, test_size=0.2, random_state=42)
    return train_ratings, test_ratings


In [45]:
from sklearn.metrics import mean_squared_error

def evaluate_predictions(user_id, knn_model, ratings_df=ratings, k=10, rating_threshold=3.5):
    '''
    Split the user's ratings, predict ratings for the user using the training set, and evaluate using the test set.

    @param user_id: The ID of the user for whom to make predictions.
    @param knn_model: The trained kNN model.
    @param ratings_df: The DataFrame containing all ratings.
    @param k: The number of nearest neighbors to consider.
    @param rating_threshold: The minimum rating to consider a wine as highly rated.
    @return: RMSE and the predictions DataFrame.
    '''
    # Step 1: Split user ratings into train and test
    train_ratings, test_ratings = split_user_ratings(user_id, ratings_df)
    #print(f"Train ratings: {train_ratings}")
    #print(f"Test ratings: {test_ratings}")

    # Step 2: Predict ratings using the training ratings
    predictions = predict_combined_ratings(knn_model, train_ratings, k=k, rating_threshold=rating_threshold)

    # Step 3: Evaluate predictions using RMSE
    if test_ratings.empty or predictions.empty:
        return None, predictions

    # Join test ratings with predictions
    test_ratings = test_ratings[['WineID', 'Rating']].rename(columns={'Rating': 'TrueRating'})
    merged_df = test_ratings.merge(predictions, on='WineID')

    if merged_df.empty:
        return None, predictions

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(merged_df['TrueRating'], merged_df['PredictedRating']))

    return rmse, merged_df


In [46]:
user_id = 1209683
rmse, result_df = evaluate_predictions(user_id, knn_model, ratings, k=10, rating_threshold=3)

if rmse is not None:
    print(f"RMSE: {rmse}")
else:
    print("No ratings available for evaluation")

print(result_df.to_markdown())


Number of unique unrated wines to predict: 160
Unrated wines to predict: {167425, 167429, 176135, 159758, 167439, 112664, 167450, 142875, 111645, 113695, 128544, 180260, 167460, 184870, 113193, 116266, 167470, 192560, 111667, 112695, 174651, 164927, 167487, 175682, 168002, 111687, 179805, 174177, 112229, 180330, 112237, 111729, 155249, 112243, 112756, 112251, 111740, 184452, 176262, 113288, 112777, 118927, 183952, 111763, 141462, 112790, 111773, 112809, 127658, 138409, 172209, 137905, 185009, 113332, 183479, 179386, 113344, 116418, 174274, 112834, 175814, 171003, 194769, 184532, 135893, 111834, 184540, 184541, 111845, 168170, 140010, 112879, 138479, 174323, 164598, 181495, 195831, 171256, 179958, 174333, 171266, 116995, 113421, 112923, 111395, 174372, 169260, 184116, 112948, 174901, 184631, 111927, 111417, 174906, 113462, 112950, 111421, 111422, 182081, 179012, 111429, 183620, 133447, 179016, 111433, 126285, 192337, 168273, 179027, 113489, 139618, 112483, 154979, 111461, 111466, 111468

In [52]:
import random
import numpy as np
from sklearn.metrics import mean_squared_error

#TODO: take into consideration only the latest ratings for each user

def split_user_ratings_loo(user_id, ratings_df):
    '''
    Split the ratings of a specific user into train and test sets for leave-one-out cross-validation.

    @param user_id: The ID of the user for whom to split ratings.
    @param ratings_df: The DataFrame containing all ratings.
    @return: Train and test DataFrames.
    '''
    user_ratings = ratings_df[ratings_df['UserID'] == user_id]

    if len(user_ratings) < 2:
        # Not enough ratings to split
        return user_ratings, pd.DataFrame()

    # Randomly select one rating for testing
    test_rating = user_ratings.sample(n=1, random_state=42)
    train_ratings = user_ratings.drop(test_rating.index)

    return train_ratings, test_rating

def evaluate_predictions_loo(user_id, knn_model, ratings_df=ratings, k=10, rating_threshold=3.5):
    '''
    Split the user's ratings, predict ratings for the user using the training set, and evaluate using the test set.

    @param user_id: The ID of the user for whom to make predictions.
    @param knn_model: The trained kNN model.
    @param ratings_df: The DataFrame containing all ratings.
    @param k: The number of nearest neighbors to consider.
    @param rating_threshold: The minimum rating to consider a wine as highly rated.
    @return: RMSE and the predictions DataFrame.
    '''
    # Step 1: Split user ratings into train and test (leave one out)
    train_ratings, test_ratings = split_user_ratings_loo(user_id, ratings_df)

    # Step 2: Predict ratings using the training ratings
    predictions = predict_combined_ratings(knn_model, train_ratings, k=k, rating_threshold=rating_threshold)

    # Step 3: Evaluate predictions using RMSE
    if test_ratings.empty or predictions.empty:
        return None, predictions

    # Join test ratings with predictions
    test_ratings = test_ratings[['WineID', 'Rating']].rename(columns={'Rating': 'TrueRating'})
    merged_df = test_ratings.merge(predictions, on='WineID')

    if merged_df.empty:
        return None, predictions

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(merged_df['TrueRating'], merged_df['PredictedRating']))

    return rmse, merged_df

def evaluate_random_users(knn_model, ratings_df, num_users=100, k=10, rating_threshold=3.5):
    '''
    Evaluate the model for a random subset of users using leave-one-out cross-validation and compute the average RMSE.

    @param knn_model: The trained kNN model.
    @param ratings_df: The DataFrame containing all ratings.
    @param num_users: The number of random users to evaluate.
    @param k: The number of nearest neighbors to consider.
    @param rating_threshold: The minimum rating to consider a wine as highly rated.
    @return: The average RMSE across the selected users.
    '''
    user_ids = ratings_df['UserID'].unique()
    selected_user_ids = random.sample(list(user_ids), min(num_users, len(user_ids)))
    total_rmse = 0
    valid_rmse_count = 0

    for user_id in selected_user_ids:
        rmse, _ = evaluate_predictions_loo(user_id, knn_model, ratings_df, k=k, rating_threshold=rating_threshold)
        if rmse is not None:
            total_rmse += rmse
            valid_rmse_count += 1

    if valid_rmse_count == 0:
        return None

    avg_rmse = total_rmse / valid_rmse_count
    return avg_rmse

# Example: Evaluate for a random subset of users and print the average RMSE
avg_rmse = evaluate_random_users(knn_model, ratings, num_users=100, k=10, rating_threshold=3)
if avg_rmse is not None:
    print(f"Average RMSE across selected users: {avg_rmse}")
else:
    print("No ratings available for evaluation")

Number of unique unrated wines to predict: 180
Unrated wines to predict: {112641, 167431, 193031, 147976, 103435, 167439, 190485, 167445, 167449, 155674, 167450, 142875, 111645, 111648, 199204, 102950, 113193, 136233, 116266, 167470, 192560, 112689, 136246, 112695, 174651, 156732, 167487, 194113, 113730, 168002, 197701, 112711, 112716, 172117, 112732, 195170, 112229, 170599, 174184, 105585, 112243, 112756, 112247, 112251, 139388, 112255, 149121, 113288, 112777, 199306, 158353, 111763, 113811, 141462, 111773, 103070, 163490, 193699, 100003, 100005, 141987, 140962, 155308, 100018, 113332, 100021, 135861, 183479, 174274, 116418, 173765, 175814, 162502, 178378, 131787, 143050, 101584, 163025, 112341, 135893, 100055, 101592, 162519, 184541, 100062, 162530, 155368, 143080, 112872, 112875, 138475, 155375, 100080, 135922, 174323, 114932, 102645, 179958, 174330, 113916, 117009, 157974, 136982, 100120, 172833, 111395, 194341, 113452, 101677, 155438, 148270, 113462, 111415, 111927, 111417, 174906

In [22]:
#TODO: explanations of the predictions, check the rated wines from the user and compare to the top recommended wines

filtered_wines = wines[wines['WineID'].isin(predictions['WineID'])]
print(filtered_wines['Country'].unique())

['Brazil' 'France' 'Spain' 'Italy' 'Chile' 'Argentina' 'Australia'
 'United States' 'Canada' 'Hungary' 'New Zealand']


In [None]:
#TODO: combine prediction from svd and knn to one
#TODO: evaluate the model

---------------------------------------------------------------------------------------------------------

combine the models to make the rating prediction:

In [63]:
def combine_knn_svd_ratings(user_id, knn_model, svd_model, user_ratings, k=10, rating_threshold=3.5, svd_weight=0.7, knn_weight=0.3):
    '''
    Combine the ratings from the kNN content-based model and SVD collaborative model.

    @param user_id: The ID of the user.
    @param knn_model: The trained kNN model for content-based filtering.
    @param svd_model: The trained SVD model for collaborative filtering.
    @param user_ratings: The DataFrame containing the user's ratings.
    @param k: The number of nearest neighbors to consider for kNN.
    @param rating_threshold: The minimum rating to consider a wine as highly rated.
    @param svd_weight: The weight for the SVD model in the final prediction.
    @param knn_weight: The weight for the kNN model in the final prediction.
    @return: A DataFrame containing the combined predicted ratings for the user.
    '''

    # Get k-NN predicted ratings
    knn_predictions_df = predict_combined_ratings(knn_model, user_ratings, k=k, rating_threshold=rating_threshold)

    # Initialize a dictionary to store the final predictions
    final_predictions = {}

    # Iterate through each wine in the k-NN predictions
    for _, row in knn_predictions_df.iterrows():
        wine_id = row['WineID']
        knn_predicted_rating = row['PredictedRating']

        # Get the SVD predicted rating for the same wine
        svd_predicted_rating = get_predicted_rating(user_id, wine_id, svd_model)

        # If both ratings are NaN, fallback to the global average or user average
        if pd.isna(svd_predicted_rating) and pd.isna(knn_predicted_rating):
            print("wtf")
        elif pd.isna(knn_predicted_rating):
            combined_rating = svd_predicted_rating
        elif pd.isna(svd_predicted_rating):
            combined_rating = knn_predicted_rating
        else:
            # Combine both ratings using a weighted average
            combined_rating = (svd_weight * svd_predicted_rating) + (knn_weight * knn_predicted_rating)

        # Store the combined rating in the final predictions dictionary
        final_predictions[wine_id] = combined_rating

    # Return the final predictions as a DataFrame
    return pd.DataFrame(list(final_predictions.items()), columns=['WineID', 'CombinedRating']).sort_values(by='CombinedRating', ascending=False)


In [54]:
user_id = 1209683
user_ratings = ratings[ratings['UserID'] == user_id]

# Get combined predictions for the user
combined_predictions_df = combine_knn_svd_ratings(user_id, knn_model, model_svd, user_ratings, k=10, rating_threshold=3.5, svd_weight=0.7, knn_weight=0.3)

print(combined_predictions_df.head())

Number of unique unrated wines to predict: 142
Unrated wines to predict: {167425, 167429, 176135, 137224, 167431, 112664, 167449, 167450, 142875, 111645, 113695, 128544, 180260, 163876, 113193, 116266, 192560, 111667, 112695, 174651, 195646, 167487, 168002, 111687, 174177, 112229, 180330, 112237, 111729, 112243, 111740, 184452, 176262, 112777, 118927, 183952, 112790, 141462, 111773, 112809, 127658, 138409, 185009, 137905, 183479, 179386, 113344, 174274, 116418, 112834, 175814, 155334, 131787, 155339, 194769, 135893, 111834, 184540, 184541, 100067, 111845, 140010, 138479, 112879, 174323, 179958, 100092, 116995, 122630, 113421, 184600, 112923, 111395, 174372, 135982, 137010, 184116, 112948, 174901, 184631, 112950, 111417, 174906, 184122, 111932, 111421, 182081, 179012, 183620, 111429, 133447, 179016, 111433, 126285, 179024, 113489, 179027, 168279, 168284, 139618, 154979, 112483, 111468, 179054, 180083, 111475, 180089, 168827, 179069, 167810, 195459, 111495, 112526, 183184, 113050, 139162

In [64]:
import numpy as np

# Set the number of ratings to use for evaluation
num_test_ratings = 5
user_id = 1209683

# Get the ratings for the specified user
user_ratings = ratings[ratings['UserID'] == user_id]

# Randomly sample ratings for the test set
test_ratings = user_ratings.sample(n=num_test_ratings, random_state=42)
# Remove the test ratings from the user ratings to create the train set
train_ratings = user_ratings.drop(test_ratings.index)

# Print the test ratings
print("Test Ratings:\n", test_ratings)

# Get combined predictions for the test wines
combined_predictions_df = combine_knn_svd_ratings(user_id, knn_model, model_svd, train_ratings, k=10, rating_threshold=3.5, svd_weight=0.7, knn_weight=0.3)

# Filter to keep only the wines in the test set
predicted_test_ratings = combined_predictions_df[combined_predictions_df['WineID'].isin(test_ratings['WineID'])]

print("Predicted Ratings for Test Wines:\n", predicted_test_ratings)

# Merge predicted ratings with actual ratings from the test set
results_df = test_ratings.merge(predicted_test_ratings, on='WineID', how='left')

# Print the results
print("Comparison of Actual vs Predicted Ratings:\n", results_df[['WineID', 'Rating', 'CombinedRating']])

# Calculate RMSE
rmse = np.sqrt(((results_df['Rating'] - results_df['CombinedRating']) ** 2).mean())
print(f"RMSE for the Combined Model: {rmse:.4f}")



Test Ratings:
        RatingID   UserID  WineID Vintage  Rating                 Date
78245  10420763  1209683  179047    2015     4.0  2019-02-10 00:20:00
28751   2884341  1209683  135860    2010     4.5  2019-09-19 23:00:54
6580     325765  1209683  155438    1999     4.5  2020-03-14 13:22:56
80727  10762720  1209683  169311    2015     4.0  2019-03-10 17:56:44
5386     249601  1209683  111558    1997     4.5  2020-12-06 15:38:40
Number of unique unrated wines to predict: 118
Unrated wines to predict: {167425, 167429, 176135, 167431, 112664, 167449, 167450, 142875, 111645, 113695, 128544, 180260, 113193, 116266, 192560, 111667, 112695, 174651, 167487, 168002, 111687, 174177, 112229, 180330, 112237, 111729, 112243, 111740, 184452, 176262, 112777, 118927, 183952, 112790, 141462, 111773, 112809, 127658, 138409, 185009, 137905, 183479, 179386, 113344, 174274, 116418, 112834, 175814, 194769, 135893, 111834, 184540, 184541, 111845, 140010, 138479, 112879, 174323, 179958, 116995, 113421, 112