In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [2]:
wines = pd.read_csv('wines_w2v.csv')
ratings = pd.read_csv('XWines_Slim_1K_wines_150K_ratings\XWines_Slim_150K_ratings.csv', low_memory=False)

In [3]:
#TODO: experiment with different columns only (can be done using weight 0 also)

# Select columns with w2v features
vector_columns = [
    'WineName_w2v', 'Type_w2v', 'Elaborate_w2v', 'Grapes_w2v',
    'Harmonize_w2v', 'Country_w2v', 'RegionName_w2v', 'WineryName_w2v'
]

In [4]:
def convert_w2v_string(w2v_string):
    '''
    Converts a string representation of a word2vec vector to a NumPy array.

    @param w2v_string: A string representation of a word2vec vector.
    @return: A NumPy array representation of the word2vec vector.
    '''
    # Remove the "[-" and "]" characters
    cleaned_string = w2v_string.strip("[]")
    
    # Split the cleaned string into a list of floats
    float_list = list(map(float, cleaned_string.split()))
    
    # Convert to a NumPy array
    return np.array(float_list)

In [5]:
# Apply the conversion function to each column
for column in vector_columns:
    wines[column] = wines[column].apply(convert_w2v_string)

In [6]:
print(wines.head(3))

   WineID            WineName       Type      Elaborate              Grapes  \
0  100001  Espumante Moscatel  Sparkling  Varietal 100%      Muscat Moscato   
1  100002          Ancellotta        Red  Varietal 100%          Ancellotta   
2  100003  Cabernet Sauvignon        Red  Varietal 100%  Cabernet Sauvignon   

                                       Harmonize   ABV          Body Acidity  \
0                     Pork, Rich Fish, Shellfish   7.5  Mediumbodied    High   
1  Beef, Barbecue, Codfish, Pasta, Pizza, Cheese  12.0  Mediumbodied  Medium   
2                            Beef, Lamb, Poultry  12.0    Fullbodied    High   

  Country  ...    RegionName    WineryName  \
0  Brazil  ...  Serra Gaúcha   Casa Perini   
1  Brazil  ...  Serra Gaúcha   Casa Perini   
2  Brazil  ...  Serra Gaúcha  Castellamare   

                                        WineName_w2v  \
0  [-0.3074022, -0.44844943, -0.44216424, 0.07813...   
1  [0.20456168, -0.238103, 0.13503605, -0.0984859...   
2  [-0.14

In [7]:
print(type(wines['Type_w2v'][0]))


<class 'numpy.ndarray'>


In [8]:
ratings.head(3)

Unnamed: 0,RatingID,UserID,WineID,Vintage,Rating,Date
0,143,1356810,103471,1950,4.5,2021-11-02 20:52:59
1,199,1173759,111415,1951,5.0,2015-08-20 17:46:26
2,348,1164877,111395,1952,5.0,2020-11-13 05:40:26


SVD Model (collaborative based):

In [17]:
# Create a reader object with the rating scale
reader = Reader(rating_scale=(0, 5))

data = Dataset.load_from_df(ratings[['UserID', 'WineID', 'Rating']], reader)

In [18]:
# Split the data into training and testing sets
# TODO: check if this is correct evaluation
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [19]:
# Use SVD algorithm
model_svd = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)

# Train the model
model_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2907535c390>

In [20]:
# Make predictions on the test set
predictions = model_svd.test(testset)

# Compute RMSE
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")

RMSE: 0.4780
RMSE: 0.4779602451485183


In [21]:
from collections import defaultdict
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from surprise import accuracy

def evaluate_model(predictions):
    # RMSE
    rmse = accuracy.rmse(predictions)
    
    # MAE
    mae = accuracy.mae(predictions)
    
    # Calculate MSE manually
    mse = np.mean([float((true_r - est)**2) for (_, _, true_r, est, _) in predictions])
    
    # R-squared (coefficient of determination)
    y_true = np.array([true_r for (_, _, true_r, _, _) in predictions])
    y_pred = np.array([est for (_, _, _, est, _) in predictions])
    y_mean = np.mean(y_true)
    ss_tot = np.sum((y_true - y_mean)**2)
    ss_res = np.sum((y_true - y_pred)**2)
    r_squared = 1 - (ss_res / ss_tot)
    
    # Coverage (percentage of user-item pairs for which the system can make predictions)
    total_possible_predictions = len(trainset.ur) * len(trainset.ir)
    coverage = len(predictions) / total_possible_predictions
    
    # Precision at k and Recall at k
    k = 10
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()

    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= 4) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= 4) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= 4) and (est >= 4)) for (est, true_r) in user_ratings[:k])
        
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    precision_at_k = sum(prec for prec in precisions.values()) / len(precisions)
    recall_at_k = sum(rec for rec in recalls.values()) / len(recalls)
    
    return {
        'RMSE': rmse,
        'MAE': mae,
        'MSE': mse,
        'R-squared': r_squared,
        'Coverage': coverage,
        'Precision@K': precision_at_k,
        'Recall@K': recall_at_k
    }

# Use this function after making predictions
results = evaluate_model(predictions)
for metric, value in results.items():
    print(f"{metric}: {value}")

RMSE: 0.4780
MAE:  0.3535
RMSE: 0.4779602451485183
MAE: 0.35354724813863286
MSE: 0.2284459959424317
R-squared: 0.4518561344977746
Coverage: 0.002849139759232495
Precision@K: 0.9474117612179261
Recall@K: 0.5720667780100179


In [22]:
import numpy as np

def get_user_profile(user_id, model):
    return model.pu[model.trainset.to_inner_uid(user_id)]

def get_wine_profile(wine_id, model):
    return model.qi[model.trainset.to_inner_iid(wine_id)]

def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

def explain_recommendation(user_id, wine_id, model, wines_df):
    user_profile = get_user_profile(user_id, model)
    wine_profile = get_wine_profile(wine_id, model)
    
    similarity = cosine_similarity(user_profile, wine_profile)
    
    wine_info = wines_df[wines_df['WineID'] == wine_id].iloc[0]
    
    explanation = f"This wine ({wine_info['WineName']}) is recommended because:\n"
    explanation += f"1. It has a {similarity:.2f} similarity to your taste profile.\n"
    explanation += f"2. It's a {wine_info['Type']} wine from {wine_info['Country']}.\n"
    explanation += f"3. It pairs well with: {wine_info['Harmonize']}.\n"
    
    return explanation

# Example usage
user_id = 1209683
wine_id = 111422
explanation = explain_recommendation(user_id, wine_id, model_svd, wines)
print(explanation)

This wine (Pauillac  Grand Cru Classé ) is recommended because:
1. It has a 0.01 similarity to your taste profile.
2. It's a Red wine from France.
3. It pairs well with: Beef, Lamb, Game Meat, Poultry.



In [13]:
def predict_ratings(user_id, wines_df, model):
    '''
    Predicts the ratings for a user for all wines in the dataset.

    @param user_id: The ID of the user for whom to make predictions.
    @param wines_df: The DataFrame containing the wines to predict.
    @param model: The trained model to use for making predictions.
    @return: A DataFrame containing the predicted ratings for the user.
    '''
    predictions = []
    for wine_id in wines_df['WineID'].unique():
        predicted_rating = model.predict(user_id, wine_id).est
        predictions.append((wine_id, predicted_rating))
    return pd.DataFrame(predictions, columns=['WineID', 'PredictedRating'])

In [14]:
# Example for a specific user using svd model
user_id = 1209683
unrated_wines = ratings[~ratings['UserID'].isin([user_id])]
predictions = predict_ratings(user_id, unrated_wines, model_svd)
print(predictions.head())

   WineID  PredictedRating
0  103471         3.813287
1  111415         4.477943
2  111395         4.421610
3  111433         4.036834
4  111431         4.179070


In [None]:

# Retrieve corresponding information about the recommended wines
recommended_wines_info = wines[wines['wine_id'].isin(top_recommendations['WineID'])]

print("Top N Recommendations:")
print(recommended_wines_info)

# Select a random user ID and print their ratings
random_user_id = random.choice(ratings['UserID'].unique())
random_user_ratings = ratings[ratings['UserID'] == random_user_id]

In [None]:
# Select a random user ID and print their ratings
import random

user_id = random.choice(ratings['UserID'].unique())
print(f"Ratings for user {user_id}:")
print(ratings[ratings['UserID'] == user_id])

# Predict ratings for the user
unrated_wines = ratings[~ratings['UserID'].isin([user_id])]
predictions = predict_ratings(user_id, unrated_wines, model_svd)

# Get top N recommendations
top_n = 5
top_recommendations = predictions.sort_values(by='PredictedRating', ascending=False).head(top_n)
print("Top N Recommendations:")

# Retrieve corresponding information about the recommended wines
recommended_wines_info = wines[wines['wine_id'].isin(top_recommendations['WineID'])]
print("Top N Recommendations:")
print(recommended_wines_info)



In [15]:
def get_predicted_rating(user_id, wine_id, model):
    '''
    Retrieve the predicted rating for a specific user and wine.

    @param user_id: The ID of the user.
    @param wine_id: The ID of the wine.
    @param model: The trained SVD model.
    @return: The predicted rating.
    '''
    # Predict the rating for the specific user-wine pair
    prediction = model.predict(user_id, wine_id)
    
    return prediction.est

# Example usage
user_id = 1209683
wine_id = 111422  # Replace with the actual WineID you want to test

predicted_rating = get_predicted_rating(user_id, wine_id, model_svd)
print(f"Predicted rating for user {user_id} and wine {wine_id}: {predicted_rating}")


Predicted rating for user 1209683 and wine 111422: 4.335518107337999
