In [60]:
import pandas as pd

ratings = pd.read_csv("wines_ratings_processed.csv", low_memory=False, encoding="utf-8", memory_map=True)
wines_data = pd.read_csv("wines.csv", low_memory=False, encoding="utf-8", memory_map=True)
ratings_data = pd.read_csv("ratings_processed.csv", low_memory=False, encoding="utf-8", memory_map=True)

In [67]:
print(wines_data.head(1).to_markdown())


|    |   WineID | WineName           | Type      | Elaborate     | Grapes         |   ABV | Body          | Acidity   | Code   | Country   |   RegionID | RegionName   |   WineryID | WineryName   |   Aperitif |   Appetizer |   Barbecue |   Beef |   Blue Cheese |   Cake |   Cheese |   Chicken |   Chocolate |   Codfish |   Cold Cuts |   Cream |   Cured Meat |   Dessert |   Duck |   Fish |   French Fries |   Fruit |   Fruit Dessert |   Game Meat |   Goat Cheese |   Grilled |   Ham |   Hard Cheese |   Lamb |   Lean Fish |   Light Stews |   Maturated Cheese |   Mushrooms |   Pasta |   Pizza |   Pork |   Poultry |   Rich Fish |   Risotto |   Salad |   Seafood |   Shellfish |   Snack |   Soft Cheese |   Soufflé |   Spicy Food |   Sweet Dessert |   Tomato Dishes |   Veal |   Vegetarian |
|---:|---------:|:-------------------|:----------|:--------------|:---------------|------:|:--------------|:----------|:-------|:----------|-----------:|:-------------|-----------:|:-------------|-----------:|-

In [68]:
print(ratings_data.head(1).to_markdown())

|    |   RatingID |   UserID |   WineID |   Rating | Date                |
|---:|-----------:|---------:|---------:|---------:|:--------------------|
|  0 |        143 |  1356810 |   103471 |      4.5 | 2021-11-02 20:52:59 |


In [17]:
def inspect_data():
    # Print first few rows of the ratings data
    print("First 5 rows of the ratings data:")
    print(ratings.head())

    # Check for missing values
    print("\nAre there any missing values in the dataset?")
    print(ratings.isnull().sum())

    # Print the shape of the ratings dataset
    print("\nShape of the ratings dataset:")
    print(ratings.shape)

    # Check the unique number of users and wines
    print("\nNumber of unique users and wines:")
    print("Unique Users:", ratings['UserID'].nunique())
    print("Unique Wines:", ratings['WineID'].nunique())

    # Check the distribution of the ratings
    print("\nRating distribution:")
    print(ratings['Rating'].value_counts().sort_index())

    # Print user-item matrix sample
    user_item_matrix = ratings.pivot(index='UserID', columns='WineID', values='Rating').fillna(0)
    print("\nSample of user-item matrix:")
    print(user_item_matrix.head())


inspect_data()


First 5 rows of the ratings data:
   WineID            WineName       Type      Elaborate          Grapes  ABV  \
0  100001  Espumante Moscatel  Sparkling  Varietal/100%  Muscat/Moscato  7.5   
1  100001  Espumante Moscatel  Sparkling  Varietal/100%  Muscat/Moscato  7.5   
2  100001  Espumante Moscatel  Sparkling  Varietal/100%  Muscat/Moscato  7.5   
3  100001  Espumante Moscatel  Sparkling  Varietal/100%  Muscat/Moscato  7.5   
4  100001  Espumante Moscatel  Sparkling  Varietal/100%  Muscat/Moscato  7.5   

            Body Acidity Code Country  ...  Seafood Shellfish  Snack  \
0  Medium-bodied    High   BR  Brazil  ...        0         1      0   
1  Medium-bodied    High   BR  Brazil  ...        0         1      0   
2  Medium-bodied    High   BR  Brazil  ...        0         1      0   
3  Medium-bodied    High   BR  Brazil  ...        0         1      0   
4  Medium-bodied    High   BR  Brazil  ...        0         1      0   

  Soft Cheese  Soufflé  Spicy Food  Sweet Dessert To

TruncatedSVD

In [53]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
import numpy as np

# Prepare the user-item matrix for TruncatedSVD
# TODO problem with filling na with 0!
user_item_matrix = ratings.pivot(index='UserID', columns='WineID', values='Rating').fillna(0)

# Perform TruncatedSVD (matrix factorization)
n_factors = 500  # Number of latent factors
svd = TruncatedSVD(n_components=n_factors)
latent_matrix = svd.fit_transform(user_item_matrix)

# Print explained variance
explained_variance = svd.explained_variance_ratio_.sum()
print(f"Explained variance by {n_factors} components: {explained_variance:.4f}")

# Reconstruct the ratings matrix using the decomposed factors
reconstructed_matrix = np.dot(latent_matrix, svd.components_)

# Calculate the error (optional, but useful for evaluation)
mse = mean_squared_error(user_item_matrix.values, reconstructed_matrix)
print(f"Mean Squared Error: {mse:.4f}")


Explained variance by 500 components: 0.9448
Mean Squared Error: 0.0111


In [59]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
import numpy as np

# Prepare the user-item matrix for TruncatedSVD
user_item_matrix = ratings.pivot(index='UserID', columns='WineID', values='Rating')

# Fill missing values with the mean rating of each item
user_item_matrix = user_item_matrix.apply(lambda x: x.fillna(x.mean()), axis=0)

# Perform TruncatedSVD (matrix factorization)
n_factors = 500  # Number of latent factors
svd = TruncatedSVD(n_components=n_factors)
latent_matrix = svd.fit_transform(user_item_matrix)

# Print explained variance
explained_variance = svd.explained_variance_ratio_.sum()
print(f"Explained variance by {n_factors} components: {explained_variance:.4f}")

# Reconstruct the ratings matrix using the decomposed factors
reconstructed_matrix = np.dot(latent_matrix, svd.components_)

# Calculate the error (optional, but useful for evaluation)
mse = mean_squared_error(user_item_matrix.values, reconstructed_matrix)
print(f"Mean Squared Error: {mse:.4f}")

Explained variance by 500 components: 0.9454
Mean Squared Error: 0.0002


In [43]:
# Baseline model: Predict the average rating for each item
item_means = user_item_matrix.mean(axis=0).values
baseline_predictions = np.tile(item_means, (user_item_matrix.shape[0], 1))

# Calculate MSE for the baseline model
baseline_mse = mean_squared_error(user_item_matrix.values, baseline_predictions)
print(f"Baseline Mean Squared Error: {baseline_mse:.4f}")

# Compare with the SVD model's MSE
print(f"SVD Model Mean Squared Error: {mse:.4f}")

Baseline Mean Squared Error: 0.2015
SVD Model Mean Squared Error: 0.0111


In [44]:
def recommend_wines(user_id, num_recommendations=5):
    user_index = user_item_matrix.index.get_loc(user_id)
    user_ratings = reconstructed_matrix[user_index]
    
    # Get wines that the user hasn't rated yet
    unrated_wines = np.where(user_item_matrix.iloc[user_index] == 0)[0]
    
    # Sort predicted ratings for unrated wines
    recommended_wines = unrated_wines[np.argsort(-user_ratings[unrated_wines])][:num_recommendations]
    
    print(f"Top {num_recommendations} wine recommendations for User {user_id}:")
    return user_item_matrix.columns[recommended_wines]

# Example: Recommend 5 wines for a specific user
recommend_wines(user_id=1000010, num_recommendations=5)


Top 5 wine recommendations for User 1000010:


Index([163306, 141869, 163490, 167124, 169463], dtype='int64', name='WineID')

KNN

In [54]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np

user_item_matrix = ratings.pivot(index='UserID', columns='WineID', values='Rating').fillna(0)

# Instantiate the KNN model
knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_jobs=-1)

# Fit the model on the user-item matrix
knn_model.fit(user_item_matrix.values)


In [55]:
def recommend_wines(user_id, n_recommendations=5):
    # Find the index of the user
    user_index = user_item_matrix.index.get_loc(user_id)
    
    # Get distances and indices of the nearest neighbors
    distances, indices = knn_model.kneighbors(user_item_matrix.iloc[user_index, :].values.reshape(1, -1), n_neighbors=n_recommendations+1)
    
    # Get recommended wine indices from the nearest neighbors
    recommended_wines = []
    for i in range(1, len(indices.flatten())):  # skip the first as it's the user itself
        similar_user_index = indices.flatten()[i]
        similar_user_ratings = user_item_matrix.iloc[similar_user_index]
        recommended_wines.extend(similar_user_ratings[similar_user_ratings > 0].index.tolist())
    
    # Filter unique recommended wines
    recommended_wines = list(set(recommended_wines))
    
    # Create a DataFrame of the recommended wines
    recommended_df = pd.DataFrame(recommended_wines, columns=['Recommended WineID'])
    
    return recommended_df.head(n_recommendations)

# Example usage:
recommended_wines = recommend_wines(user_id=1000010)  # Replace with an actual UserID
print(recommended_wines)


   Recommended WineID
0              167429
1              167433
2              167443
3              111765
4              112149


In [57]:
from sklearn.model_selection import train_test_split

# Split the user-item matrix into training and test sets
train_data, test_data = train_test_split(user_item_matrix, test_size=0.2, random_state=42)

# Fit the KNN model on the training data
knn_model.fit(train_data.values)

def test_recommendations(test_data, n_recommendations=5):
    recommendations = {}
    
    for user_id in test_data.index:
        recommended_wines = recommend_wines(user_id, n_recommendations)
        recommendations[user_id] = recommended_wines['Recommended WineID'].values.tolist()
    
    return recommendations

# Generate recommendations for test set users
test_recommendations_dict = test_recommendations(test_data)


In [58]:
def precision_recall_at_k(recommendations, test_data, k=5):
    precision_scores = []
    recall_scores = []
    
    for user_id, recs in recommendations.items():
        true_relevant_items = test_data.loc[user_id][test_data.loc[user_id] > 0].index.tolist()
        
        if not true_relevant_items:  # No relevant items for this user
            continue
        
        relevant_and_recommended = set(recs) & set(true_relevant_items)
        
        precision = len(relevant_and_recommended) / len(recs) if len(recs) > 0 else 0
        recall = len(relevant_and_recommended) / len(true_relevant_items)
        
        precision_scores.append(precision)
        recall_scores.append(recall)
    
    return np.mean(precision_scores), np.mean(recall_scores)

precision, recall = precision_recall_at_k(test_recommendations_dict, test_data)
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}")

Precision: 0.0404, Recall: 0.0143


## 1.2 sklearn.decomposition.TruncatedSVD

In [19]:
# LensKit expects a DataFrame with columns: user, item, rating
ratings_lk = ratings[['UserID', 'WineID', 'Rating']].rename(columns={'UserID': 'user', 'WineID': 'item', 'Rating': 'rating'})

print(ratings_lk)

           user    item  rating
0       1756594  100001     4.0
1       1219305  100001     2.5
2       2047929  100001     3.5
3       1006545  100001     5.0
4       1400823  100001     2.0
...         ...     ...     ...
149995  1988282  199533     3.0
149996  1000085  199885     3.5
149997  1254482  199885     4.0
149998  1239634  199885     3.5
149999  1748857  200139     4.0

[150000 rows x 3 columns]


In [20]:
ratings_matrix = ratings_lk.pivot(index='user', columns='item', values='rating').fillna(0)

print(ratings_lk)

           user    item  rating
0       1756594  100001     4.0
1       1219305  100001     2.5
2       2047929  100001     3.5
3       1006545  100001     5.0
4       1400823  100001     2.0
...         ...     ...     ...
149995  1988282  199533     3.0
149996  1000085  199885     3.5
149997  1254482  199885     4.0
149998  1239634  199885     3.5
149999  1748857  200139     4.0

[150000 rows x 3 columns]


In [23]:
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
train_matrix, test_matrix = train_test_split(ratings_matrix, test_size=0.2, random_state=420)

svd = TruncatedSVD(n_components=2)
rat_mat_reduced = svd.fit_transform(train_matrix)

print("\nReduced User-Item Matrix:")
print(rat_mat_reduced)


Reduced User-Item Matrix:
[[ 2.74587043 -4.24895644]
 [ 4.10707679 -0.12280195]
 [ 3.8778463  -1.99091606]
 ...
 [ 8.17631934  5.36503718]
 [ 2.65895047 -4.68408564]
 [ 1.10763426  0.84064222]]


In [24]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score, recall_score

# Function to predict ratings
def predict_ratings(svd, user_item_matrix):
    return svd.inverse_transform(svd.transform(user_item_matrix))

# Function to recommend top N items for each user
def recommend_top_n(predictions, n=5):
    recommendations = {}
    for user_id in range(predictions.shape[0]):
        user_ratings = predictions[user_id]
        top_n_items = np.argsort(user_ratings)[-n:][::-1]
        recommendations[user_id] = top_n_items
    return recommendations

# Predict ratings for the test set
predicted_ratings = predict_ratings(svd, test_matrix)

# Evaluate the predicted ratings using MSE or RMSE
mse = mean_squared_error(test_matrix, predicted_ratings)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")

# Generate top N recommendations for each user
top_n_recommendations = recommend_top_n(predicted_ratings, n=5)


RMSE: 0.4452369832316882


In [6]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity


wines = pd.read_csv("wines.csv")
ratings = pd.read_csv("wines_ratings_processed.csv")

# Step 1: Create a user-item matrix
user_item_matrix = ratings.pivot(index='UserID', columns='WineID', values='Rating').fillna(0)

print(user_item_matrix)

WineID   100001  100002  100003  100005  100007  100008  100010  100012  \
UserID                                                                    
1000004     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1000010     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1000021     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1000023     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1000024     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
...         ...     ...     ...     ...     ...     ...     ...     ...   
2061042     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2061195     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2062232     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2062388     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2062618     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

WineID   100013  100014 