# Wine Ratings Group Recommendation System

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
from numpy.linalg import inv
import numpy as np

# Load the datasets
ratings_df = pd.read_csv('XWines_Slim_150K_ratings.csv')
wines_df = pd.read_csv('XWines_Slim_1K_wines.csv')

# Prepare the data for clustering
# Pivot the ratings dataframe to have users as rows and wines as columns
ratings_pivot = ratings_df.pivot_table(index='UserID', columns='WineID', values='Rating')

# Fill missing values with 0 (unrated wines)
ratings_pivot = ratings_pivot.fillna(0)

scaler = StandardScaler()
ratings_normalized = scaler.fit_transform(ratings_pivot)

# Run K-means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(ratings_normalized)

ratings_pivot['Cluster'] = clusters

print(ratings_pivot)

  ratings_df = pd.read_csv('XWines_Slim_150K_ratings.csv')


WineID   100001  100002  100003  100005  100007  100008  100010  100012  \
UserID                                                                    
1000004     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1000010     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1000021     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1000023     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1000024     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
...         ...     ...     ...     ...     ...     ...     ...     ...   
2061042     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2061195     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2062232     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2062388     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2062618     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

WineID   100013  100014 

In [25]:
def train_EASE_sparse(X, lambda_=0.5):
    X = X.toarray()  # Convert sparse matrix to dense
    G = X.T @ X  # Gram matrix
    diag_indices = np.diag_indices(G.shape[0])
    G[diag_indices] += lambda_  # Add regularization to the diagonal
    P = inv(G)
    B = P / (-np.diag(P))
    B[diag_indices] = 0  # Set the diagonal to 0
    return B

# Create sparse matrix for user-item interaction
interaction_matrix = csr_matrix(ratings_pivot.drop(columns=['Cluster']).values)

ease_B_sparse = train_EASE_sparse(interaction_matrix, lambda_=0.1)

# Function to get group recommendations with explanations
def get_group_recommendations_with_explanations(B, cluster_id, top_n=10):
    cluster_users = ratings_pivot[ratings_pivot['Cluster'] == cluster_id].drop(columns=['Cluster']).values
    
    # Calculate the average score for wines across the users in the cluster
    group_scores = cluster_users.mean(axis=0) @ B  
    
    # Get the top N recommended wines
    top_items = np.argsort(group_scores)[-top_n:][::-1]  
    
    # Explanation for why each wine is recommended
    explanations = {}
    for wine_idx in top_items:
        # Find the wines that contributed the most to this recommendation
        # This is done by checking which wines have the highest values in the B matrix for the recommended wine
        wine_contributors = np.argsort(B[:, wine_idx])[-3:][::-1]  # Top 3 contributing wines
        explanations[wine_idx] = wine_contributors  # Store the top contributors for each recommended wine
    
    # Return the recommendations and their explanations
    return top_items, explanations

# Get recommendations and explanations for each cluster
recommendations_with_explanations = {}
for cluster_id in range(5):  
    top_items_sparse, explanations_sparse = get_group_recommendations_with_explanations(ease_B_sparse, cluster_id, top_n=10)
    
    recommendations_with_explanations[cluster_id] = {
        "recommendations": top_items_sparse,
        "explanations": explanations_sparse
    }

print("Final Recommendations with Explanations:")
for cluster_id, data in recommendations_with_explanations.items():
    print(f"\nCluster {cluster_id}:")
    print(f"  Recommendations:")
    
    # For each recommendation, print the wine and its explanation
    for wine_id in data['recommendations']:
        print(f"    Wine ID {wine_id} is recommended.")
        print(f"    Top contributing wines for this recommendation:")
        
        # Get the top contributing wines and explain why they contributed
        contributing_wines = data['explanations'][wine_id]
        for contrib_wine in contributing_wines:
            print(f"      - Wine ID {contrib_wine} contributed to this recommendation based on user ratings.")


Final Recommendations with Explanations:

Cluster 0:
  Recommendations:
    Wine ID 187 is recommended.
    Top contributing wines for this recommendation:
      - Wine ID 167 contributed to this recommendation based on user ratings.
      - Wine ID 510 contributed to this recommendation based on user ratings.
      - Wine ID 1000 contributed to this recommendation based on user ratings.
    Wine ID 516 is recommended.
    Top contributing wines for this recommendation:
      - Wine ID 495 contributed to this recommendation based on user ratings.
      - Wine ID 131 contributed to this recommendation based on user ratings.
      - Wine ID 878 contributed to this recommendation based on user ratings.
    Wine ID 764 is recommended.
    Top contributing wines for this recommendation:
      - Wine ID 505 contributed to this recommendation based on user ratings.
      - Wine ID 167 contributed to this recommendation based on user ratings.
      - Wine ID 878 contributed to this recommendat

In [26]:
# Generate recommendations for each cluster
recommendations_sparse = {}
for cluster_id in range(5):
    top_items_sparse = get_group_recommendations_with_explanations(ease_B_sparse, cluster_id, top_n=10)
    recommendations_sparse[cluster_id] = top_items_sparse

recommendations_sparse

{0: (array([187, 516, 764, 906, 193, 190, 209, 195, 194, 374], dtype=int64),
  {187: array([ 167,  510, 1000], dtype=int64),
   516: array([495, 131, 878], dtype=int64),
   764: array([505, 167, 878], dtype=int64),
   906: array([ 847, 1000,  497], dtype=int64),
   193: array([179, 763, 994], dtype=int64),
   190: array([1001,  515,  185], dtype=int64),
   209: array([715, 370, 177], dtype=int64),
   195: array([1006,  179,  186], dtype=int64),
   194: array([ 179,  715, 1004], dtype=int64),
   374: array([ 897,  990, 1006], dtype=int64)}),
 1: (array([559, 649, 639, 636, 640, 637, 562,  54, 641, 651], dtype=int64),
  {559: array([ 51, 897, 711], dtype=int64),
   649: array([ 1, 47, 49], dtype=int64),
   639: array([ 49,  41, 988], dtype=int64),
   636: array([ 897, 1001,  715], dtype=int64),
   640: array([ 184,  505, 1004], dtype=int64),
   637: array([514, 515, 820], dtype=int64),
   562: array([878, 895, 986], dtype=int64),
   54: array([369, 761,  41], dtype=int64),
   641: array(

In [28]:
ratings_pivot_trimmed = ratings_pivot.iloc[:, :1007]
predicted_ratings_matrix = ratings_pivot_trimmed.values @ ease_B_sparse

def calculate_rmse(predicted_ratings, actual_ratings):
    # Mask for non-zero actual ratings (i.e., items that were rated by users)
    non_zero_mask = actual_ratings != 0
    
    # Calculate squared errors only for actual ratings (ignoring unrated items)
    squared_errors = (predicted_ratings[non_zero_mask] - actual_ratings[non_zero_mask]) ** 2
    
    # Compute the mean of the squared errors and take the square root
    rmse = np.sqrt(np.mean(squared_errors))
    
    return rmse

# Use the trimmed actual ratings matrix for comparison
actual_ratings = ratings_pivot_trimmed.values  # Actual ratings matrix from the dataset

# Calculate RMSE
rmse_value = calculate_rmse(predicted_ratings_matrix, actual_ratings)
print(f"RMSE: {rmse_value}")


RMSE: 3.0885827627485685
