In [110]:
import pandas as pd
import numpy as np

# Load the XWines "slim" ratings CSV (one row per rating event).
# The path is written with a forward slash: the previous backslash formed the
# invalid escape sequence "\X" inside a normal string literal (Python emits a
# SyntaxWarning for it), and a forward slash is accepted on Windows as well as
# on POSIX systems, so the notebook is no longer tied to one platform.
ratings_df_slim = pd.read_csv("XWines_Slim_1K_wines_150K_ratings/XWines_Slim_150K_ratings.csv")

print(ratings_df_slim.head())

   RatingID   UserID  WineID Vintage  Rating                 Date
0       143  1356810  103471    1950     4.5  2021-11-02 20:52:59
1       199  1173759  111415    1951     5.0  2015-08-20 17:46:26
2       348  1164877  111395    1952     5.0  2020-11-13 05:40:26
3       374  1207665  111433    1953     5.0  2017-05-05 06:44:13
4       834  1075841  111431    1955     5.0  2016-09-14 20:18:38


  ratings_df_slim = pd.read_csv("XWines_Slim_1K_wines_150K_ratings\XWines_Slim_150K_ratings.csv")


In [111]:
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.preprocessing import LabelEncoder

class EASE:
    """
    Embarrassingly Shallow Autoencoder (EASE) item-item recommender.

    The model learns a dense item-item weight matrix ``B`` in closed form from a
    user-item interaction matrix ``X`` and scores items for a user as ``X @ B``.
    Input frames are expected to carry the columns 'UserID', 'WineID' and
    (for explicit feedback) 'Rating'.
    """

    def __init__(self):
        self.user_enc = LabelEncoder() # encode user
        self.item_enc = LabelEncoder() # encode item

    def _get_users_and_items(self, df):
        """
        Encodes user and item IDs into numerical indices.

        Both encoders are (re)fitted on ``df``, so any previous fit is replaced.

        - Parameters:
        df (pandas.DataFrame): DataFrame containing 'UserID' and 'WineID' columns.

        - Returns:
        tuple: (users, items) arrays of encoded indices, one entry per row of df.
        """
        users = self.user_enc.fit_transform(df.loc[:, 'UserID'])
        items = self.item_enc.fit_transform(df.loc[:, 'WineID'])
        return users, items

    def fit(self, df, lambda_: float = 0.5, implicit=True):
        """
        Fits the EASE model to the provided data.

        Stores the interaction matrix in ``self.X``, the learned item-item
        weights in ``self.B`` and the dense score matrix ``X @ B`` in ``self.pred``.

        - Parameters:
        df (pandas.DataFrame): DataFrame with columns 'UserID', 'WineID' and (when implicit is False) 'Rating'.
        lambda_ (float): L2-regularization term added to the diagonal of the Gram matrix.
        implicit (bool): If True, ratings are ignored and taken as 1; else ratings divided by the maximum rating are used.
        """
        users, items = self._get_users_and_items(df)
        values = (
            np.ones(df.shape[0])
            if implicit
            else df['Rating'].to_numpy() / df['Rating'].max()
        )

        # Note: csr_matrix sums duplicate (user, item) entries, so a user who
        # rated the same wine several times gets the sum of those values.
        X = csr_matrix((values, (users, items)))
        self.X = X

        # Closed-form solution: B = P / -diag(P) with P = (X^T X + lambda * I)^-1,
        # followed by zeroing the diagonal so an item never predicts itself.
        G = X.T.dot(X).toarray()
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = np.linalg.inv(G)
        B = P / (-np.diag(P))
        B[diagIndices] = 0

        self.B = B
        self.pred = X.dot(B)

    def predict(self, train, users, items, k):
        """
        Generates top-k item recommendations for the specified users.

        Only users that have at least one row in ``train`` receive
        recommendations; items they already have in ``train`` are excluded.

        - Parameters:
        train (pandas.DataFrame): Training data DataFrame with columns 'UserID' and 'WineID'.
        users (list): List of user IDs for whom to generate recommendations.
        items (list): List of item IDs to consider for recommendations.
        k (int): Number of top recommendations to return for each user.

        - Returns:
        pandas.DataFrame: DataFrame with columns 'UserID', 'WineID' and 'Rating'
        (the rescaled score); empty when nothing can be recommended.
        """
        items = self.item_enc.transform(items)
        # Work on an explicit copy so adding helper columns neither mutates the
        # caller's frame nor triggers pandas' SettingWithCopyWarning.
        dd = train.loc[train.UserID.isin(users)].copy()
        dd['ci'] = self.item_enc.transform(dd.WineID)
        dd['cu'] = self.user_enc.transform(dd.UserID)

        results = []
        for user, group in dd.groupby('cu'):
            user_pred = self.predict_for_user(user, group, self.pred[user, :], items, k)
            # Skip users with no recommendable item to keep the concat clean.
            if not user_pred.empty:
                results.append(user_pred)

        if not results:
            # pd.concat([]) raises; return a correctly-typed empty frame instead.
            return pd.DataFrame(
                {
                    "UserID": np.array([], dtype=self.user_enc.classes_.dtype),
                    "WineID": np.array([], dtype=self.item_enc.classes_.dtype),
                    "Rating": np.array([], dtype=float),
                }
            )

        df = pd.concat(results)
        df['WineID'] = self.item_enc.inverse_transform(df['WineID'])
        df['UserID'] = self.user_enc.inverse_transform(df['UserID'])
        return df

    @staticmethod
    def predict_for_user(user, group, pred, items, k):
        """
        Generates top-k item recommendations for a single user.

        Scores of the candidate items are min-max rescaled to the range 0 to 5
        per user (2.5 when all candidate scores are equal) before ranking.

        - Parameters:
        user (int): Encoded user ID.
        group (pandas.DataFrame): Rows of this user; column 'ci' holds the encoded items already seen.
        pred (numpy.ndarray): Predicted scores for all items for the user.
        items (numpy.ndarray): Encoded item IDs to consider for recommendations.
        k (int): Number of top recommendations to return; capped at the number of unseen candidates.

        - Returns:
        pandas.DataFrame: Columns 'UserID', 'WineID' (both still encoded) and 'Rating',
        sorted by 'Rating' descending. Empty when k <= 0 or no candidate is left.
        """
        items = np.asarray(items)
        # Vectorised "not already seen" filter; keeps the original order of items.
        seen = group['ci'].to_numpy()
        candidates = items[~np.isin(items, seen)]

        n_candidates = len(candidates)
        if n_candidates == 0 or k <= 0:
            return pd.DataFrame(
                {
                    "UserID": np.array([], dtype=np.int64),
                    "WineID": np.array([], dtype=np.int64),
                    "Rating": np.array([], dtype=float),
                }
            )

        pred = np.take(pred, candidates)

        # Scale the predictions to the original rating range (0 to 5)
        min_pred, max_pred = np.min(pred), np.max(pred)
        if max_pred != min_pred:
            pred = 5 * (pred - min_pred) / (max_pred - min_pred)
        else:
            pred = np.full_like(pred, 2.5)  # Set to midpoint of the range if all values are the same

        # argpartition requires the pivot to be inside the array, so never ask
        # for more items than there are candidates.
        top_k = min(k, n_candidates)
        res = np.argpartition(pred, -top_k)[-top_k:]
        r = pd.DataFrame(
            {
                "UserID": [user] * len(res),
                "WineID": np.take(candidates, res),
                "Rating": np.take(pred, res),
            }
        ).sort_values('Rating', ascending=False)
        return r

In [112]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder

class EASE_sim:
    """
    EASE-style recommender that uses a precomputed item-item matrix.

    Instead of learning the item-item weights from the interactions, ``fit``
    loads them from a ``.npy`` file and scores items for a user as ``X @ B``.
    Input frames are expected to carry the columns 'UserID', 'WineID' and
    (for explicit feedback) 'Rating'.
    """

    def __init__(self):
        self.user_enc = LabelEncoder() # encode user
        self.item_enc = LabelEncoder() # encode item

    def _get_users_and_items(self, df):
        """
        Encodes user and item IDs into numerical indices.

        Both encoders are (re)fitted on ``df``, so any previous fit is replaced.

        - Parameters:
        df (pandas.DataFrame): DataFrame containing 'UserID' and 'WineID' columns.

        - Returns:
        tuple: (users, items) arrays of encoded indices, one entry per row of df.
        """
        users = self.user_enc.fit_transform(df.loc[:, 'UserID'])
        items = self.item_enc.fit_transform(df.loc[:, 'WineID'])
        return users, items

    def fit(self, df, similarity_matrix_path, implicit=True):
        """
        Fits the model to the provided data using a precomputed similarity matrix.

        Stores the interaction matrix in ``self.X`` (one row per distinct user),
        the loaded matrix in ``self.B`` and the dense scores ``X @ B`` in ``self.pred``.

        - Parameters:
        df (pandas.DataFrame): DataFrame with columns 'UserID', 'WineID' and (when implicit is False) 'Rating'.
        similarity_matrix_path (str): Path to the precomputed item-item similarity matrix (numpy .npy file).
        implicit (bool): If True, ratings are ignored and taken as 1; else ratings divided by the maximum rating are used.

        - Raises:
        ValueError: If the loaded matrix is not square, or has fewer rows than there are distinct items in df.
        """
        users, items = self._get_users_and_items(df)
        values = (
            np.ones(df.shape[0])
            if implicit
            else df['Rating'].to_numpy() / df['Rating'].max()
        )

        # Load the precomputed item-item similarity matrix
        B = np.load(similarity_matrix_path)
        if B.ndim != 2 or B.shape[0] != B.shape[1]:
            raise ValueError(
                f"similarity matrix must be square, got shape {B.shape}"
            )

        n_users = len(self.user_enc.classes_)
        n_items = len(self.item_enc.classes_)
        if n_items > B.shape[0]:
            raise ValueError(
                f"similarity matrix covers {B.shape[0]} items but the data contains {n_items}"
            )

        # NOTE(review): column j of X is the j-th item in the encoder's order.
        # This presumes the similarity matrix rows/columns follow that same
        # ordering (and item set) - confirm against how the .npy was produced.
        #
        # The row count is the number of distinct users. Using len(users) (the
        # number of ratings) would pad X with empty rows and make the dense
        # product below n_ratings x n_items in size.
        # Duplicate (user, item) entries are summed by csr_matrix.
        X = csr_matrix((values, (users, items)), shape=(n_users, B.shape[0]))
        self.X = X

        self.B = B
        self.pred = X.dot(B)

    def predict(self, train, users, items, k):
        """
        Generates top-k item recommendations for the specified users.

        Only users that have at least one row in ``train`` receive
        recommendations; items they already have in ``train`` are excluded.

        - Parameters:
        train (pandas.DataFrame): Training data DataFrame with columns 'UserID' and 'WineID'.
        users (list): List of user IDs for whom to generate recommendations.
        items (list): List of item IDs to consider for recommendations.
        k (int): Number of top recommendations to return for each user.

        - Returns:
        pandas.DataFrame: DataFrame with columns 'UserID', 'WineID' and 'Rating'
        (the rescaled score); empty when nothing can be recommended.
        """
        items = self.item_enc.transform(items)
        # Work on an explicit copy so adding helper columns neither mutates the
        # caller's frame nor triggers pandas' SettingWithCopyWarning.
        dd = train.loc[train.UserID.isin(users)].copy()
        dd['ci'] = self.item_enc.transform(dd.WineID)
        dd['cu'] = self.user_enc.transform(dd.UserID)

        results = []
        for user, group in dd.groupby('cu'):
            user_pred = self.predict_for_user(user, group, self.pred[user, :], items, k)
            # Skip users with no recommendable item to keep the concat clean.
            if not user_pred.empty:
                results.append(user_pred)

        if not results:
            # pd.concat([]) raises; return a correctly-typed empty frame instead.
            return pd.DataFrame(
                {
                    "UserID": np.array([], dtype=self.user_enc.classes_.dtype),
                    "WineID": np.array([], dtype=self.item_enc.classes_.dtype),
                    "Rating": np.array([], dtype=float),
                }
            )

        df = pd.concat(results)
        df['WineID'] = self.item_enc.inverse_transform(df['WineID'])
        df['UserID'] = self.user_enc.inverse_transform(df['UserID'])
        return df

    @staticmethod
    def predict_for_user(user, group, pred, items, k):
        """
        Generates top-k item recommendations for a single user.

        Scores of the candidate items are min-max rescaled to the range 0 to 5
        per user (2.5 when all candidate scores are equal) before ranking.

        - Parameters:
        user (int): Encoded user ID.
        group (pandas.DataFrame): Rows of this user; column 'ci' holds the encoded items already seen.
        pred (numpy.ndarray): Predicted scores for all items for the user.
        items (numpy.ndarray): Encoded item IDs to consider for recommendations.
        k (int): Number of top recommendations to return; capped at the number of unseen candidates.

        - Returns:
        pandas.DataFrame: Columns 'UserID', 'WineID' (both still encoded) and 'Rating',
        sorted by 'Rating' descending. Empty when k <= 0 or no candidate is left.
        """
        items = np.asarray(items)
        # Vectorised "not already seen" filter; keeps the original order of items.
        seen = group['ci'].to_numpy()
        candidates = items[~np.isin(items, seen)]

        n_candidates = len(candidates)
        if n_candidates == 0 or k <= 0:
            return pd.DataFrame(
                {
                    "UserID": np.array([], dtype=np.int64),
                    "WineID": np.array([], dtype=np.int64),
                    "Rating": np.array([], dtype=float),
                }
            )

        pred = np.take(pred, candidates)

        # Scale the predictions to the original rating range (0 to 5)
        min_pred, max_pred = np.min(pred), np.max(pred)
        if max_pred != min_pred:
            pred = 5 * (pred - min_pred) / (max_pred - min_pred)
        else:
            pred = np.full_like(pred, 2.5)  # Set to midpoint of the range if all values are the same

        # argpartition requires the pivot to be inside the array, so never ask
        # for more items than there are candidates.
        top_k = min(k, n_candidates)
        res = np.argpartition(pred, -top_k)[-top_k:]
        r = pd.DataFrame(
            {
                "UserID": [user] * len(res),
                "WineID": np.take(candidates, res),
                "Rating": np.take(pred, res),
            }
        ).sort_values('Rating', ascending=False)
        return r

In [113]:
from sklearn.model_selection import train_test_split

# Reduce the ratings table to the columns the recommenders consume
# (UserID, WineID, Rating). Despite its name this is a long-format
# interaction table, one row per rating, not a pivoted matrix.
user_item_matrix = ratings_df_slim.drop(['RatingID', 'Vintage', 'Date'], axis=1)

# Hold out 20% of the rating rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
train, test = train_test_split(
    user_item_matrix,
    test_size=0.2,
    random_state=42,
)

print(user_item_matrix.head())

    UserID  WineID  Rating
0  1356810  103471     4.5
1  1173759  111415     5.0
2  1164877  111395     5.0
3  1207665  111433     5.0
4  1075841  111431     5.0


In [114]:
import numpy as np

# Load the precomputed wine-to-wine similarity matrix from a NumPy .npy file.
# The same path is later handed to EASE_sim.fit, which loads the file again.
similarity_matrix_path = 'wine_similarities.npy'
wine_similarities = np.load(similarity_matrix_path)

# Print the array (NumPy abbreviates large arrays with "..." in the output)
print(wine_similarities)

[[1.         0.34243759 0.25715279 ... 0.08090713 0.04792404 0.00931809]
 [0.34243759 1.         0.22099944 ... 0.02145987 0.02227039 0.04432614]
 [0.25715279 0.22099944 1.         ... 0.06395532 0.03669954 0.0404726 ]
 ...
 [0.08090713 0.02145987 0.06395532 ... 1.         0.13548625 0.01549924]
 [0.04792404 0.02227039 0.03669954 ... 0.13548625 1.         0.00610152]
 [0.00931809 0.04432614 0.0404726  ... 0.01549924 0.00610152 1.        ]]


In [115]:
# Fit both recommenders on the training split, using the ratings as explicit
# feedback (implicit=False).
ease = EASE()  # normal EASE model
ease.fit(train, lambda_=0.5, implicit=False)

ease_sim = EASE_sim()  # EASE model with precomputed similarities
similarity_matrix_path = 'wine_similarities.npy'
ease_sim.fit(train, similarity_matrix_path, implicit=False)

# Recommend for every user seen in training, choosing among every wine seen
# in training.
users = train['UserID'].unique()
items = train['WineID'].unique()

# Top-5 recommendations from the closed-form EASE model
predictions = ease.predict(train, users, items, k=5)
print('Preditions for EASE model:')
print(predictions.head())

print()

# Top-5 recommendations from the similarity-matrix variant
print('Preditions for EASE model with precomputed similarities:')
predictions_sim = ease_sim.predict(train, users, items, k=5)
print(predictions_sim.head())

Preditions for EASE model:
    UserID  WineID    Rating
2  1000004  111479  5.000000
4  1000004  162497  4.634106
3  1000004  135871  4.246498
1  1000004  167443  4.118155
0  1000004  179044  4.092122

Preditions for EASE model with precomputed similarities:
    UserID  WineID    Rating
3  1000004  111544  5.000000
4  1000004  112834  4.995180
2  1000004  113600  4.795602
1  1000004  113288  4.742439
0  1000004  111479  4.670854


In [116]:
# Evaluate the recommendations
from sklearn.metrics import mean_squared_error

# Score each model on the (UserID, WineID) pairs that appear both in its
# recommendations and in the held-out test split (inner join). Overlapping
# 'Rating' columns are disambiguated as Rating_pred / Rating_actual.
# NOTE(review): Rating_pred is the per-user rescaled score produced by
# predict_for_user, so these errors are a rough comparison between the two
# models rather than a calibrated rating-prediction error.

# --- EASE ---
merged = predictions.merge(test, on=['UserID', 'WineID'], suffixes=('_pred', '_actual'))
merged = merged.dropna(subset=['Rating_actual'])  # keep only rows with a known actual rating
predicted_scores = merged['Rating_pred']
actual_scores = merged['Rating_actual']
mse = mean_squared_error(actual_scores, predicted_scores)
rmse = np.sqrt(mse)

# --- EASE with precomputed similarities ---
merged_sim = predictions_sim.merge(test, on=['UserID', 'WineID'], suffixes=('_pred', '_actual'))
merged_sim = merged_sim.dropna(subset=['Rating_actual'])  # keep only rows with a known actual rating
predicted_scores_sim = merged_sim['Rating_pred']
actual_scores_sim = merged_sim['Rating_actual']
mse_sim = mean_squared_error(actual_scores_sim, predicted_scores_sim)
rmse_sim = np.sqrt(mse_sim)

# Report both models side by side
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')

print()

print(f'MSE with precomputed similarities: {mse_sim}')
print(f'RMSE with precomputed similarities: {rmse_sim}')

MSE: 0.7658744467276951
RMSE: 0.8751425293788979

MSE with precomputed similarities: 0.9666028493119528
RMSE with precomputed similarities: 0.9831596255501712
