In [1]:
import pandas as pd
import numpy as np

# first we load and preprocess the wines dataset 

wines_df_slim = pd.read_csv("XWines_Slim_1K_wines_150K_ratings\XWines_Slim_1K_wines.csv")
ratings_df_slim = pd.read_csv("XWines_Slim_1K_wines_150K_ratings\XWines_Slim_150K_ratings.csv")

# Merge wines and ratings on WineID
merged_data = pd.merge(wines_df_slim, ratings_df_slim, on='WineID')

print(ratings_df_slim.head())


   RatingID   UserID  WineID Vintage  Rating                 Date
0       143  1356810  103471    1950     4.5  2021-11-02 20:52:59
1       199  1173759  111415    1951     5.0  2015-08-20 17:46:26
2       348  1164877  111395    1952     5.0  2020-11-13 05:40:26
3       374  1207665  111433    1953     5.0  2017-05-05 06:44:13
4       834  1075841  111431    1955     5.0  2016-09-14 20:18:38


  ratings_df_slim = pd.read_csv("XWines_Slim_1K_wines_150K_ratings\XWines_Slim_150K_ratings.csv")


In [2]:
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from multiprocessing import Pool, cpu_count

class EASE:
    def __init__(self):
        self.user_enc = LabelEncoder() # encode user
        self.item_enc = LabelEncoder() # encode item

    def _get_users_and_items(self, df):
        """
        Encodes user and item IDs into numerical indices.

        - Parameters:
        df (pandas.DataFrame): DataFrame containing user and item IDs.

        - Returns:
        tuple: Encoded user and item indices.
        """
        users = self.user_enc.fit_transform(df.loc[:, 'UserID'])
        items = self.item_enc.fit_transform(df.loc[:, 'WineID'])
        return users, items

    def fit(self, df, lambda_: float = 0.5, implicit=True):
        """
        Fits the EASE model to the provided data.

        - Parameters:
        df (pandas.DataFrame): DataFrame with columns user_id, item_id, and (optionally) rating.
        lambda_ (float): L2-regularization term.
        implicit (bool): If True, ratings are ignored and taken as 1; else normalized ratings are used.
        """
        users, items = self._get_users_and_items(df)
        values = (
            np.ones(df.shape[0])
            if implicit
            else df['Rating'].to_numpy() / df['Rating'].max()
        )

        X = csr_matrix((values, (users, items)))
        self.X = X

        G = X.T.dot(X).toarray()
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = np.linalg.inv(G)
        B = P / (-np.diag(P))
        B[diagIndices] = 0

        self.B = B
        self.pred = X.dot(B)
    
    def predict(self, train, users, items, k):
        """
        Generates top-k item recommendations for the specified users.

        - Parameters:
        train (pandas.DataFrame): Training data DataFrame with columns user_id and item_id.
        users (list): List of user IDs for whom to generate recommendations.
        items (list): List of item IDs to consider for recommendations.
        k (int): Number of top recommendations to return for each user.

        - Returns:
        pandas.DataFrame: DataFrame containing user_id, item_id, and predicted scores.
        """
        items = self.item_enc.transform(items)
        dd = train.loc[train.UserID.isin(users)]
        dd['ci'] = self.item_enc.transform(dd.WineID)
        dd['cu'] = self.user_enc.transform(dd.UserID)
        g = dd.groupby('cu')

        results = []

        for user, group in g:
            user_pred = self.predict_for_user(user, group, self.pred[user, :], items, k)
            results.append(user_pred)

        df = pd.concat(results)
        df['item_id'] = self.item_enc.inverse_transform(df['item_id'])
        df['user_id'] = self.user_enc.inverse_transform(df['user_id'])
        return df

    @staticmethod
    def predict_for_user(user, group, pred, items, k):
        """
        Generates top-k item recommendations for a single user.

        - Parameters:
        user (int): Encoded user ID.
        group (pandas.DataFrame): Grouped DataFrame for the user.
        pred (numpy.ndarray): Predicted scores for all items for the user.
        items (numpy.ndarray): Encoded item IDs to consider for recommendations.
        k (int): Number of top recommendations to return.

        - Returns:
        pandas.DataFrame: DataFrame containing user_id, item_id, and predicted scores.
        """
        watched = set(group['ci'])
        candidates = [item for item in items if item not in watched]
        pred = np.take(pred, candidates)

        # Scale the predictions to the original rating range (0 to 5)
        min_pred, max_pred = np.min(pred), np.max(pred)
        if max_pred != min_pred:
            pred = 5 * (pred - min_pred) / (max_pred - min_pred)
        else:
            pred = np.full_like(pred, 2.5)  # Set to midpoint of the range if all values are the same
        

        res = np.argpartition(pred, -k)[-k:]
        r = pd.DataFrame(
            {
                "user_id": [user] * len(res),
                "item_id": np.take(candidates, res),
                "score": np.take(pred, res),
            }
        ).sort_values('score', ascending=False)
        return r

In [8]:
# construct user-item mattrix
# with columns user_id, item_id and (rating)

user_item_matrix = ratings_df_slim.drop(columns=['RatingID','Vintage','Date'])

print(user_item_matrix.head())

    UserID  WineID  Rating
0  1356810  103471     4.5
1  1173759  111415     5.0
2  1164877  111395     5.0
3  1207665  111433     5.0
4  1075841  111431     5.0


In [6]:
import numpy as np

# Load the wine similarities array
wine_similarities = np.load('wine_similarities.npy')

# Print the array
print(wine_similarities)

[[1.         0.34243759 0.25715279 ... 0.08090713 0.04792404 0.00931809]
 [0.34243759 1.         0.22099944 ... 0.02145987 0.02227039 0.04432614]
 [0.25715279 0.22099944 1.         ... 0.06395532 0.03669954 0.0404726 ]
 ...
 [0.08090713 0.02145987 0.06395532 ... 1.         0.13548625 0.01549924]
 [0.04792404 0.02227039 0.03669954 ... 0.13548625 1.         0.00610152]
 [0.00931809 0.04432614 0.0404726  ... 0.01549924 0.00610152 1.        ]]


In [9]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder

class EASE:
    def __init__(self):
        self.user_enc = LabelEncoder() # encode user
        self.item_enc = LabelEncoder() # encode item

    def _get_users_and_items(self, df):
        """
        Encodes user and item IDs into numerical indices.

        - Parameters:
        df (pandas.DataFrame): DataFrame containing user and item IDs.

        - Returns:
        tuple: Encoded user and item indices.
        """
        users = self.user_enc.fit_transform(df.loc[:, 'UserID'])
        items = self.item_enc.fit_transform(df.loc[:, 'WineID'])
        return users, items

    def fit(self, df, similarity_matrix_path, implicit=True):
        """
        Fits the EASE model to the provided data using a precomputed similarity matrix.

        - Parameters:
        df (pandas.DataFrame): DataFrame with columns user_id, item_id, and (optionally) rating.
        similarity_matrix_path (str): Path to the precomputed item-item similarity matrix (numpy .npy file).
        implicit (bool): If True, ratings are ignored and taken as 1; else normalized ratings are used.
        """
        users, items = self._get_users_and_items(df)
        values = (
            np.ones(df.shape[0])
            if implicit
            else df['Rating'].to_numpy() / df['Rating'].max()
        )

        X = csr_matrix((values, (users, items)))
        self.X = X

        # Load the precomputed item-item similarity matrix
        B = np.load(similarity_matrix_path)
        self.B = B
        self.pred = X.dot(B)
    
    def predict(self, train, users, items, k):
        """
        Generates top-k item recommendations for the specified users.

        - Parameters:
        train (pandas.DataFrame): Training data DataFrame with columns user_id and item_id.
        users (list): List of user IDs for whom to generate recommendations.
        items (list): List of item IDs to consider for recommendations.
        k (int): Number of top recommendations to return for each user.

        - Returns:
        pandas.DataFrame: DataFrame containing user_id, item_id, and predicted scores.
        """
        items = self.item_enc.transform(items)
        dd = train.loc[train.UserID.isin(users)]
        dd['ci'] = self.item_enc.transform(dd.WineID)
        dd['cu'] = self.user_enc.transform(dd.UserID)
        g = dd.groupby('cu')

        results = []

        for user, group in g:
            user_pred = self.predict_for_user(user, group, self.pred[user, :], items, k)
            results.append(user_pred)

        df = pd.concat(results)
        df['item_id'] = self.item_enc.inverse_transform(df['item_id'])
        df['user_id'] = self.user_enc.inverse_transform(df['user_id'])
        return df

    @staticmethod
    def predict_for_user(user, group, pred, items, k):
        """
        Generates top-k item recommendations for a single user.

        - Parameters:
        user (int): Encoded user ID.
        group (pandas.DataFrame): Grouped DataFrame for the user.
        pred (numpy.ndarray): Predicted scores for all items for the user.
        items (numpy.ndarray): Encoded item IDs to consider for recommendations.
        k (int): Number of top recommendations to return.

        - Returns:
        pandas.DataFrame: DataFrame containing user_id, item_id, and predicted scores.
        """
        watched = set(group['ci'])
        candidates = [item for item in items if item not in watched]
        pred = np.take(pred, candidates)

        # Scale the predictions to the original rating range (0 to 5)
        min_pred, max_pred = np.min(pred), np.max(pred)
        if max_pred != min_pred:
            pred = 5 * (pred - min_pred) / (max_pred - min_pred)
        else:
            pred = np.full_like(pred, 2.5)  # Set to midpoint of the range if all values are the same
        

        res = np.argpartition(pred, -k)[-k:]
        r = pd.DataFrame(
            {
                "user_id": [user] * len(res),
                "item_id": np.take(candidates, res),
                "score": np.take(pred, res),
            }
        ).sort_values('score', ascending=False)
        return r

In [10]:
# construct user-item mattrix
# with columns user_id, item_id and (rating)

user_item_matrix = ratings_df_slim.drop(columns=['RatingID','Vintage','Date'])

print(user_item_matrix.head())

    UserID  WineID  Rating
0  1356810  103471     4.5
1  1173759  111415     5.0
2  1164877  111395     5.0
3  1207665  111433     5.0
4  1075841  111431     5.0


In [20]:
# Path to the precomputed similarity matrix
similarity_matrix_path = 'wine_similarities.npy'

# Initialize and fit the model
ease = EASE()
ease.fit(user_item_matrix, similarity_matrix_path, implicit=False)
print('data fitted')

users = user_item_matrix['UserID'].unique()
items = user_item_matrix['WineID'].unique()

predictions = ease.predict(user_item_matrix, users, items, 5)

data fitted


In [22]:
print(predictions.to_markdown())

|    |     user_id |   item_id |   score |
|---:|------------:|----------:|--------:|
|  2 | 1e+06       |    112834 | 5       |
|  4 | 1e+06       |    113600 | 4.59021 |
|  1 | 1e+06       |    113489 | 4.47686 |
|  3 | 1e+06       |    113288 | 4.28875 |
|  0 | 1e+06       |    111479 | 4.13407 |
|  2 | 1.00001e+06 |    179059 | 5       |
|  4 | 1.00001e+06 |    184745 | 5       |
|  3 | 1.00001e+06 |    179855 | 4.66276 |
|  1 | 1.00001e+06 |    179058 | 4.62287 |
|  0 | 1.00001e+06 |    182081 | 4.49538 |
|  4 | 1.00002e+06 |    113600 | 5       |
|  2 | 1.00002e+06 |    111461 | 4.82353 |
|  3 | 1.00002e+06 |    111479 | 4.78457 |
|  1 | 1.00002e+06 |    113288 | 4.66297 |
|  0 | 1.00002e+06 |    113344 | 4.62437 |
|  4 | 1.00002e+06 |    112229 | 5       |
|  3 | 1.00002e+06 |    112695 | 4.95178 |
|  2 | 1.00002e+06 |    113489 | 4.9176  |
|  1 | 1.00002e+06 |    111967 | 4.85634 |
|  0 | 1.00002e+06 |    111667 | 4.78283 |
|  4 | 1.00002e+06 |    112695 | 5       |
|  3 | 1.00

In [4]:

ease = EASE()
ease.fit(user_item_matrix, lambda_=0.5, implicit=False)
print('data fitted')

users = user_item_matrix['UserID'].unique()
items = user_item_matrix['WineID'].unique()

print(ease.pred.shape)


predictions = ease.predict(user_item_matrix, users, items, 5)
print(predictions)



data fitted
(10561, 1007)
    user_id  item_id     score
3   1000004   135860  5.000000
4   1000004   162497  4.738203
2   1000004   179012  4.474685
1   1000004   111479  4.254400
0   1000004   167443  3.947609
..      ...      ...       ...
3   2062618   111421  5.000000
4   2062618   135860  4.850239
2   2062618   111468  4.727079
1   2062618   111448  4.603914
0   2062618   111484  4.271333

[52805 rows x 3 columns]


In [5]:
# Evaluate the recommendations
from sklearn.metrics import mean_squared_error

# TODO
test_set = 

# Flatten the matrices and remove NaN values for comparison
predicted_flat = predicted_ratings.flatten()
actual_flat = test_set.flatten()

# Remove NaN values
mask = ~np.isnan(actual_flat)
predicted_flat = predicted_flat[mask]
actual_flat = actual_flat[mask]

# Calculate MSE or RMSE
mse = mean_squared_error(actual_flat, predicted_flat)
rmse = np.sqrt(mse)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')

SyntaxError: invalid syntax (2541029606.py, line 5)