In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [27]:
import pandas as pd

file_path = '/content/drive/MyDrive/Recommender/Beer_taste_Reviews_1M_trainingset.csv'
df = pd.read_csv(file_path)


In [28]:
df.shape

(1047592, 10)

In [29]:
df = df[['review_profilename', 'beer_beerid', 'review_taste']]
df.shape



(1047592, 3)

In [30]:
df.dropna(inplace=True)
df.shape

(1047357, 3)

In [31]:
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import coo_matrix

# Encode user and beer IDs
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

df['user_idx'] = user_encoder.fit_transform(df['review_profilename'])
df['item_idx'] = item_encoder.fit_transform(df['beer_beerid'])

# Create a sparse matrix for user-item ratings
R_sparse = coo_matrix((df['review_taste'], (df['user_idx'], df['item_idx'])))


In [37]:
for i in range(5):
    print(R_sparse.row[i], R_sparse.col[i], R_sparse.data[i])

171 129 4.0
40 24810 5.0
150 71 4.0
150 216 3.5
150 876 4.5


In [43]:
def als_step_sparse(R_sparse, P, Q, lambda_reg, n_factors):
    # Convert sparse matrix to CSR format for efficient row-wise access
    R_csr = R_sparse.tocsr() # Change to CSR format

    # Update user matrix P
    for u in range(R_sparse.shape[0]):
        indices = R_csr[u].indices  # Find rated items for user u using CSR indexing
        Q_i = Q[indices, :]
        R_u = R_csr[u, indices].toarray().flatten()
        P[u, :] = np.linalg.solve(np.dot(Q_i.T, Q_i) + lambda_reg * np.eye(n_factors),
                                  np.dot(Q_i.T, R_u))

    # Convert sparse matrix to CSC format for efficient column-wise access
    R_csc = R_sparse.tocsc() # Change to CSC format

    # Update item matrix Q
    for i in range(R_sparse.shape[1]):
        indices = R_csc[:, i].indices  # Find users who rated this item using CSC indexing
        P_u = P[indices, :]
        R_i = R_csc[indices, i].toarray().flatten()
        Q[i, :] = np.linalg.solve(np.dot(P_u.T, P_u) + lambda_reg * np.eye(n_factors),
                                  np.dot(P_u.T, R_i))

    return P, Q

In [44]:
def train_als_sparse(R_sparse, P, Q, lambda_reg, n_factors, n_iterations):
    for iteration in range(n_iterations):
        P, Q = als_step_sparse(R_sparse, P, Q, lambda_reg, n_factors)
        # You can compute error or skip if it's too costly
        print(f"Iteration {iteration+1}/{n_iterations} complete.")
    return P, Q

# Define parameters
lambda_reg = 0.1
n_factors = 10
n_iterations = 20

# Initialize random latent factors for P (users) and Q (items)
P = np.random.rand(R_sparse.shape[0], n_factors)
Q = np.random.rand(R_sparse.shape[1], n_factors)

# Train ALS
P_trained, Q_trained = train_als_sparse(R_sparse, P, Q, lambda_reg, n_factors, n_iterations)

Iteration 1/20 complete.
Iteration 2/20 complete.
Iteration 3/20 complete.
Iteration 4/20 complete.
Iteration 5/20 complete.
Iteration 6/20 complete.
Iteration 7/20 complete.
Iteration 8/20 complete.
Iteration 9/20 complete.
Iteration 10/20 complete.
Iteration 11/20 complete.
Iteration 12/20 complete.
Iteration 13/20 complete.
Iteration 14/20 complete.
Iteration 15/20 complete.
Iteration 16/20 complete.
Iteration 17/20 complete.
Iteration 18/20 complete.
Iteration 19/20 complete.
Iteration 20/20 complete.


**Predict for Sparse Data**

In [45]:
def predict_sparse(R_sparse, P, Q):
    R_coo = R_sparse.tocoo()
    predictions = []
    for u, i in zip(R_coo.row, R_coo.col):
        pred_rating = np.dot(P[u, :], Q[i, :].T)
        predictions.append(pred_rating)
    return predictions

# Get predicted ratings for non-zero interactions
predicted_ratings = predict_sparse(R_sparse, P_trained, Q_trained)


In [46]:
predicted_ratings[:5]

[3.8755446043278825,
 4.80664823142946,
 4.025245906846221,
 3.501057894749321,
 4.35720328673041]

In [47]:
import numpy as np
from scipy.sparse import coo_matrix

def predict_sparse(R_sparse, P, Q):
    R_pred = np.dot(P, Q.T)
    return R_pred

In [48]:
def calculate_rmse(R_sparse, P, Q):
    R_pred = predict_sparse(R_sparse, P, Q)
    R_coo = R_sparse.tocoo()  # COO format to access data easily

    # Get the actual ratings and predicted ratings for non-zero entries
    actual_ratings = R_coo.data
    predicted_ratings = R_pred[R_coo.row, R_coo.col]

    # Calculate RMSE
    mse = np.mean((actual_ratings - predicted_ratings) ** 2)
    rmse = np.sqrt(mse)

    return rmse

In [49]:
# Assuming P and Q are your trained matrices, and R_sparse is your sparse matrix
rmse = calculate_rmse(R_sparse, P_trained, Q_trained)
print(f'RMSE: {rmse:.4f}')

RMSE: 0.5390


**Testing with testset.csv**

In [54]:
file_path2 = '/content/drive/MyDrive/Recommender/Beer_taste_Reviews_1M_testset.csv'
test_df = pd.read_csv(file_path2)

# Encode users and items in the testing dataset using the same encoders as the training data
test_df['user_idx'] = user_encoder.transform(test_df['review_profilename'])

known_items = item_encoder.classes_

# Find unknown item IDs in the test set
unknown_items = set(test_df['beer_beerid']) - set(known_items)

# Handle unknown items (e.g., assign a default index or remove them)
if unknown_items:
    # Option 1: Assign a default index (e.g., -1)
    test_df['item_idx'] = test_df['beer_beerid'].apply(lambda x: item_encoder.transform([x])[0] if x in known_items else -1)


In [57]:
def predict_for_test_data(test_df, P, Q):
    # Use the trained matrices P and Q to predict ratings
    R_pred = np.dot(P, Q.T)

    # Extract predicted ratings for the user-item pairs in the test dataset
    predicted_ratings = R_pred[test_df['user_idx'].values, test_df['item_idx'].values]
    return predicted_ratings

# Generate predictions for the test data
predicted_ratings = predict_for_test_data(test_df, P_trained, Q_trained)

In [58]:
def calculate_rmse_test_data(test_df, predicted_ratings):
    # Actual ratings from the test dataset
    actual_ratings = test_df['review_taste'].values

    # Calculate RMSE
    mse = np.mean((actual_ratings - predicted_ratings) ** 2)
    rmse = np.sqrt(mse)

    return rmse

# Calculate RMSE for the test data
rmse_test = calculate_rmse_test_data(test_df, predicted_ratings)
print(f'RMSE on Test Data: {rmse_test:.4f}')

RMSE on Test Data: 0.6330
