In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.preprocessing import LabelEncoder

# Mount Google Drive
# NOTE: the paths below live under /content/drive, so Drive must already be
# mounted; uncomment the next line when running on a fresh Colab runtime.
# drive.mount('/content/drive')

# Load training set
train_file_path = '/content/drive/MyDrive/Recommender/Beer_taste_Reviews_1M_trainingset.csv'
df = pd.read_csv(train_file_path)

# Preprocess training data
# Keep only the three columns the model needs and drop incomplete rows.
# Doing this as one chain that ends in .copy() gives `df` its own data, which
# avoids the SettingWithCopyWarning raised by `dropna(inplace=True)` on a
# column subset and keeps the column assignments below warning-free.
df = df[['review_profilename', 'beer_beerid', 'review_taste']].dropna().copy()

# Encode user and beer IDs
# LabelEncoder maps each distinct id to a contiguous integer 0..n-1; those
# integers are used as row/column positions of the rating matrix.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

df['user_idx'] = user_encoder.fit_transform(df['review_profilename'])
df['item_idx'] = item_encoder.fit_transform(df['beer_beerid'])

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


Unnamed: 0,review_profilename,beer_beerid,review_taste,user_idx,item_idx
0,7,245,4.0,171,129
1,187,47364,5.0,40,24810
2,585,129,4.0,150,71
3,585,429,3.5,150,216
4,585,1696,4.5,150,876


In [None]:
# Create a sparse matrix for user-item ratings
# A COO matrix keeps duplicate (row, col) entries and they are *summed* when it
# is converted to CSR/CSC, so a user who reviewed the same beer twice would end
# up with a rating above the scale. Average repeated reviews before building.
ratings = df.groupby(['user_idx', 'item_idx'], as_index=False)['review_taste'].mean()

# Pass the shape explicitly so it always matches the encoders' vocabularies
# instead of being inferred from the largest index present.
R_sparse = coo_matrix(
    (
        ratings['review_taste'].to_numpy(),
        (ratings['user_idx'].to_numpy(), ratings['item_idx'].to_numpy()),
    ),
    shape=(len(user_encoder.classes_), len(item_encoder.classes_)),
)

print(f"Number of users: {R_sparse.shape[0]}")
print(f"Number of items: {R_sparse.shape[1]}")

print(R_sparse)

In [None]:
# ALS Step Function
def als_step_sparse(R_sparse, P, Q, lambda_reg, n_factors):
    """Run one alternating-least-squares sweep over the stored ratings.

    First every row of ``P`` is re-solved with ``Q`` held fixed, then every row
    of ``Q`` is re-solved with the freshly updated ``P`` held fixed. Each solve
    is a ridge-regularised least-squares problem restricted to the entries that
    are stored in ``R_sparse``.

    Parameters
    ----------
    R_sparse : scipy sparse matrix, shape (n_users, n_items)
        Rating matrix; only stored entries take part in the fit.
    P : ndarray, shape (n_users, n_factors)
        User factors. Updated in place.
    Q : ndarray, shape (n_items, n_factors)
        Item factors. Updated in place.
    lambda_reg : float
        Ridge penalty added to the diagonal of each normal-equation matrix.
    n_factors : int
        Number of latent factors; must equal the second dimension of P and Q.

    Returns
    -------
    (P, Q) : the same two arrays that were passed in, after the update.
    """
    # The regularisation term is identical for every solve, so build it once.
    ridge = lambda_reg * np.eye(n_factors)

    # --- user step: read each row straight from the CSR buffers -------------
    # indptr[u]:indptr[u + 1] delimits row u inside `indices` (columns) and
    # `data` (ratings); this avoids creating a new sparse matrix per row.
    R_csr = R_sparse.tocsr()
    for u in range(R_csr.shape[0]):
        start, end = R_csr.indptr[u], R_csr.indptr[u + 1]
        if start == end:
            # No ratings: the ridge solution is the zero vector. Setting it
            # directly also avoids a singular solve when lambda_reg == 0.
            P[u, :] = 0.0
            continue
        Q_i = Q[R_csr.indices[start:end], :]
        R_u = R_csr.data[start:end]
        P[u, :] = np.linalg.solve(Q_i.T @ Q_i + ridge, Q_i.T @ R_u)

    # --- item step: same idea on the CSC layout (one slice per column) ------
    R_csc = R_sparse.tocsc()
    for i in range(R_csc.shape[1]):
        start, end = R_csc.indptr[i], R_csc.indptr[i + 1]
        if start == end:
            Q[i, :] = 0.0
            continue
        P_u = P[R_csc.indices[start:end], :]
        R_i = R_csc.data[start:end]
        Q[i, :] = np.linalg.solve(P_u.T @ P_u + ridge, P_u.T @ R_i)

    return P, Q

# Train ALS function
def train_als_sparse(R_sparse, P, Q, lambda_reg, n_factors, n_iterations):
    """Apply ``als_step_sparse`` ``n_iterations`` times and return the factors.

    The factor matrices returned by one sweep are fed into the next, and a
    progress line is printed after every sweep. With ``n_iterations == 0`` the
    inputs are returned untouched.
    """
    user_factors, item_factors = P, Q
    for iteration in range(n_iterations):
        user_factors, item_factors = als_step_sparse(
            R_sparse, user_factors, item_factors, lambda_reg, n_factors
        )
        print(f"Iteration {iteration+1}/{n_iterations} complete.")
    return user_factors, item_factors

In [None]:
# Initialize ALS parameters
lambda_reg = 0.05   # ridge penalty used in every least-squares solve
n_factors = 5       # number of latent factors per user / item
n_iterations = 1    # number of full ALS sweeps (user step + item step)

# Seed the initialisation so that Restart & Run All reproduces the same
# factors, predictions and RMSE. Values are uniform in [0, 1), as before.
SEED = 42
rng = np.random.default_rng(SEED)

P = rng.random((R_sparse.shape[0], n_factors))
Q = rng.random((R_sparse.shape[1], n_factors))

# Train the ALS model
P_trained, Q_trained = train_als_sparse(R_sparse, P, Q, lambda_reg, n_factors, n_iterations)

Iteration 1/1 complete.


In [None]:
# Predict function for test data
def predict_for_test_data(test_df, P, Q, fallback=None):
    """Predict a rating for every (user_idx, item_idx) row of ``test_df``.

    Only the requested pairs are evaluated (one dot product per row), so the
    full users x items matrix is never materialised.

    Parameters
    ----------
    test_df : DataFrame
        Must contain integer columns ``user_idx`` and ``item_idx``. A negative
        value in either column marks a user/item that has no factor row.
    P : ndarray, shape (n_users, n_factors)
        User factors.
    Q : ndarray, shape (n_items, n_factors)
        Item factors.
    fallback : float, optional
        Value returned for rows with a negative index. When omitted, the mean
        of the predictions for the valid rows is used (NaN if there are none).

    Returns
    -------
    ndarray of shape (len(test_df),) with one prediction per row.
    """
    user_idx = test_df['user_idx'].values
    item_idx = test_df['item_idx'].values

    # Negative positions would silently wrap around to the *last* user/item
    # under NumPy indexing, so they are excluded and filled in afterwards.
    valid = (user_idx >= 0) & (item_idx >= 0)

    # Row-wise dot product: sum_k P[u, k] * Q[i, k] for each requested pair.
    dots = (P[user_idx[valid]] * Q[item_idx[valid]]).sum(axis=1)
    if valid.all():
        return dots

    if fallback is None:
        fallback = dots.mean() if dots.size else np.nan
    predicted_ratings = np.full(valid.shape, fallback, dtype=float)
    predicted_ratings[valid] = dots
    return predicted_ratings

In [None]:
# Load test set
test_file_path = '/content/drive/MyDrive/Recommender/Beer_taste_Reviews_1M_testset.csv'
test_df = pd.read_csv(test_file_path)

# Encode test set using the same encoders as training data.
# Index.get_indexer maps the whole column in one vectorised pass and returns
# -1 for ids that were not seen during training, so unseen users no longer
# raise and unseen items no longer need a per-row transform call.
known_items = item_encoder.classes_
unknown_items = set(test_df['beer_beerid']) - set(known_items)

test_df['user_idx'] = pd.Index(user_encoder.classes_).get_indexer(test_df['review_profilename'])
test_df['item_idx'] = pd.Index(known_items).get_indexer(test_df['beer_beerid'])

# Rows whose user and item both have trained factors
known_pairs = ((test_df['user_idx'] >= 0) & (test_df['item_idx'] >= 0)).to_numpy()
print(f"Test rows without a trained user or item: {int((~known_pairs).sum())} of {len(test_df)}")

# Generate predictions for the test set
raw_predictions = predict_for_test_data(test_df, P_trained, Q_trained)

# Handle unknown users/items: an index of -1 has no factor row (as a NumPy
# position it would point at the last row), so those rows get the mean
# training rating instead of a model prediction.
global_mean_taste = df['review_taste'].mean()
predicted_ratings = np.where(known_pairs, raw_predictions, global_mean_taste)

# print(predicted_ratings)

# Add predictions to test DataFrame
test_df['predicted_taste'] = predicted_ratings

In [None]:
# Save the result to CSV
# Writes every column of test_df (including the prediction column) without
# the DataFrame index.
output_file = '/content/drive/MyDrive/Recommender/Part1_File1_PredictedRatings_Group[7].csv'
test_df.to_csv(path_or_buf=output_file, index=False)


In [None]:
# Calculate RMSE for test data
def calculate_rmse_test_data(test_df, predicted_ratings):
    """Return the root-mean-square error of ``predicted_ratings``.

    The reference values are taken from the ``review_taste`` column of
    ``test_df``; ``predicted_ratings`` must line up with it element by element.
    """
    residuals = test_df['review_taste'].values - predicted_ratings
    return np.sqrt(np.mean(np.square(residuals)))

# Calculate and save RMSE
# The score is written to a text file on Drive and echoed to the cell output,
# both rounded to four decimals.
rmse_file = '/content/drive/MyDrive/Recommender/Part1_File2_RMSE_Group[7].txt'
rmse_test = calculate_rmse_test_data(test_df, predicted_ratings)

with open(rmse_file, mode='w') as f:
    f.write(f'RMSE: {rmse_test:.4f}')

print(f'RMSE on Test Data: {rmse_test:.4f}')