In [None]:
from google.colab import drive
import pandas as pd
import numpy as np

# Expose the Drive folder that holds the datasets to this notebook
drive.mount('/content/drive')

# Steps 1-3: read the training reviews, keep only the
# (user, beer, taste rating) triple, and discard rows where any of the
# three values is missing
file_path = '/content/drive/MyDrive/Recommender/Beer_taste_Reviews_1M_trainingset.csv'
rating_columns = ['review_profilename', 'beer_beerid', 'review_taste']
df = pd.read_csv(file_path)[rating_columns].dropna()

# Step 4: Encode user and item IDs
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import coo_matrix

# Learn one label -> integer index mapping per ID space; the fitted encoders
# are kept so the same mapping can be applied to other data later
user_encoder = LabelEncoder().fit(df['review_profilename'])
item_encoder = LabelEncoder().fit(df['beer_beerid'])

df['user_idx'] = user_encoder.transform(df['review_profilename'])
df['item_idx'] = item_encoder.transform(df['beer_beerid'])

# Step 5: Build the sparse user-item matrix (rows = users, columns = items,
# stored values = taste ratings)
rating_values = df['review_taste'].to_numpy()
row_index = df['user_idx'].to_numpy()
col_index = df['item_idx'].to_numpy()
R_sparse = coo_matrix((rating_values, (row_index, col_index)))

# Step 6: Initialize SGD-based Matrix Factorization
def sgd_step_sparse(R_sparse, P, Q, learning_rate, lambda_reg, n_factors):
    """Run one SGD pass over every stored rating, updating P and Q in place.

    For each stored entry (u, i, r_ui) the prediction error
    ``r_ui - P[u] . Q[i]`` is computed and both factor vectors take one
    L2-regularised gradient step.

    Args:
        R_sparse: SciPy sparse rating matrix (rows = users, columns = items).
        P: Float array of user factors, one row per user. Modified in place.
        Q: Float array of item factors, one row per item. Modified in place.
        learning_rate: Step size applied to each gradient.
        lambda_reg: L2 regularisation strength.
        n_factors: Unused here; kept so existing callers keep working.

    Returns:
        The same ``(P, Q)`` array objects that were passed in.
    """
    ratings = R_sparse.tocoo()  # COO exposes parallel row/col/data arrays
    for u, i, r_ui in zip(ratings.row, ratings.col, ratings.data):
        # Snapshot the user vector: the item update below must use the value
        # from *before* this step, otherwise the two gradients are evaluated
        # at different points.
        p_u = P[u, :].copy()
        q_i = Q[i, :]

        # Prediction error for this rating
        error_ui = r_ui - np.dot(p_u, q_i)

        # Simultaneous update of both latent vectors
        P[u, :] += learning_rate * (error_ui * q_i - lambda_reg * p_u)
        Q[i, :] += learning_rate * (error_ui * p_u - lambda_reg * q_i)

    return P, Q

def train_sgd_sparse(R_sparse, P, Q, learning_rate, lambda_reg, n_factors, n_iterations):
    """Train the factor matrices by repeating full SGD passes.

    Calls ``sgd_step_sparse`` ``n_iterations`` times, feeding each pass the
    factors returned by the previous one, and prints a progress line after
    every pass.

    Returns:
        The ``(P, Q)`` pair returned by the final pass (the inputs unchanged
        if ``n_iterations`` is 0).
    """
    for epoch in range(1, n_iterations + 1):
        P, Q = sgd_step_sparse(
            R_sparse, P, Q, learning_rate, lambda_reg, n_factors
        )
        print(f"Iteration {epoch}/{n_iterations} complete.")
    return P, Q

# Hyperparameters for the SGD matrix factorization
learning_rate = 0.01  # step size of each gradient update
lambda_reg = 0.05     # L2 regularisation strength
n_factors = 5         # number of latent dimensions per user/item
n_iterations = 20     # number of full passes over the ratings

# Step 7: Draw the initial latent factors uniformly from [0, 1):
# one row of P per user, one row of Q per item
n_users, n_items = R_sparse.shape
P = np.random.rand(n_users, n_factors)
Q = np.random.rand(n_items, n_factors)

# Step 8: Fit the factors with SGD
P_trained_sgd, Q_trained_sgd = train_sgd_sparse(
    R_sparse, P, Q, learning_rate, lambda_reg, n_factors, n_iterations
)

# Step 9: Load the test dataset
file_path2 = '/content/drive/MyDrive/Recommender/Beer_taste_Reviews_1M_testset.csv'
test_df = pd.read_csv(file_path2)

# Encode users with the encoder fitted on the training data.
# NOTE(review): per the scikit-learn docs, transform() rejects labels it was
# not fitted on, so a test user absent from training stops the run here.
test_df['user_idx'] = user_encoder.transform(test_df['review_profilename'])

known_items = item_encoder.classes_
unknown_items = set(test_df['beer_beerid']) - set(known_items)

# Encode items with one lookup table built once. An item's encoded index is
# its position in the encoder's classes_ array; items that never appeared in
# training get the sentinel index -1. The column is always created, so later
# steps do not depend on whether the test set happens to contain unseen items.
item_index_by_id = {item_id: idx for idx, item_id in enumerate(known_items)}
test_df['item_idx'] = (
    test_df['beer_beerid'].map(item_index_by_id).fillna(-1).astype(int)
)

# Step 10: Predict ratings for the test data
def predict_for_test_data(test_df, P, Q):
    """Predict one rating per row of ``test_df`` from the latent factors.

    Each prediction is the dot product of the row's user vector and item
    vector. Only the requested (user, item) pairs are evaluated; the full
    users x items prediction matrix is never built.

    A negative index marks a user or item without a trained factor vector.
    NumPy would otherwise read a negative index as "count from the end" and
    silently return another entity's factors, so such rows use the mean
    factor vector of the corresponding matrix as a neutral stand-in.

    Args:
        test_df: DataFrame with integer ``user_idx`` and ``item_idx`` columns.
        P: User factor matrix, one row per user.
        Q: Item factor matrix, one row per item.

    Returns:
        1-D float array of predictions aligned with the rows of ``test_df``.
    """
    P = np.asarray(P, dtype=float)
    Q = np.asarray(Q, dtype=float)
    user_idx = test_df['user_idx'].to_numpy()
    item_idx = test_df['item_idx'].to_numpy()

    unknown_user = user_idx < 0
    unknown_item = item_idx < 0

    # Fancy indexing returns copies, so patching these rows leaves P/Q intact.
    # Unknown entries are gathered from row 0 as a placeholder, then replaced.
    user_vecs = P[np.where(unknown_user, 0, user_idx)]
    item_vecs = Q[np.where(unknown_item, 0, item_idx)]
    if unknown_user.any():
        user_vecs[unknown_user] = P.mean(axis=0)
    if unknown_item.any():
        item_vecs[unknown_item] = Q.mean(axis=0)

    # Row-wise dot product: one prediction per test row
    predicted_ratings = (user_vecs * item_vecs).sum(axis=1)
    return predicted_ratings

# Score every test row with the trained user and item factors
predicted_ratings_test_sgd = predict_for_test_data(
    test_df=test_df, P=P_trained_sgd, Q=Q_trained_sgd
)

# Step 11: Attach the predictions to the test rows as a new column
test_df['predicted_taste'] = predicted_ratings_test_sgd

# Step 12: Write the test rows plus predictions to Drive (row index omitted)
output_file = '/content/drive/MyDrive/Recommender/Part2_File1_PredictedRatings_Group[7].csv'
test_df.to_csv(path_or_buf=output_file, index=False)

# Step 13: RMSE Calculation for Test Data
def calculate_rmse_test_data(test_df, predicted_ratings):
    """Return the root-mean-square error of the predictions.

    Args:
        test_df: DataFrame whose ``review_taste`` column holds the actual
            ratings.
        predicted_ratings: Predictions aligned row-for-row with ``test_df``.

    Returns:
        Square root of the mean squared difference between actual and
        predicted ratings.
    """
    residuals = test_df['review_taste'].values - predicted_ratings
    return np.sqrt(np.mean(np.square(residuals)))

# Evaluate the predictions against the actual test ratings
rmse_test_sgd = calculate_rmse_test_data(
    test_df=test_df, predicted_ratings=predicted_ratings_test_sgd
)

# Step 14: Persist the score (4 decimal places) to a text file on Drive
rmse_file = '/content/drive/MyDrive/Recommender/Part2_File2_RMSE_Group[7].txt'
rmse_report = f'RMSE: {rmse_test_sgd:.4f}'
with open(rmse_file, mode='w') as rmse_out:
    rmse_out.write(rmse_report)

# Echo the same score in the notebook output
print(f'RMSE on Test Data with SGD: {rmse_test_sgd:.4f}')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Iteration 1/20 complete.
Iteration 2/20 complete.
Iteration 3/20 complete.
Iteration 4/20 complete.
Iteration 5/20 complete.
Iteration 6/20 complete.
Iteration 7/20 complete.
Iteration 8/20 complete.
Iteration 9/20 complete.
Iteration 10/20 complete.
Iteration 11/20 complete.
Iteration 12/20 complete.
Iteration 13/20 complete.
Iteration 14/20 complete.
Iteration 15/20 complete.
Iteration 16/20 complete.
Iteration 17/20 complete.
Iteration 18/20 complete.
Iteration 19/20 complete.
Iteration 20/20 complete.
RMSE on Test Data with SGD: 0.5460
