# In [28]:
# Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the Datasets
# The training CSV is read first, then the test CSV; both become DataFrames.
train_file_path = '/content/drive/MyDrive/Recommender/Beer_taste_Reviews_1M_trainingset.csv'
test_file_path = '/content/drive/MyDrive/Recommender/Beer_taste_Reviews_1M_testset.csv'

train_data, test_data = map(pd.read_csv, (train_file_path, test_file_path))

# Encode Users and Items
def encode_data(train, test):
    """Attach integer ``user_idx`` / ``item_idx`` columns to both frames.

    The label encoders are fitted on the training frame only. The test
    frame is then mapped through the training assignments, so a test row
    whose user or beer never occurs in training gets NaN in that column.

    Both frames are modified in place and are also returned.
    """
    id_columns = {
        'user_idx': 'review_profilename',
        'item_idx': 'beer_beerid',
    }

    # Fit a fresh encoder per id column on the training data.
    for idx_column, raw_column in id_columns.items():
        train[idx_column] = LabelEncoder().fit_transform(train[raw_column])

    # Reuse the training assignments as a lookup table for the test rows.
    for idx_column, raw_column in id_columns.items():
        lookup = dict(zip(train[raw_column], train[idx_column]))
        test[idx_column] = test[raw_column].map(lookup)

    return train, test

# Add the integer user/item index columns to both frames.
train_data, test_data = encode_data(train=train_data, test=test_data)

# Create User-Item Matrix
# One row per user_idx, one column per item_idx, cells hold review_taste.
# pivot_table's default aggregation (mean) collapses repeated user/item
# pairs; pairs with no review stay NaN.
train_matrix = pd.pivot_table(
    train_data,
    values='review_taste',
    index='user_idx',
    columns='item_idx',
)

# Compute Cosine Similarity
def calculate_cosine_similarity(matrix):
    """Return the item-item cosine similarity of a user-item matrix.

    Missing ratings (NaN) are treated as 0 before the similarity is
    computed, and each item's similarity with itself is set to 0.

    Args:
        matrix: DataFrame with one row per user and one column per item.

    Returns:
        A square DataFrame indexed and labelled by ``matrix.columns``.
        An item whose rating column has zero norm gets similarity 0 with
        every other item instead of NaN.
    """
    ratings = matrix.fillna(0).to_numpy(dtype=float)

    # Column-wise norms: one value per item.
    norms = np.linalg.norm(ratings, axis=0)
    dot_products = ratings.T @ ratings
    norm_products = np.outer(norms, norms)

    # Guarded division: a zero-norm column would otherwise yield 0/0 = NaN,
    # which poisons every prediction that touches that item.
    similarity = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products != 0,
    )

    # Remove self-similarity so an item never votes for itself.
    np.fill_diagonal(similarity, 0)
    return pd.DataFrame(similarity, index=matrix.columns, columns=matrix.columns)

# Item-item similarity table derived from the training user-item matrix.
item_similarity_matrix = calculate_cosine_similarity(matrix=train_matrix)

# Predict Ratings
def predict_rating(user_id, item_id, similarity_matrix=None, ratings_matrix=None):
    """Predict a user's rating of an item from the user's other ratings.

    The prediction is the average of the ratings the user has already
    given, weighted by each rated item's similarity to ``item_id``. The
    target item itself is excluded from the weights.

    Args:
        user_id: Row label of the user in the ratings matrix.
        item_id: Label of the item in the similarity matrix.
        similarity_matrix: Optional square item-item similarity DataFrame.
            Defaults to the module-level ``item_similarity_matrix``.
        ratings_matrix: Optional user-item ratings DataFrame with NaN for
            unrated pairs. Defaults to the module-level ``train_matrix``.

    Returns:
        The predicted rating. When the similarity weights sum to zero the
        user's mean rating is returned instead. NaN is returned when the
        user or the item is not present in the matrices.
    """
    if similarity_matrix is None:
        similarity_matrix = item_similarity_matrix
    if ratings_matrix is None:
        ratings_matrix = train_matrix

    if item_id not in similarity_matrix.index or user_id not in ratings_matrix.index:
        return np.nan

    # Similarities to every other item; no sorting is needed because only
    # the sums below are used.
    similarities = similarity_matrix[item_id].drop(item_id)
    user_ratings = ratings_matrix.loc[user_id].dropna()

    # Restrict both series to the items this user rated, then work on
    # plain arrays so the sums run in NumPy rather than a Python loop.
    rated_items = user_ratings.index.intersection(similarities.index)
    weights = similarities.loc[rated_items].to_numpy(dtype=float)
    ratings = user_ratings.loc[rated_items].to_numpy(dtype=float)

    denominator = weights.sum()
    if denominator == 0:
        # No rated item has any similarity to the target: use the user's mean.
        return user_ratings.mean()
    return np.dot(weights, ratings) / denominator

# Generate Predictions for Test Data
def _predict_for_row(row):
    """Predict the taste rating for one test row from its encoded ids."""
    return predict_rating(row['user_idx'], row['item_idx'])


test_data['predicted_review_taste'] = test_data.apply(_predict_for_row, axis=1)

# Calculate RMSE
def calculate_rmse(actual, predicted):
    """Return the root-mean-square error between two rating sequences.

    Pairs in which either value is NaN are ignored, so rows without a
    prediction do not affect the score.

    Args:
        actual: Array-like of true ratings (list, NumPy array, or Series).
        predicted: Array-like of predicted ratings, same length as
            ``actual``.

    Returns:
        The RMSE over the valid pairs, or NaN when no pair has both
        values present.
    """
    # Coerce up front so plain lists can be boolean-indexed like arrays.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    valid_mask = ~(np.isnan(actual) | np.isnan(predicted))
    if not valid_mask.any():
        # Nothing to score; skip NumPy's warning-emitting empty mean.
        return np.nan

    errors = actual[valid_mask] - predicted[valid_mask]
    return np.sqrt(np.mean(errors ** 2))

# Score the predictions against the test-set ratings.
rmse_value = calculate_rmse(
    actual=test_data['review_taste'].values,
    predicted=test_data['predicted_review_taste'].values,
)

# Save Predictions and RMSE to Files
output_columns = [
    'brewery_id',
    'beer_beerid',
    'review_profilename',
    'review_taste',
    'predicted_review_taste',
]
predictions_out = test_data[output_columns]
predictions_out.to_csv('Part1_File1_PredictedRatings_Group[7].csv', index=False)

# The RMSE file holds a single line, formatted to four decimals.
with open('Part1_File2_RMSE_Group[7].txt', 'w') as rmse_file:
    rmse_file.write(f'RMSE: {rmse_value:.4f}')

print(f'RMSE on Test Data: {rmse_value:.4f}')


# Output: RMSE on Test Data: 0.6551
