In [4]:
import json
from scipy.sparse import csr_matrix

# Load the data splits
with open('train_data.json', 'r') as json_file:
    train_data = json.load(json_file)

# Function to create the user-movie matrix with the new data format
def create_user_movie_matrix(data):
    rows = []
    cols = []
    vals = []

    user_ids = set()
    movie_ids = set()

    for user in data:
        user_id = user['id']
        for rating in user['ratings']:
            rows.append(user_id)
            cols.append(rating['movie_id'])
            vals.append(rating['rating'])
            user_ids.add(user_id)
            movie_ids.add(rating['movie_id'])

    user_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
    movie_index = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}
    
    row_indices = [user_index[user_id] for user_id in rows]
    col_indices = [movie_index[movie_id] for movie_id in cols]

    user_movie_matrix = csr_matrix((vals, (row_indices, col_indices)), shape=(len(user_index), len(movie_index)))

    return user_movie_matrix, user_index, movie_index

# Create train matrix
train_matrix, train_user_index, train_movie_index = create_user_movie_matrix(train_data)

# Debug prints to verify the results
#print("User Index Mapping:", train_user_index)
#print("Movie Index Mapping:", train_movie_index)
#print("Shape of user_movie_matrix:", train_matrix.shape)
#print("Non-zero entries in user_movie_matrix:", train_matrix.nnz)

In [5]:
def pgraph_rag_neighbors_ratings_only(user_id, movie_id, user_movie_matrix, user_index, movie_index, limit=None):
    # Retrieve the movie index
    movie_idx = movie_index.get(movie_id)
    if movie_idx is None:
        return []
    
    # Get the user indices who reviewed the movie
    user_indices = user_movie_matrix[:, movie_idx].nonzero()[0]
    
    # Get the corresponding user IDs and their ratings for the movie
    neighbor_ratings = [(list(user_index.keys())[list(user_index.values()).index(idx)], user_movie_matrix[idx, movie_idx]) for idx in user_indices]
    
    # If limit is specified, return only that many results
    if limit is not None:
        neighbor_ratings = neighbor_ratings[:limit]
    
    return neighbor_ratings

# Example usage
example_user_id = 'some_user_id'  # Replace with an actual user_id
example_movie_id = 2622           # Replace with an actual movie_id
limit = 5                         # Replace with the desired limit
neighbors_ratings = pgraph_rag_neighbors_ratings_only(example_user_id, example_movie_id, train_matrix, train_user_index, train_movie_index, limit)
print(f"Ratings by neighbors for movie {example_movie_id}: {neighbors_ratings}")


Ratings by neighbors for movie 2622: [('5131', 4), ('3971', 3), ('4127', 5), ('4271', 5), ('1263', 1)]


In [None]:
# Function to generate GPT-3.5-turbo completion
def generate_gpt_completion(context):
    prompt = f"Generate a rating from 1 to 5 for the movie from the following ratings. Ratings: {context}"
    
    response = openai.ChatCompletion.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-3.5-turbo",
    )
    
    return response['choices'][0]['message']['content']

# Function to get neighbors' ratings and generate GPT-3.5-turbo completion
def pgraph_rag_neighbors_ratings_only(user_id, movie_id, user_movie_matrix, user_index, movie_index, limit=None):
    # Retrieve the movie index
    movie_idx = movie_index.get(movie_id)
    if movie_idx is None:
        return []
    
    # Get the user indices who reviewed the movie
    user_indices = user_movie_matrix[:, movie_idx].nonzero()[0]
    
    # Get the corresponding user IDs and their ratings for the movie
    neighbor_ratings = [(list(user_index.keys())[list(user_index.values()).index(idx)], user_movie_matrix[idx, movie_idx]) for idx in user_indices]
    
    # If limit is specified, return only that many results
    if limit is not None:
        neighbor_ratings = neighbor_ratings[:limit]
    
    # Generate context string for the prompt
    context = "; ".join([f"User {user_id} rated it {rating}" for user_id, rating in neighbor_ratings])
    
    # Generate GPT-3.5-turbo completion
    gpt_description = generate_gpt_completion(context)
    
    return neighbor_ratings, gpt_description

# Example usage
example_user_id = 'some_user_id'  # Replace with an actual user_id
example_movie_id = 2622           # Replace with an actual movie_id
limit = 5                         # Replace with the desired limit
neighbors_ratings, gpt_description = pgraph_rag_neighbors_ratings_only(example_user_id, example_movie_id, train_matrix, train_user_index, train_movie_index, limit)
print(f"Ratings by neighbors for movie {example_movie_id}: {neighbors_ratings}")
print(f"GPT-3.5-turbo description: {gpt_description}")

In [6]:
def pgraph_rag_neighbors_ratings_and_user_ratings(user_id, movie_id, user_movie_matrix, user_index, movie_index):
    # Retrieve the ratings from neighbors
    neighbor_ratings = pgraph_rag_neighbors_ratings_only(user_id, movie_id, user_movie_matrix, user_index, movie_index)
    
    # Retrieve the user's own ratings
    user_idx = user_index.get(user_id)
    if user_idx is None:
        return neighbor_ratings
    
    user_ratings = [(movie, user_movie_matrix[user_idx, movie_index[movie]]) for movie in movie_index.keys() if user_movie_matrix[user_idx, movie_index[movie]] != 0]
    
    # Combine both sets of ratings
    combined_ratings = neighbor_ratings + user_ratings
    
    return combined_ratings

# Example usage
combined_ratings = pgraph_rag_neighbors_ratings_and_user_ratings(example_user_id, example_movie_id, train_matrix, train_user_index, train_movie_index)
print(f"Combined ratings for user {example_user_id} and movie {example_movie_id}: {combined_ratings}")


Combined ratings for user 2622 and movie 2622: [('1022', 4), ('3547', 4), ('4868', 3), ('4934', 3), ('5614', 4), ('5797', 4), ('2219', 5), ('2872', 3), ('19', 5), ('2298', 3), ('3074', 4), ('629', 4), ('1119', 4), ('2041', 3), ('1390', 3), ('5576', 2), ('5634', 3), ('5250', 3), ('4673', 3), ('2010', 4), ('5450', 4), ('2724', 1), ('2479', 5), ('5847', 4), ('3479', 5), ('5472', 3), ('817', 4), ('4064', 2), ('6039', 3), ('4436', 3), ('560', 5), ('704', 4), ('218', 4), ('2868', 4), ('2109', 4), ('1812', 5), ('3311', 4), ('1165', 2), ('4033', 5), ('5378', 4), ('754', 1), ('4439', 3), ('1926', 5), ('5401', 3), ('3539', 4), ('2878', 3), ('3618', 2), ('879', 3), ('78', 3), ('1450', 2), ('2199', 3), ('26', 3), ('6016', 3), ('789', 3), ('2611', 2), ('1969', 4), ('798', 5), ('5107', 4), ('4957', 4), ('3473', 5), ('3607', 4), ('5246', 2), ('2106', 4), ('3724', 3), ('1980', 3), ('3638', 2), ('532', 4), ('5881', 3), ('1354', 3), ('1271', 4), ('1425', 4), ('6024', 3), ('3229', 2), ('1391', 3), ('3836

In [7]:
def pgraph_multi_rag(user_id, movie_id, user_movie_matrix, user_index, movie_index):
    # Retrieve the ratings from neighbors
    neighbor_ratings = pgraph_rag_neighbors_ratings_only(user_id, movie_id, user_movie_matrix, user_index, movie_index)
    
    # Retrieve the user's own ratings
    user_idx = user_index.get(user_id)
    if user_idx is None:
        return neighbor_ratings, []
    
    user_ratings = [(movie, user_movie_matrix[user_idx, movie_index[movie]]) for movie in movie_index.keys() if user_movie_matrix[user_idx, movie_index[movie]] != 0]
    
    return neighbor_ratings, user_ratings

# Example usage
neighbors_ratings, user_ratings = pgraph_multi_rag(example_user_id, example_movie_id, train_matrix, train_user_index, train_movie_index)
print(f"Neighbor ratings for movie {example_movie_id}: {neighbors_ratings}")
print(f"User {example_user_id}'s ratings: {user_ratings}")


Neighbor ratings for movie 2622: [('1022', 4), ('3547', 4), ('4868', 3), ('4934', 3), ('5614', 4), ('5797', 4), ('2219', 5), ('2872', 3), ('19', 5), ('2298', 3), ('3074', 4), ('629', 4), ('1119', 4), ('2041', 3), ('1390', 3), ('5576', 2), ('5634', 3), ('5250', 3), ('4673', 3), ('2010', 4), ('5450', 4), ('2724', 1), ('2479', 5), ('5847', 4), ('3479', 5), ('5472', 3), ('817', 4), ('4064', 2), ('6039', 3), ('4436', 3), ('560', 5), ('704', 4), ('218', 4), ('2868', 4), ('2109', 4), ('1812', 5), ('3311', 4), ('1165', 2), ('4033', 5), ('5378', 4), ('754', 1), ('4439', 3), ('1926', 5), ('5401', 3), ('3539', 4), ('2878', 3), ('3618', 2), ('879', 3), ('78', 3), ('1450', 2), ('2199', 3), ('26', 3), ('6016', 3), ('789', 3), ('2611', 2), ('1969', 4), ('798', 5), ('5107', 4), ('4957', 4), ('3473', 5), ('3607', 4), ('5246', 2), ('2106', 4), ('3724', 3), ('1980', 3), ('3638', 2), ('532', 4), ('5881', 3), ('1354', 3), ('1271', 4), ('1425', 4), ('6024', 3), ('3229', 2), ('1391', 3), ('3836', 2), ('3032'

In [8]:
val_matrix, val_user_index, val_movie_index = create_user_movie_matrix(val_data)
test_matrix, test_user_index, test_movie_index = create_user_movie_matrix(test_data)