In [6]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import LinearRegression

# Load the restructured JSON data. JSON text is UTF-8, so the encoding is
# given explicitly rather than inherited from the platform default.
with open('restructured_user_data.json', 'r', encoding='utf-8') as json_file:
    user_data = json.load(json_file)

MIN_REVIEWS = 5  # Minimum number of reviews required
SPLIT_SEED = 42  # Seed shared by both splits below so reruns are reproducible

# Filter users with enough reviews
filtered_user_data = {
    user_id: data
    for user_id, data in user_data.items()
    if len(data['ratings']) >= MIN_REVIEWS
}

# Get all user IDs
user_ids = list(filtered_user_data.keys())

# Split the user IDs: 30% is held out first, then that held-out part is
# halved into validation and test.
train_ids, temp_ids = train_test_split(user_ids, test_size=0.3, random_state=SPLIT_SEED)
val_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=SPLIT_SEED)

# Create dictionaries for each split (user ID -> list of rating dicts).
# Each list is copied because later steps remove items from the split lists
# in place; without the copy those edits would also alter the lists stored
# in filtered_user_data.
train_data = {user_id: list(filtered_user_data[user_id]['ratings']) for user_id in train_ids}
val_data = {user_id: list(filtered_user_data[user_id]['ratings']) for user_id in val_ids}
test_data = {user_id: list(filtered_user_data[user_id]['ratings']) for user_id in test_ids}

# Ensure each movie is represented in the training set
def _collect_movie_ids(rating_lists):
    """Return the set of movie IDs found in an iterable of rating lists."""
    return {rating['movie_id'] for ratings in rating_lists for rating in ratings}


def _move_first_rating(movie_id, source_split, target_split):
    """Move the first rating of ``movie_id`` from one split into another.

    Users are scanned in the iteration order of ``source_split`` and each
    user's ratings in list order. The first matching rating dict is appended
    to ``target_split[user_id]`` (created if absent) and deleted from the
    source user's list.

    Args:
        movie_id: Movie ID to look for in each rating's ``'movie_id'`` key.
        source_split: Dict of user ID -> list of rating dicts; mutated.
        target_split: Dict of user ID -> list of rating dicts; mutated.

    Returns:
        True if a rating was moved, False if no rating matched.
    """
    for user_id, ratings in source_split.items():
        for position, rating in enumerate(ratings):
            if rating['movie_id'] == movie_id:
                target_split.setdefault(user_id, []).append(rating)
                del ratings[position]
                # Return immediately: the list was just modified mid-scan.
                return True
    return False


# Extract all unique movie IDs
all_movie_ids = _collect_movie_ids(
    data['ratings'] for data in filtered_user_data.values()
)

# Check if all movies are in the training set
train_movie_ids = _collect_movie_ids(train_data.values())

# If not all movies are in the training set, adjust the splits
missing_movie_ids = all_movie_ids - train_movie_ids
if missing_movie_ids:
    print(f"Missing {len(missing_movie_ids)} movies in training set. Adjusting splits...")
    # NOTE(review): only the single matching rating is moved, so the user then
    # has an entry in the training split while their remaining ratings stay in
    # the test/validation split. Confirm this cross-split overlap is intended.
    #
    # Iterate in sorted order: set iteration order is not stable across
    # interpreter runs for string IDs, and the order of moves determines the
    # order of ratings appended to train_data.
    for movie_id in sorted(missing_movie_ids, key=str):
        # First try to move from the test set; fall back to the validation set
        found = (
            _move_first_rating(movie_id, test_data, train_data)
            or _move_first_rating(movie_id, val_data, train_data)
        )

# Function to extract prediction targets and remove them from the data split
def extract_prediction_targets(data_split):
    """Hold out one rating per user and reshape the split in place.

    For every user in ``data_split`` the first rating is removed from the
    user's rating list and becomes the prediction target. The user's entry is
    then replaced by a record containing the prompt built from that rating's
    title and genres, plus the remaining ratings.

    Users whose rating list is empty have nothing to hold out. They are
    removed from ``data_split`` so every remaining entry has the same record
    shape and corresponds to exactly one returned target.

    Args:
        data_split: Dict mapping user ID to a list of rating dicts. The
            held-out rating is read for the keys ``'rating'``, ``'title'`` and
            ``'genres'`` (a ``'|'``-separated string). Mutated in place.

    Returns:
        List of ``{'id': user_id, 'output': rating_value}`` dicts, one per
        user that had at least one rating, in the iteration order of
        ``data_split``.

    Raises:
        KeyError: If the held-out rating lacks ``'rating'``, ``'title'`` or
            ``'genres'``.
    """
    prediction_targets = []

    # Iterate over a snapshot of the keys: entries are replaced or deleted
    # while looping.
    for user_id in list(data_split):
        ratings = data_split[user_id]
        if not ratings:
            # No rating to hold out; drop the user instead of leaving a bare
            # list among the record dicts.
            del data_split[user_id]
            continue

        # Read every needed field before removing the rating, so a malformed
        # rating raises without having already been dropped from the list.
        held_out = ratings[0]
        target_value = held_out['rating']
        title = held_out['title']
        genre_text = held_out['genres'].replace('|', ', ')
        del ratings[0]

        prediction_targets.append({
            'id': user_id,
            'output': target_value
        })
        prompt = f"What rating will the user give the movie from 1 - 5 based on the movie title and categories? Title: {title} Genre: {genre_text}"
        # Replace the raw rating list with the record written to disk later.
        data_split[user_id] = {
            'id': user_id,
            'input': prompt,
            'ratings': ratings
        }

    return prediction_targets

# Hold out one target per user from each split, in train / validation / test
# order. extract_prediction_targets also rewrites each split dict in place.
train_targets, val_targets, test_targets = [
    extract_prediction_targets(split_records)
    for split_records in (train_data, val_data, test_data)
]

# Define NumpyEncoder to handle numpy data types
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""

    def default(self, obj):
        """Convert a NumPy object the base encoder cannot serialise.

        Args:
            obj: The object ``json`` could not encode natively.

        Returns:
            ``int`` for NumPy integers, ``float`` for NumPy floats, a nested
            list for arrays, ``bool`` for ``np.bool_``. For ``np.datetime64``
            it returns an ISO 8601 string, or ``None`` (JSON ``null``) for NaT.

        Raises:
            TypeError: Raised by the base class for any other unsupported type.
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.datetime64):
            if np.isnat(obj):
                return None
            native = obj.item()
            if hasattr(native, 'isoformat'):
                return native.isoformat()
            # item() yields a plain int when the value cannot be expressed as
            # a datetime object (e.g. nanosecond precision); fall back to
            # NumPy's own ISO 8601 formatting.
            return str(np.datetime_as_string(obj))
        return super().default(obj)

# Define a function to save data to JSON
def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as JSON with four-space indentation.

    NumPy values inside ``data`` are converted by ``NumpyEncoder``. The file
    is opened in write mode, so an existing file is overwritten.
    """
    dump_options = {'indent': 4, 'cls': NumpyEncoder}
    with open(filename, mode='w') as output_handle:
        json.dump(data, output_handle, **dump_options)

# Write the three data splits first, then their prediction targets. The
# splits are dicts keyed by user ID, so only their values are serialised.
_json_outputs = (
    (list(train_data.values()), 'train_data.json'),
    (list(val_data.values()), 'val_data.json'),
    (list(test_data.values()), 'test_data.json'),
    (train_targets, 'train_predictions.json'),
    (val_targets, 'val_predictions.json'),
    (test_targets, 'test_predictions.json'),
)
for _payload, _output_path in _json_outputs:
    save_to_json(_payload, _output_path)

print("Data splits and prediction targets saved to JSON files.")


Missing 35 movies in training set. Adjusting splits...
Data splits and prediction targets saved to JSON files.


In [None]:
# Turn each split into (features, targets) for the regression model.
# NOTE(review): convert_to_features_targets is not defined in the cells shown
# here; it must be defined or imported before this cell runs. These
# assignments also rebind train_targets / val_targets / test_targets.
train_features, train_targets = convert_to_features_targets(train_data)
val_features, val_targets = convert_to_features_targets(val_data)
test_features, test_targets = convert_to_features_targets(test_data)

# Gather the title and genre strings of every split (train, then validation,
# then test).
combined_features = train_features + val_features + test_features
all_titles = [entry['title'] for entry in combined_features]
all_genres = [entry['genres'] for entry in combined_features]

# One shared vocabulary for titles and genres, learned from all three splits.
vectorizer = CountVectorizer().fit(all_titles + all_genres)

# Function to prepare data for the model
def prepare_model_data(features):
    """Build the model input matrix for a list of feature dicts.

    Each dict's ``'title'`` and ``'genres'`` strings are transformed with the
    module-level ``vectorizer``. The two resulting matrices are stacked
    horizontally, title columns first and genre columns second.
    """
    encoded_blocks = [
        vectorizer.transform([entry[field] for entry in features])
        for field in ('title', 'genres')
    ]
    return hstack(encoded_blocks)

# Vectorise every split with the fitted vectorizer
X_train, X_val, X_test = [
    prepare_model_data(split_features)
    for split_features in (train_features, val_features, test_features)
]

# Fit a linear-regression baseline on the training split
model = LinearRegression()
model.fit(X_train, train_targets)

# Score the fitted model on train and validation (LinearRegression.score
# reports R^2). X_test is prepared above but not scored in this cell.
train_score, val_score = (
    model.score(X_train, train_targets),
    model.score(X_val, val_targets),
)

print(f'Train Score: {train_score}')
print(f'Validation Score: {val_score}')