## Data Loading & Cleaning

In [1]:
import json

def load_data(filename):
    """Read a JSON document from disk and return the decoded object.

    Parameters
    ----------
    filename : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file content.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # Pin the encoding: without it, open() falls back to the platform locale
    # and non-ASCII text can be mis-decoded or rejected on some systems.
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)

# Load the dataset into `data`; the cleaning steps further down modify this dict in place.
data = load_data("movieverse_data.json")
# Bare trailing expression so the notebook displays the loaded structure.
data

{'users': [{'user_id': 'u001',
   'name': 'Ayaan',
   'ratings': {'m001': 5, 'm002': 4, 'm003': None},
   'friends': ['u002', 'u004']},
  {'user_id': 'u002',
   'name': 'Fatima',
   'ratings': {'m001': 4, 'm003': 2, 'm005': 5},
   'friends': ['u001', 'u003', '']},
  {'user_id': 'u003',
   'name': 'Kabir',
   'ratings': {'m002': 5, 'm004': 3, 'm005': None},
   'friends': ['u002']},
  {'user_id': 'u004', 'name': 'Zara', 'ratings': {}, 'friends': ['u001']}],
 'movies': [{'movie_id': 'm001',
   'title': 'Inception',
   'genres': ['Sci-Fi', 'Thriller'],
   'tags': ['dream', 'heist', 'mind-bending']},
  {'movie_id': 'm002',
   'title': 'The Dark Knight',
   'genres': ['Action', 'Drama'],
   'tags': ['Batman', 'JOKER', 'crime']},
  {'movie_id': 'm003',
   'title': 'Interstellar',
   'genres': 'Sci-Fi, Adventure',
   'tags': ['space', 'Time', 'Wormhole']},
  {'movie_id': 'm004',
   'title': 'The Social Network',
   'genres': ['Biography', 'Drama'],
   'tags': None},
  {'movie_id': 'm005',
   '

In [3]:
# ---------- 1. DATA CLEANING ----------

# a. Remove every rating entry whose value is None
for user in data['users']:
    kept_ratings = {}
    for mid, rating in user['ratings'].items():
        if rating is None:
            continue
        kept_ratings[mid] = rating
    user['ratings'] = kept_ratings

# b. Keep only the first record seen for each movie_id
seen = set()
unique_movies = []
for movie in data['movies']:
    movie_id = movie['movie_id']
    if movie_id in seen:
        # A record with this id was already kept; skip the repeat.
        continue
    seen.add(movie_id)
    unique_movies.append(movie)
data['movies'] = unique_movies

# c. Standardize genres and tags
def _normalize_labels(raw):
    """Return ``raw`` as a list of stripped, lowercase strings.

    Parameters
    ----------
    raw : None, str, or iterable
        None yields an empty list. A string is treated as a comma-separated
        list. Any other value is iterated as-is.

    Returns
    -------
    list of str
        Cleaned labels in their original order. Non-string and blank
        entries are dropped.
    """
    if raw is None:
        return []
    if isinstance(raw, str):
        # Split first so a plain string is never iterated character by character.
        raw = raw.split(',')
    return [
        label.strip().lower()
        for label in raw
        if isinstance(label, str) and label.strip()
    ]


for movie in data['movies']:
    # Both fields go through the same helper so None, comma-separated strings
    # and lists are handled identically; a missing key becomes an empty list.
    movie['genres'] = _normalize_labels(movie.get('genres'))
    movie['tags'] = _normalize_labels(movie.get('tags'))

# d. Remove invalid/empty friend IDs
# Only non-blank strings are kept. The isinstance check comes first because
# calling .strip() on a JSON null or a number would raise AttributeError.
for user in data['users']:
    user['friends'] = [
        fid
        for fid in (user.get('friends') or [])  # tolerate a missing/null list
        if isinstance(fid, str) and fid.strip()
    ]

# e. Mark users that currently have zero ratings as needing onboarding
for user in data['users']:
    rating_count = len(user['ratings'])
    user['needs_onboarding'] = (rating_count == 0)

# ---------- 2. USER PROFILING ----------

# Index movies by id so rated movie ids can be resolved to movie records
movie_map = {}
for movie in data['movies']:
    movie_map[movie['movie_id']] = movie


def _tally(counts, labels):
    """Add one to ``counts[label]`` for every label in ``labels``."""
    for label in labels:
        counts[label] = counts.get(label, 0) + 1


# A profile stores, per user, how many rated movies carry each genre and tag
user_profiles = {}
for user in data['users']:
    genre_count, tag_count = {}, {}
    for mid in user['ratings']:
        movie = movie_map.get(mid)
        if movie:
            # Rated ids with no matching movie record are simply ignored.
            _tally(genre_count, movie['genres'])
            _tally(tag_count, movie['tags'])
    user_profiles[user['user_id']] = {"genres": genre_count, "tags": tag_count}

# ---------- 3. MOVIE RECOMMENDATION ----------

def recommend_movies(user_id, top_n=3):
    """Rank the movies a user has not rated by overlap with their profile.

    A movie's score is the sum of the user's profile counts for each of the
    movie's genres plus each of its tags. Reads the module-level
    ``user_profiles`` and ``data``.

    Parameters
    ----------
    user_id : str
        Id of the user to recommend for.
    top_n : int, default 3
        Slice bound applied to the ranked list.

    Returns
    -------
    list of str
        Titles in descending score order; ties keep the order of
        ``data['movies']``. Empty when the user has no profile or no
        user record.
    """
    profile = user_profiles.get(user_id)
    if not profile:
        return []

    genre_weights = profile['genres']
    tag_weights = profile['tags']

    user = None
    for candidate in data['users']:
        if candidate['user_id'] == user_id:
            user = candidate
            break
    if not user:
        return []

    already_rated = set(user['ratings'])

    ranked = []
    for movie in data['movies']:
        if movie['movie_id'] in already_rated:
            continue
        # Accumulate genre matches first, then continue the same running
        # total with the tag matches.
        score = sum((genre_weights.get(g, 0) for g in movie['genres']), 0)
        score = sum((tag_weights.get(t, 0) for t in movie['tags']), score)
        ranked.append((score, movie['title']))

    # sorted() is stable, so equal scores stay in catalogue order.
    ranked = sorted(ranked, key=lambda pair: pair[0], reverse=True)
    return [title for _, title in ranked[:top_n]]

# ---------- 4. movies_you_may_like() ----------

def movies_you_may_like(user_id):
    """List titles rated by a user's friends that the user has not rated.

    For every friend id that resolves to a user record, each of that friend's
    ratings for a movie the user has not rated is added to a per-movie total.
    Reads the module-level ``data`` and ``movie_map``.

    Parameters
    ----------
    user_id : str
        Id of the user to build suggestions for.

    Returns
    -------
    list of str
        Titles ordered by descending rating total; ties keep first-seen
        order. Movie ids absent from ``movie_map`` are left out. Empty when
        the user is not found.
    """
    def find_user(uid):
        # First record whose user_id matches, or None.
        for record in data['users']:
            if record['user_id'] == uid:
                return record
        return None

    user = find_user(user_id)
    if not user:
        return []

    already_rated = set(user['ratings'])
    totals = {}

    for friend_id in user['friends']:
        friend = find_user(friend_id)
        if not friend:
            continue
        for mid, rating in friend['ratings'].items():
            if mid in already_rated:
                continue
            totals[mid] = totals.get(mid, 0) + rating

    # Sorting the keys by their total is stable, so ties keep insertion order.
    ranked_ids = sorted(totals, key=totals.get, reverse=True)
    return [movie_map[mid]['title'] for mid in ranked_ids if mid in movie_map]

# ---------- 5. TEST OUTPUT ----------

# Look up the user record for u004 once, then print its onboarding flag.
zara = next(u for u in data['users'] if u['user_id'] == "u004")
print("Zara needs onboarding:", zara['needs_onboarding'])

# Both suggestion functions are exercised for the same user id.
ayaan_id = "u001"
print("Recommendations for Ayaan:", recommend_movies(ayaan_id))
print("Pages Ayaan May Like:", movies_you_may_like(ayaan_id))

Zara needs onboarding: True
Recommendations for Ayaan: ['Parasite', 'Interstellar', 'The Social Network']
Pages Ayaan May Like: ['Parasite', 'Interstellar']
