## MOVIE DATASET FAKER GENERATION

In [None]:
import json
from faker import Faker
import random
from datetime import datetime

# Shared Faker instance that the generator functions below call for synthetic
# names, titles, e-mail addresses, free text, and timestamps.
fake = Faker()

# Next sequence number to hand out for each ID family. The generator
# functions read and advance these module-level counters.
(
    movie_id_counter,
    actor_id_counter,
    user_id_counter,
    review_id_counter,
    watch_id_counter,
) = (1, 1, 1, 1, 1)

def generate_id(prefix, counter):
    """Return ``prefix`` followed by ``counter`` zero-padded to at least 3 characters.

    Counters wider than three digits are kept in full, so IDs grow in
    length rather than being truncated (e.g. ``"M001"``, ``"M12345"``).
    """
    padded_number = str(counter).zfill(3)
    return "{}{}".format(prefix, padded_number)

def generate_movies(num_movies):
    """Generate the Movies collection.

    Builds ``num_movies`` movie documents. Each one gets the next sequential
    movie ID, a three-word-ish fake title with periods stripped, two distinct
    genres, a release year in 1980-2023, and a cast of 2-5 entries whose
    actor IDs are freshly allocated.

    Advances the module-level ``movie_id_counter`` and ``actor_id_counter``.

    Returns:
        A list of movie dicts.
    """
    global movie_id_counter, actor_id_counter
    genre_pool = ["Action", "Drama", "Comedy", "Sci-Fi", "Romance"]
    catalog = []
    for _ in range(num_movies):
        new_movie_id = generate_id("M", movie_id_counter)
        movie_id_counter += 1

        # Every cast slot consumes one brand-new actor ID, so the actor
        # counter moves forward by the cast size.
        cast_size = random.randint(2, 5)
        cast_members = [
            {
                "actor_id": generate_id("A", actor_id_counter + offset),
                "character": fake.first_name(),
            }
            for offset in range(cast_size)
        ]
        actor_id_counter += cast_size

        raw_title = fake.sentence(nb_words=3)
        catalog.append({
            "movie_id": new_movie_id,
            "title": raw_title.replace(".", ""),
            "genre": random.sample(genre_pool, k=2),
            "release_year": random.randint(1980, 2023),
            "cast": cast_members,
        })
    return catalog

def generate_users(num_users):
    """Generate the Users collection.

    Builds ``num_users`` user documents. Each one gets the next sequential
    user ID, a fake username and e-mail, a ``preferences`` sub-document (two
    preferred genres plus a notification flag), and an embedded
    ``watch_history`` of 1-5 entries.

    Watch-history entries reference movie IDs drawn from the range already
    issued, so movies must have been generated first. Advances the
    module-level ``user_id_counter``.

    Returns:
        A list of user dicts.
    """
    global user_id_counter
    genre_pool = ["Action", "Drama", "Comedy", "Sci-Fi", "Romance"]
    device_pool = ["Mobile", "Smart TV", "Tablet", "Laptop"]
    accounts = []
    for _ in range(num_users):
        new_user_id = generate_id("U", user_id_counter)
        user_id_counter += 1

        # movie_id_counter - 1 is the highest movie number handed out so far.
        viewing_log = [
            {
                "movie_id": generate_id("M", random.randint(1, movie_id_counter - 1)),
                "watch_date": fake.date_time_this_decade().isoformat(),
                "duration_watched": random.randint(30, 180),
                "device": random.choice(device_pool),
            }
            for _ in range(random.randint(1, 5))
        ]

        login_name = fake.user_name()
        email_address = fake.email()
        user_preferences = {
            "preferred_genres": random.sample(genre_pool, k=2),
            "notifications_enabled": fake.boolean(),
        }
        accounts.append({
            "user_id": new_user_id,
            "username": login_name,
            "email": email_address,
            "preferences": user_preferences,
            "watch_history": viewing_log,
        })
    return accounts

def generate_reviews(num_reviews):
    """Generate the Reviews collection.

    Builds ``num_reviews`` review documents. Each one gets the next
    sequential review ID, a randomly chosen existing movie ID and user ID,
    up to 100 characters of fake text, a 1-5 rating, and like/dislike counts.

    Movies and users must have been generated first, because IDs are drawn
    from the ranges already issued. Advances the module-level
    ``review_id_counter``.

    Returns:
        A list of review dicts.
    """
    global review_id_counter
    collected = []
    for _ in range(num_reviews):
        new_review_id = generate_id("R", review_id_counter)
        review_id_counter += 1

        # Pick targets among the IDs that already exist (counter - 1 is the
        # highest number issued so far).
        reviewed_movie = generate_id("M", random.randint(1, movie_id_counter - 1))
        reviewer = generate_id("U", random.randint(1, user_id_counter - 1))
        body_text = fake.text(max_nb_chars=100)
        stars = random.randint(1, 5)
        like_count = random.randint(0, 500)
        dislike_count = random.randint(0, 50)

        collected.append({
            "review_id": new_review_id,
            "movie_id": reviewed_movie,
            "user_id": reviewer,
            "review_text": body_text,
            "rating": stars,
            "reactions": {"likes": like_count, "dislikes": dislike_count},
        })
    return collected

def generate_watch_history(num_watch_history):
    """Generate the WatchHistory collection.

    Builds ``num_watch_history`` watch events. Each one gets the next
    sequential watch ID, a randomly chosen existing user ID and movie ID, an
    ISO-formatted timestamp from the current decade, a device, and a
    playback quality.

    Movies and users must have been generated first, because IDs are drawn
    from the ranges already issued. Advances the module-level
    ``watch_id_counter``.

    Returns:
        A list of watch-event dicts.
    """
    global watch_id_counter
    device_pool = ["Mobile", "Smart TV", "Tablet", "Laptop"]
    quality_pool = ["HD", "SD", "4K"]
    events = []
    for _ in range(num_watch_history):
        new_watch_id = generate_id("WH", watch_id_counter)
        watch_id_counter += 1

        # counter - 1 is the highest number issued so far for each ID family.
        viewer = generate_id("U", random.randint(1, user_id_counter - 1))
        watched_movie = generate_id("M", random.randint(1, movie_id_counter - 1))
        watched_at = fake.date_time_this_decade().isoformat()
        used_device = random.choice(device_pool)
        stream_quality = random.choice(quality_pool)

        events.append({
            "watch_id": new_watch_id,
            "user_id": viewer,
            "movie_id": watched_movie,
            "watch_date": watched_at,
            "device": used_device,
            "quality": stream_quality,
        })
    return events

def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as JSON indented by four spaces.

    Args:
        data: A JSON-serializable object (here, a list of record dicts).
        filename: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialize.
        OSError: If the file cannot be opened for writing.
    """
    # Pin the encoding so the output does not depend on the platform's
    # default locale encoding.
    with open(filename, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)


# Number of documents to produce for every collection.
NUM_RECORDS = 100_000

# Order matters: users, reviews, and watch history pick movie IDs from the
# range already issued, and reviews / watch history do the same for user IDs.
movies = generate_movies(num_movies=NUM_RECORDS)
users = generate_users(num_users=NUM_RECORDS)
reviews = generate_reviews(num_reviews=NUM_RECORDS)
watch_history = generate_watch_history(num_watch_history=NUM_RECORDS)

# One JSON file per collection, written to the current working directory.
save_to_json(data=movies, filename='movies.json')
save_to_json(data=users, filename='users.json')
save_to_json(data=reviews, filename='reviews.json')
save_to_json(data=watch_history, filename='watch_history.json')

print("Data generation and saving as JSON files completed!")

### DATASET OVERVIEW

The dataset used in this project was generated using the Faker library to simulate real-world data for four collections: Movies, Users, Reviews, and WatchHistory. Each collection played a distinct role in the project, with specific fields required to meet the needs of the tasks.

- **Movies Collection:** This collection contained details about movies, including fields like movie_id, title, genre, release_date, duration, rating, and more. The movies data allowed for tracking of various movie attributes, enabling filtering and querying based on genres, release dates, and user preferences.

- **Users Collection:** The users collection consisted of user-related information, including user_id, username, email, and a preferences object holding preferred_genres and notifications_enabled, along with an embedded watch_history list. The user data helped identify individual users, their content preferences (e.g., preferred genres), and notification settings.

- **Reviews Collection:** This collection included user reviews for movies, with fields such as review_id, user_id, movie_id, review_text, rating, and review_date. Reviews were linked to both users and movies, offering a platform for analyzing user feedback on movies and movies' overall reception.

- **WatchHistory Collection:** The watch history collection recorded user activity, including fields like watch_id, user_id, movie_id, watch_date, device, quality, and watched_recently. This collection tracked the movies that users had watched, their device choices (e.g., Smart TV or Laptop), and other related data to filter and update user activity based on recent watching behavior.

**THE FAKER LIBRARY (using the code above) was used to generate 100,000 records for each collection, creating a large synthetic dataset. This dataset was then modified as needed to ensure it aligns with specific tasks, such as adding fields, ensuring user preferences are met, or updating the watch history to reflect user activity within certain time frames. The generated data offers a foundation for running various queries and updates to simulate real-world interactions in a media platform environment.**