In [5]:
import pandas as pd
from sqlalchemy import create_engine
from time import time

### Movies

In [6]:
# Load the entity tables (one JSON record per line), in a fixed order.
movies, actors, directors, genres = (
    pd.read_json(source_path, lines=True)
    for source_path in (
        'separated_movies/movies.jsonl',
        'separated_movies/actors.jsonl',
        'separated_movies/directors.jsonl',
        'separated_movies/genres.jsonl',
    )
)

# Load the movie-to-entity link tables the same way.
movies_actors, movies_directors, movies_genres = (
    pd.read_json(source_path, lines=True)
    for source_path in (
        'separated_movies/movies_actors.jsonl',
        'separated_movies/movies_directors.jsonl',
        'separated_movies/movies_genres.jsonl',
    )
)

In [7]:
# Hold out a fixed number of movies (seeded, so the draw is repeatable) for
# streaming; every remaining movie goes to the initial DB load.
split_ratio = 0.05  # only referenced by the commented-out fractional variant
sample_size = 10
# Fractional variant: movies.sample(frac=split_ratio, random_state=42)
streaming_movies = movies.sample(random_state=42, n=sample_size)

# Drop the sampled rows by index label to obtain the DB set.
db_movies = movies.drop(index=streaming_movies.index)

In [8]:
# Row counts: all movies, streaming hold-out, DB set.
tuple(len(frame) for frame in (movies, streaming_movies, db_movies))

(84661, 10, 84651)

In [9]:
# Distinct item_ids of the sampled movies; every later split filters on these.
streaming_movie_ids = streaming_movies.loc[:, 'item_id'].unique()
streaming_movie_ids.size

10

In [14]:
# Keep only the link rows whose movie is NOT part of the streaming hold-out.
db_movies_actors, db_movies_directors, db_movies_genres = (
    link_table[~link_table['item_id'].isin(streaming_movie_ids)]
    for link_table in (movies_actors, movies_directors, movies_genres)
)

In [10]:
# Reload movies from the raw file and add three review counters, all starting
# at zero. Note: this rebinds streaming_movies, which previously held the
# sampled rows, to the full raw table until it is filtered again.
streaming_movies = pd.read_json(
    'separated_movies/raw_movies.jsonl', lines=True
).assign(
    positive_reviews=0,
    negative_reviews=0,
    neutral_reviews=0,
)

In [11]:
# Narrow the raw table to the sampled ids, then persist it as JSON lines.
streaming_movies = streaming_movies.loc[
    lambda frame: frame['item_id'].isin(streaming_movie_ids)
]
streaming_movies.to_json(
    'dags/streaming_data/streaming_movies.jsonl',
    orient='records',
    lines=True,
)

In [17]:
# Persist the DB-side movie table and its link tables as JSON lines.
for _frame, _destination in (
    (db_movies, 'db_data/db_movies.jsonl'),
    (db_movies_actors, 'db_data/db_movies_actors.jsonl'),
    (db_movies_directors, 'db_data/db_movies_directors.jsonl'),
    (db_movies_genres, 'db_data/db_movies_genres.jsonl'),
):
    _frame.to_json(_destination, orient='records', lines=True)


### Ratings

In [18]:
# Split the ratings file into a DB set and a streaming set: ratings whose
# item_id is in streaming_movie_ids go to the streaming file, every other
# rating goes to the DB file.
chunk_size = 100000  # Number of records per chunk

input_file = 'separated_movies/ratings.jsonl'
output_db_file = 'db_data/db_ratings.jsonl'
output_streaming_file = 'dags/streaming_data/streaming_ratings.jsonl'

# Open each output exactly once in write mode: this truncates the result of
# any previous run and lets every chunk go through the same handle, so no
# first-write/append bookkeeping (and no reopen per chunk) is needed.
with open(output_db_file, 'w') as db_file, \
        open(output_streaming_file, 'w') as streaming_file:
    # Read the input one chunk of chunk_size records at a time
    for chunk in pd.read_json(input_file, lines=True, chunksize=chunk_size):
        # Compute the membership mask once and reuse it for both halves
        is_streaming = chunk['item_id'].isin(streaming_movie_ids)
        filtered_chunk = chunk[~is_streaming]
        streaming_chunk = chunk[is_streaming]

        # Skip empty halves: depending on the pandas version, serialising an
        # empty frame with lines=True can still emit a bare newline, which
        # would leave blank lines in the JSONL output.
        if not filtered_chunk.empty:
            filtered_chunk.to_json(db_file, orient='records', lines=True)
        if not streaming_chunk.empty:
            streaming_chunk.to_json(streaming_file, orient='records', lines=True)

### Reviews

In [19]:
# Split the reviews file into a DB set and a streaming set: reviews whose
# item_id is in streaming_movie_ids go to the streaming file, every other
# review goes to the DB file.
chunk_size = 100000  # Number of records per chunk

input_file = 'separated_movies/reviews.jsonl'
output_db_file = 'db_data/db_reviews.jsonl'
output_streaming_file = 'dags/streaming_data/streaming_reviews.jsonl'

# Open each output exactly once in write mode: this truncates the result of
# any previous run and lets every chunk go through the same handle, so no
# first-write/append bookkeeping (and no reopen per chunk) is needed.
with open(output_db_file, 'w') as db_file, \
        open(output_streaming_file, 'w') as streaming_file:
    # Read the input one chunk of chunk_size records at a time
    for chunk in pd.read_json(input_file, lines=True, chunksize=chunk_size):
        # Compute the membership mask once and reuse it for both halves
        is_streaming = chunk['item_id'].isin(streaming_movie_ids)
        filtered_chunk = chunk[~is_streaming]
        streaming_chunk = chunk[is_streaming]

        # Skip empty halves: depending on the pandas version, serialising an
        # empty frame with lines=True can still emit a bare newline, which
        # would leave blank lines in the JSONL output.
        if not filtered_chunk.empty:
            filtered_chunk.to_json(db_file, orient='records', lines=True)
        if not streaming_chunk.empty:
            streaming_chunk.to_json(streaming_file, orient='records', lines=True)

In [3]:
# first_100k_reviews = pd.read_json('db_data/db_reviews.jsonl', lines=True, nrows=100000)
# first_100k_reviews.to_json('small_db_data/db_reviews_100k.jsonl', orient='records', lines=True)

In [20]:
# Read back the streaming ratings that were just written and count them.
streaming_ratings = pd.read_json(
    'dags/streaming_data/streaming_ratings.jsonl',
    lines=True,
)
streaming_ratings.shape[0]

2330

In [24]:
# Number of streaming ratings per movie, most-rated first.
streaming_ratings['item_id'].value_counts()

item_id
4255      1978
170777     311
152131      16
130480      11
200400       5
180571       3
173031       3
186729       2
167458       1
Name: count, dtype: int64

In [22]:
# Read back the streaming reviews that were just written and count them.
streaming_reviews = pd.read_json(
    'dags/streaming_data/streaming_reviews.jsonl',
    lines=True,
)
streaming_reviews.shape[0]

679

In [25]:
# Number of streaming reviews per movie, most-reviewed first.
streaming_reviews['item_id'].value_counts()

item_id
4255      637
130480     20
152131     11
173031      5
170777      4
148610      2
Name: count, dtype: int64

In [29]:
# Concatenate and keep only unique item_ids
unique_item_ids = pd.concat([streaming_ratings['item_id'], streaming_reviews['item_id']]).drop_duplicates().reset_index(drop=True)

# Display the resulting unique item_ids
print(unique_item_ids)

0      4255
1    170777
2    130480
3    152131
4    180571
5    173031
6    200400
7    186729
8    167458
9    148610
Name: item_id, dtype: int64
