In [2]:
import json 
import os 

#repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
repo_root = os.path.abspath("..")
movie_watch_path = os.path.join(repo_root, 'processed_logs.json')
ratings_path = os.path.join(repo_root, 'rating_events.json')

# load movie play events
with open(movie_watch_path, 'r') as f:
    movie_watch_data = json.load(f)

# load ratings data 
with open(ratings_path, 'r') as f:
    ratings_data = json.load(f)

# Print the first two records of each dataset
print("Movie Watch Data Sample:", movie_watch_data[:2])  # Show first two records
print("Ratings Data Sample:", ratings_data[:2])

Movie Watch Data Sample: [{'type': 'movie_play', 'raw': '2025-01-01T23:40:43,102833,GET /data/m/the+thin+red+line+1998/0.mpg', 'movie_details': {'id': 'the+thin+red+line+1998', 'tmdb_id': 8741, 'imdb_id': 'tt0120863', 'title': 'The Thin Red Line', 'original_title': 'The Thin Red Line', 'adult': 'False', 'belongs_to_collection': {}, 'budget': '52000000', 'genres': [{'id': 18, 'name': 'Drama'}, {'id': 36, 'name': 'History'}, {'id': 10752, 'name': 'War'}], 'homepage': 'null', 'original_language': 'en', 'overview': 'Based on the graphic novel by James Jones, The Thin Red Line tells the story of a group of men, an Army Rifle company called C-for-Charlie, who change, suffer, and ultimately make essential discoveries about themselves during the fierce World War II battle of Guadalcanal. It follows their journey, from the surprise of an unopposed landing, through the bloody and exhausting battles that follow, to the ultimate departure of those who survived. A powerful frontline cast - includin

In [3]:
print(movie_watch_data[0]['userid'])

102833


### Movie Watch History

In [4]:
print(movie_watch_data[0].keys())

dict_keys(['type', 'raw', 'movie_details', 'userid'])


In [5]:
import pandas as pd 

watch_history = [
    {
        "user_id": event["userid"] if "userid" in event else None,
        "movie_id": event["movie_details"]["id"] if "movie_details" in event else -1,
        "timestamp": event["raw"].split(",")[0],
    }
    for event in movie_watch_data
]

watch_history_df = pd.DataFrame(watch_history)

cf_watch_history = watch_history_df.dropna(subset=["user_id"])

cb_watch_history = watch_history_df.fillna({"userid": "unknown_user"})

watch_histroy_csv = os.path.join(repo_root, 'test-data/movie_watch_history.csv')

# Save both versions
cf_watch_history.to_csv(os.path.join(repo_root, "test-data/cf_movie_watch_history.csv"), index=False)
cb_watch_history.to_csv(os.path.join(repo_root, "test-data/cb_movie_watch_history.csv"), index=False)


### Collaborative Filtering(CF)

In [6]:
import json
import pandas as pd
import os

# Load rating events data
repo_root = os.path.abspath("..")
ratings_path = os.path.join(repo_root, 'rating_events.json')

with open(ratings_path, 'r') as f:
    ratings_data = json.load(f)

# Convert to DataFrame and ensure unique user-movie pairs
cf_data = pd.DataFrame([
    {
        "user_id": event["user_details"]["user_id"],
        "movie_id": event["movieid"],
        "rating": int(event["rating"])  # Convert to integer
    }
    for event in ratings_data
]).drop_duplicates(subset=["user_id", "movie_id"])

# Get exact 80/20 split
total_rows = len(cf_data)
test_size = int(total_rows * 0.2)

cf_train_df = cf_data.sample(frac=0.8, random_state=42)
cf_test_df = cf_data.drop(cf_train_df.index)

# Ensure users and movies in test also exist in train (avoid cold-start issues)
cf_test_df = cf_test_df[
    (cf_test_df["user_id"].isin(cf_train_df["user_id"])) &
    (cf_test_df["movie_id"].isin(cf_train_df["movie_id"]))
]

# If test set is too small, retry by adjusting the split
max_retries = 5
retry_count = 0

while len(cf_test_df) < test_size * 0.8 and retry_count < max_retries:
    retry_count += 1
    print(f"⚠️ Retry {retry_count}: Adjusting split to maintain test size...")
    cf_train_df = cf_data.sample(frac=0.75, random_state=42)
    cf_test_df = cf_data.drop(cf_train_df.index)
    cf_test_df = cf_test_df[
        (cf_test_df["user_id"].isin(cf_train_df["user_id"])) &
        (cf_test_df["movie_id"].isin(cf_train_df["movie_id"]))
    ]

# Save train/test datasets
cf_train_csv = os.path.join(repo_root, "test-data/cf_train_data.csv")
cf_test_csv = os.path.join(repo_root, "test-data/cf_test_data.csv")

cf_train_df.to_csv(cf_train_csv, index=False)
cf_test_df.to_csv(cf_test_csv, index=False)

print(f"✅ CF Train Data: {len(cf_train_df)} rows")
print(f"✅ CF Test Data: {len(cf_test_df)} rows")
print(f"✅ CF Train Data saved to {cf_train_csv}")
print(f"✅ CF Test Data saved to {cf_test_csv}")


⚠️ Retry 1: Adjusting split to maintain test size...
⚠️ Retry 2: Adjusting split to maintain test size...
⚠️ Retry 3: Adjusting split to maintain test size...
⚠️ Retry 4: Adjusting split to maintain test size...
⚠️ Retry 5: Adjusting split to maintain test size...
✅ CF Train Data: 163 rows
✅ CF Test Data: 8 rows
✅ CF Train Data saved to c:\Users\maitr\OneDrive\Desktop\cmu-2025\ai-engg\m1\Random-Forrest-Gump\test-data/cf_train_data.csv
✅ CF Test Data saved to c:\Users\maitr\OneDrive\Desktop\cmu-2025\ai-engg\m1\Random-Forrest-Gump\test-data/cf_test_data.csv


In [25]:
print(len(cf_data))

217


### Content-Based Filtering(CB)

In [21]:
import json
import pandas as pd
import os

repo_root = os.path.abspath("..")

# Load movie watch history
# watch_history_path = os.path.join(repo_root, "test-data/movie_watch_history.csv")
# watch_history_df = pd.read_csv(watch_history_path)

# Load movie details from processed logs
logs_path = os.path.join(repo_root, 'processed_logs.json')

with open(logs_path, 'r') as f:
    logs_data = json.load(f)

# Normalize JSON data (if needed)
logs_df = pd.json_normalize(logs_data)  # Ensures movie_details is properly extracted

# Merge movie details
cb_data = watch_history_df.merge(
    logs_df[['movie_details.id', 'movie_details.genres', 'movie_details.popularity']],
    left_on="movie_id",
    right_on="movie_details.id",
    how="left"
)

# Perform an exact 80/20 split
total_rows = len(cb_data)
test_size = int(total_rows * 0.2)  # Ensure exactly 20% test
train_size = total_rows - test_size  # Ensure 80% train

cb_train_df = cb_data.sample(n=train_size, random_state=42)  # Sample exact 80%
cb_test_df = cb_data.drop(cb_train_df.index)  # Remaining 20%

# Ensure test users & movies exist in train set (avoid leakage)
cb_test_df = cb_test_df[
    (cb_test_df["user_id"].isin(cb_train_df["user_id"])) &
    (cb_test_df["movie_id"].isin(cb_train_df["movie_id"]))
]

# Save train/test datasets
cb_train_csv = os.path.join(repo_root, "test-data/cb_train_data.csv")
cb_test_csv = os.path.join(repo_root, "test-data/cb_test_data.csv")

cb_train_df.to_csv(cb_train_csv, index=False)
cb_test_df.to_csv(cb_test_csv, index=False)

print(f"CB Train Data: {len(cb_train_df)} rows")
print(f"CB Test Data: {len(cb_test_df)} rows")
print(f"CB Train Data saved to {cb_train_csv}")
print(f"CB Test Data saved to {cb_test_csv}")



CB Train Data: 4458748 rows
CB Test Data: 1114686 rows
CB Train Data saved to c:\Users\maitr\OneDrive\Desktop\cmu-2025\ai-engg\m1\Random-Forrest-Gump\test-data/cb_train_data.csv
CB Test Data saved to c:\Users\maitr\OneDrive\Desktop\cmu-2025\ai-engg\m1\Random-Forrest-Gump\test-data/cb_test_data.csv
