In [15]:
import pandas as pd
import os
import json
from imdb import Cinemagoer
import requests
from io import BytesIO
from PIL import Image
from concurrent.futures import ThreadPoolExecutor, as_completed


In [16]:
from loader_clone import create_loaders

# Build both loaders up front; later cells read `experiment_loader.movies_df`
# to decide which movies need plots and covers.
demo_loader, experiment_loader = create_loaders()

2017


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loader.ratings_df.loc[:, "ratings_per_year"] = loader.ratings_df['movieId'].map(loader.ratings_df['movieId'].value_counts()) / loader.ratings_df['movieId'].map(movies_df_indexed["age"])


Ratings shape after filtering: (3536742, 5), n_users = 9612, n_items = 1525
2017


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loader.ratings_df.loc[:, "ratings_per_year"] = loader.ratings_df['movieId'].map(loader.ratings_df['movieId'].value_counts()) / loader.ratings_df['movieId'].map(movies_df_indexed["age"])


Ratings shape after filtering: (8146440, 5), n_users = 34683, n_items = 9456


# Plots

This section contains the code that enriches the dataset with movie plots.  
*Note: A portion of the movie plots was provided by previous work.*

In [17]:
# Seed the `plot` column from the plot archive supplied by previous work.
# Source: https://osf.io/chbj9/files/osfstorage/675a1d242d78acf176ea0433
with open('movie_data_plot.json', 'r', encoding='utf-8') as plot_file:
    plot_data = json.loads(plot_file.read())

# Movies table that receives the `plot` column.
df = pd.read_csv('movies.csv')

def get_plot(row, plots=None):
    """Return the plot text to store for one row of the movies table.

    An existing plot is kept as-is. Otherwise the plot is looked up in the
    provided-plots mapping under ``str(row['movieId'])`` and the first entry
    of its ``"plot"`` list is used. ``"X"`` marks a movie without a plot.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pd.Series`` from ``df.apply(..., axis=1)``)
        Must provide ``movieId``; ``plot`` is optional.
    plots : dict, optional
        Mapping of movie-id strings to records holding a ``"plot"`` list.
        Defaults to the module-level ``plot_data``.

    Returns
    -------
    str
        The existing plot, the first provided plot, or ``"X"``.

    Note: a later cell of this notebook defines another ``get_plot`` (taking a
    movie id) which shadows this one once that cell has run.
    """
    if plots is None:
        plots = plot_data

    existing = row.get('plot', '')
    # A blank CSV cell is read back as NaN; str(NaN) is the truthy text "nan"
    # and would otherwise be kept as if it were a real plot.
    if not pd.isna(existing):
        current_plot = str(existing).strip()
        if current_plot and current_plot != "X":
            return current_plot  # Keep existing plot

    movie_str_id = str(row['movieId'])
    plot_list = plots.get(movie_str_id, {}).get("plot", [])
    first_plot = plot_list[0] if plot_list else ""
    # An empty first entry counts as "no plot" so it stays marked with "X"
    # instead of being written out as a blank cell.
    return first_plot or "X"

# Fill the plot column row by row; get_plot keeps any plot that is already set.
df['plot'] = df.apply(get_plot, axis=1)

# Writes back to the same file that was read above, so re-running the cell
# starts from the already-enriched table.
df.to_csv('movies.csv', index=False)

In [18]:
# Download the remaining plots...

# Only movies that belong to the experiment set are looked up.
exp_movies = experiment_loader.movies_df
exp_movie_ids = set(exp_movies['movieId'])

# Plot table that is updated and written back to this same file below.
# NOTE(review): the previous cell writes 'movies.csv', not this file; confirm
# how 'movies_plot_updated.csv' is produced.
movies_plot_df = pd.read_csv('movies_plot_updated.csv', dtype={'movieId': int})

# movieId -> imdbId mapping; imdbId is read as text and left-padded with zeros
# to 7 characters.
links_df = pd.read_csv('links.csv', dtype={'movieId': int, 'imdbId': str})
links_df['imdbId'] = links_df['imdbId'].str.zfill(7)
movie_to_imdb = dict(zip(links_df['movieId'], links_df['imdbId']))

# Client used by get_plot below to fetch movie data.
cg = Cinemagoer()

def get_plot(movie_id, imdb_lookup=None, client=None):
    """Fetch the first plot summary of a movie.

    Parameters
    ----------
    movie_id : hashable
        Key into the movieId -> imdbId mapping.
    imdb_lookup : dict, optional
        movieId -> imdbId mapping. Defaults to the module-level
        ``movie_to_imdb``.
    client : object exposing ``get_movie(imdb_id)``, optional
        Defaults to the module-level ``cg``.

    Returns
    -------
    tuple
        ``(movie_id, plot)``. ``plot`` is ``"X"`` when the movie has no IMDb
        id, no plot, an empty plot, or when the lookup raised (the error is
        printed, not propagated).
    """
    if imdb_lookup is None:
        imdb_lookup = movie_to_imdb
    if client is None:
        client = cg

    imdb_id = imdb_lookup.get(movie_id)
    # A blank imdbId cell is read as NaN, which is truthy: check it explicitly
    # so it is reported as "no id" instead of being sent to the API.
    if not imdb_id or pd.isna(imdb_id):
        return movie_id, "X"
    try:
        movie = client.get_movie(imdb_id)
        # A present-but-empty plot list means "no plot", not a failure.
        plots = movie.get('plot') or []
        raw_plot = plots[0] if plots else ''
        # Keep only the text before the first "::" separator.
        plot = raw_plot.split("::")[0].strip() or "X"
        return movie_id, plot
    except Exception as e:
        print(f"Failed for movieId={movie_id}: {e}")
        return movie_id, "X"

# Work list: experiment movies whose plot is still the "X" marker.
in_experiment = movies_plot_df['movieId'].isin(exp_movie_ids)
missing_plot = movies_plot_df['plot'].eq("X")
to_update_mask = in_experiment & missing_plot
to_update_indices = movies_plot_df.loc[to_update_mask].index.tolist()

batch_size = 10
total = len(to_update_indices)
print(f"Total to update: {total}")

# Fetch plots batch by batch; the CSV is rewritten after every batch so an
# interrupted run keeps the plots retrieved so far.
for batch_number, batch_start in enumerate(range(0, total, batch_size), start=1):
    batch_indices = to_update_indices[batch_start:batch_start + batch_size]
    print(f"Processing batch {batch_number} with {len(batch_indices)} movies...")

    with ThreadPoolExecutor(max_workers=batch_size) as executor:
        # Remember which DataFrame row each submitted lookup belongs to.
        futures = {}
        for idx in batch_indices:
            future = executor.submit(get_plot, movies_plot_df.at[idx, 'movieId'])
            futures[future] = idx

        for future in as_completed(futures):
            idx = futures[future]
            try:
                movie_id, plot = future.result()
                movies_plot_df.at[idx, 'plot'] = plot
                print(f"Updated movieId={movie_id}, index: {idx} with plot: {plot}")
            except Exception as e:
                print(f"Exception while updating index {idx}: {e}")

    movies_plot_df.to_csv('movies_plot_updated.csv', index=False)
    print(f"Saved progress after batch {batch_number}")

# !!!! BROKEN MOVIES !!!!
# The download step returned no plot for these movies:
#   Updated movieId=720, index: 705 with plot: X
#   Updated movieId=162864, index: 42387 with plot: X
# so their plots are set manually. Rows are selected by movieId rather than by
# the row labels seen in that log: a hard-coded label would silently patch
# whichever movie sits at that position if the CSV row order ever changes.
manual_plots = {
    720: "Wallace & Gromit, The Best of Aardman Animation",
    162864: "The half-hour series aired weekly and featured stand-up comedy specials from some of the top performing comedians.",
}
for manual_movie_id, manual_plot in manual_plots.items():
    movies_plot_df.loc[movies_plot_df['movieId'] == manual_movie_id, 'plot'] = manual_plot
movies_plot_df.to_csv('movies_plot_updated.csv', index=False)


Total to update: 0


# Covers

This section contains the code to enrich the dataset with covers of the movies.

*Note: A portion of the movie covers was provided by previous work:* https://osf.io/chbj9/files/osfstorage/675a1d6d4693a99b3e87baea



In [19]:
# Configuration for the cover download.
exp_movie_ids = set(experiment_loader.movies_df['movieId'])
img_folder = "img"
placeholder = "no_cover.jpg"
links_csv = "links.csv"
TARGET_WIDTH = 200  # width in pixels of every saved cover
BATCH_SIZE = 5      # movies per batch, also the number of worker threads

os.makedirs(img_folder, exist_ok=True)
ia = Cinemagoer()

# movieId -> imdbId mapping; imdbId is read as text and left-padded with zeros
# to 7 characters.
links_df = pd.read_csv(links_csv, dtype={'movieId': int, 'imdbId': str})
links_df['imdbId'] = links_df['imdbId'].str.zfill(7)
movie_to_imdb = dict(zip(links_df['movieId'], links_df['imdbId']))

# Covers are stored as "<movieId>.jpg". Any other .jpg in the folder is
# skipped: int() on a non-numeric stem would raise ValueError and abort the
# whole cell.
existing_covers = set()
for file_name in os.listdir(img_folder):
    if not file_name.endswith(".jpg"):
        continue
    try:
        existing_covers.add(int(os.path.splitext(file_name)[0]))
    except ValueError:
        continue

movies_to_download = [mid for mid in exp_movie_ids if mid not in existing_covers]

print(f"Total covers to download: {len(movies_to_download)}")

def _save_resized_cover(img, img_path):
    """Scale ``img`` to TARGET_WIDTH (keeping the aspect ratio) and save it as RGB."""
    # Convert before resizing: Pillow forces nearest-neighbour resampling for
    # palette ("P") and bilevel ("1") images, so resizing first would skip the
    # LANCZOS filter for them.
    rgb_img = img.convert("RGB")
    w, h = rgb_img.size
    ratio = TARGET_WIDTH / w
    # Extremely wide images must not end up with a zero-pixel height.
    new_height = max(1, int(h * ratio))
    rgb_img.resize((TARGET_WIDTH, new_height), Image.LANCZOS).save(img_path)


def get_and_save_cover(movie_id):
    """Save a cover image for ``movie_id`` as ``<img_folder>/<movie_id>.jpg``.

    The full-size cover URL is requested from the ``ia`` client and the image
    is downloaded, resized to ``TARGET_WIDTH`` and saved. When the movie has no
    cover URL, or the download does not answer with HTTP 200, the resized
    ``placeholder`` image is saved instead. When the lookup or download raises,
    no placeholder is written and the error is reported in the return value.

    Parameters
    ----------
    movie_id : hashable
        Key into the module-level ``movie_to_imdb`` mapping.

    Returns
    -------
    str
        A status message of the form ``"<movie_id>: <outcome>"``. Errors are
        reported through this message rather than raised.
    """
    img_path = os.path.join(img_folder, f"{movie_id}.jpg")
    imdb_id = movie_to_imdb.get(movie_id)

    # A blank imdbId cell is read as NaN, which is truthy; check it explicitly.
    if not imdb_id or pd.isna(imdb_id):
        return f"{movie_id}: No IMDb ID"

    try:
        movie = ia.get_movie(imdb_id)
        cover_url = movie.get("full-size cover url", None)

        if cover_url:
            response = requests.get(cover_url, timeout=10)
            if response.status_code == 200:
                with Image.open(BytesIO(response.content)) as img:
                    _save_resized_cover(img, img_path)
                return f"{movie_id}: Downloaded"
    except Exception as e:
        return f"{movie_id}: Download error - {e}"

    # No cover URL or a non-200 answer: fall back to the placeholder image.
    try:
        # The context manager closes the placeholder file after use.
        with Image.open(placeholder) as img:
            _save_resized_cover(img, img_path)
        return f"{movie_id}: Used placeholder"
    except Exception as e:
        return f"{movie_id}: Placeholder failed - {e}"

# Download the missing covers in small concurrent batches and print one
# status line per movie.
total_batches = (len(movies_to_download) - 1) // BATCH_SIZE + 1
for batch_number, batch_start in enumerate(range(0, len(movies_to_download), BATCH_SIZE), start=1):
    batch = movies_to_download[batch_start:batch_start + BATCH_SIZE]
    print(f"\nProcessing batch {batch_number}/{total_batches}")

    with ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
        futures = {}
        for mid in batch:
            futures[executor.submit(get_and_save_cover, mid)] = mid
        for future in as_completed(futures):
            print(future.result())

Total covers to download: 0
