In [5]:
import os
import sys
import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Locate the project root, then the data directory beneath it.
# When run as a script, this file is expected to live in src/analysis, so the
# root is three levels above the file itself.
try:
    module_path = os.path.abspath(__file__)
except NameError:
    # Notebooks have no __file__: fall back to two levels above the current
    # working directory instead.
    BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
else:
    analysis_dir = os.path.dirname(module_path)
    src_dir = os.path.dirname(analysis_dir)
    BASE_DIR = os.path.dirname(src_dir)

DATA_DIR = os.path.join(BASE_DIR, 'data')
print(f"Data directory: {DATA_DIR}")

Data directory: /home/nab/GroupDataLiteracy/data


In [9]:
# Make the modules under BASE_DIR/src importable; the path is appended only
# once so re-running this cell does not grow sys.path.
src_path = os.path.join(BASE_DIR, 'src')
if not any(entry == src_path for entry in sys.path):
    sys.path.append(src_path)

# This import must come after the sys.path update above.
from data_utils import load_movie_data

# Load the movie dataframe and report its size and column names.
df = load_movie_data(DATA_DIR)
n_rows, column_index = len(df), df.columns
print("Dataframe length: ", n_rows)
print("Dataframe columns: ", column_index)

Dataframe length:  214682
Dataframe columns:  Index(['movie_id', 'title', 'summary', 'release_date', 'genre', 'director',
       'actors', 'duration', 'imdb_id', 'country', 'sitelinks',
       'wikipedia_link', 'budget', 'box_office', 'awards', 'set_in_period',
       'year', 'popularity', 'vote_average', 'vote_count', 'tmdb_id', 'plot'],
      dtype='object')


In [43]:
# Rank every movie by the character length of its 'plot' string, longest first.
# `.str.len()` yields NaN for missing plots, and `sort_values` places NaN last
# by default, so movies without a plot end up at the bottom.
data_sorted_by_length_plot = (
    df.assign(plot_length=df['plot'].str.len())
      .sort_values(by='plot_length', ascending=False)
)

# How many of the longest plots to list (ranks 0 through 5).
N_LONGEST_PLOTS = 6

# Print: dataframe index, rank, title, plot length in characters, year.
for counter, (index, row) in enumerate(data_sorted_by_length_plot.head(N_LONGEST_PLOTS).iterrows()):
    print(index, counter, row['title'], row['plot_length'], row['year'])
    print("\n")


213577 0 Strange Frequencies 22552.0 2024


110540 1 Battle Royale II: Requiem 20479.0 2003


137534 2 Space Battleship Yamato: Resurrection 19672.0 2009


188366 3 The Delivery Boy 17879.0 2018


201509 4 Detective Conan: The Scarlet Bullet 17545.0 2021


86260 5 The Birds II: Land's End 16728.0 1994




In [49]:
from transformers import AutoTokenizer

# Load the tokenizer for the BGE-M3 checkpoint named below. The error hint in
# the except branch suggests the files may have to be fetched over the network.
model_name = "BAAI/bge-m3"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
except Exception as e:
    # Print a hint for the reader, then re-raise so the cell still fails
    # visibly instead of continuing without a tokenizer.
    print(f"Error loading tokenizer. Ensure you have internet access. {e}")
    raise

# Number of plots to process: how many rows from the top of the
# length-sorted dataframe are tokenized later in this cell.
n = 100  # You can modify this variable as needed

# Define robust token counting function
def count_tokens(text, tok=None):
    """Return the number of tokens in ``text``, without special tokens.

    Args:
        text: The value to tokenize. Anything that is not a ``str``
            (e.g. ``None`` or a NaN float from a missing plot) counts as 0.
        tok: Optional tokenizer callable. It is called as
            ``tok(text, add_special_tokens=False)`` and must return a mapping
            with an ``"input_ids"`` sequence. When omitted, the notebook-level
            ``tokenizer`` is used.

    Returns:
        The length of the ``"input_ids"`` sequence, or 0 when ``text`` is not
        a string or tokenization raises.
    """
    if not isinstance(text, str):
        return 0
    try:
        # Resolve the tokenizer inside the try block so that a missing global
        # is reported like any other tokenization failure.
        active_tokenizer = tokenizer if tok is None else tok
        return len(active_tokenizer(text, add_special_tokens=False)["input_ids"])
    except Exception as err:
        # Best effort: report the failing text (truncated) and count it as 0
        # so one bad row does not abort a whole `.apply` pass.
        print(f"Tokenization error for text: {text[:30]}... ({err})")
        return 0

# Tokenize only the first n rows of the dataframe already sorted by plot
# length; tokenizing every plot is not needed to find the longest ones.
plots_to_process = data_sorted_by_length_plot.head(n).copy()
plots_to_process['plot_token_count'] = plots_to_process['plot'].apply(count_tokens)

# Note: the counts live only on this subset. If they are needed on the full
# dataframe, they can be written back with
# `df.loc[plots_to_process.index, 'plot_token_count']`.

# How many movies (within these n) to list, ranked by token count.
N_TOP_BY_TOKENS = 30

top_count_plots = (
    plots_to_process
    .sort_values(by='plot_token_count', ascending=False)
    .head(N_TOP_BY_TOKENS)
)

# Print: dataframe index, title, token count, year, plot length in characters.
for index, row in top_count_plots.iterrows():
    print(index, row['title'], row['plot_token_count'], row['year'], row['plot_length'])
    print("\n")


213577 Strange Frequencies 5701 2024 22552.0


110540 Battle Royale II: Requiem 5296 2003 20479.0


137534 Space Battleship Yamato: Resurrection 5021 2009 19672.0


188366 The Delivery Boy 4566 2018 17879.0


201509 Detective Conan: The Scarlet Bullet 4351 2021 17545.0


74293 La Révolution française 4149 1989 16691.0


86260 The Birds II: Land's End 4124 1994 16728.0


69861 Sworn Brothers 4090 1987 16613.0


205188 The Battle at Lake Changjin II 4043 2022 15213.0


136417 Summer's Blood 3858 2009 14859.0


136383 Barbarossa 3754 2009 14878.0


173969 Veli Jože 3754 2015 14628.0


202535 The Whole Truth 3735 2021 14226.0


133824 The Crew 3730 2008 14259.0


118933 Locusts 3713 2005 15053.0


52676 Mr. Horn 3631 1979 14054.0


163613 Punjab 1984 3628 2014 14385.0


38680 Arabian Nights 3601 1974 13999.0


40998 Arabian Nights 3601 1975 13999.0


66938 Arabian Nights 3601 1986 13999.0


52796 Arabian Nights 3601 1980 13999.0


83601 Les Visiteurs 3599 1993 13578.0


84329 Kid Cop 3580 