In [1]:
import os
import sys
import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Resolve the project root, then the data directory beneath it.
# The layout assumed here (per the original setup) is <root>/src/analysis/<this file>.
try:
    # Script execution: strip the file name, then climb two more folders.
    _script_path = os.path.abspath(__file__)
    _script_dir = os.path.dirname(_script_path)
    BASE_DIR = os.path.dirname(os.path.dirname(_script_dir))
except NameError:
    # Notebook execution: `__file__` is undefined, so climb two levels
    # from the current working directory instead.
    _two_levels_up = os.path.join(os.getcwd(), '..', '..')
    BASE_DIR = os.path.abspath(_two_levels_up)

DATA_DIR = os.path.join(BASE_DIR, 'data')
print(f"Data directory: {DATA_DIR}")

  from .autonotebook import tqdm as notebook_tqdm


Data directory: /home/nab/Niklas/GroupDataLiteracy/data


In [2]:
# Make the project's `src` folder importable. The membership check keeps
# re-running this cell from appending the same entry again.
src_path = os.path.join(BASE_DIR, 'src')
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

# Must come after the sys.path tweak above, otherwise `data_utils` is not found.
from data_utils import load_movie_data

# Load the movie table from the data directory and report its size and schema.
df = load_movie_data(DATA_DIR)
print("Dataframe length: ", df.shape[0])
print("Dataframe columns: ", df.columns)

# Count how many rows share each distinct plot text.
# One row per unique plot, with the number of movies using it in `count`.
plot_duplicates = df.groupby('plot').size().rename('count').reset_index()

# A plot is a duplicate when at least two rows carry it.
duplicate_plots = plot_duplicates.loc[plot_duplicates['count'] > 1]

# The second figure sums every row that shares a plot, first occurrences included.
print(f"Number of unique plots with duplicates: {len(duplicate_plots)}")
print(f"Total number of duplicated entries: {duplicate_plots['count'].sum()}")

Dataframe length:  141119
Dataframe columns:  Index(['movie_id', 'country', 'imdb_id', 'duration', 'duration_all',
       'actors_id', 'actors', 'directors_id', 'directors', 'genre_id', 'genre',
       'release_date', 'wikidata_class', 'wikipedia_link', 'title', 'summary',
       'set_in_period', 'awards', 'budget', 'budget_currency', 'box_office',
       'box_office_currency', 'box_office_worldwide',
       'box_office_worldwide_currency', 'popularity', 'vote_average',
       'vote_count', 'tmdb_id', 'plot', 'year'],
      dtype='object')
Number of unique plots with duplicates: 358
Total number of duplicated entries: 1000


In [5]:
# Identify the top movies by the length of the plot, then count tokens only for the first 100
from transformers import AutoTokenizer

# Fetch the tokenizer for the BGE-M3 checkpoint (only needs to run once per session).
model_name = "BAAI/bge-m3"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
except Exception as load_error:
    # Print a hint for the reader, then let the original exception propagate
    # so the cell still fails visibly.
    print(f"Error loading tokenizer. Ensure you have internet access. {load_error}")
    raise

# Define token counting function
def count_tokens(text):
    """Return the number of tokens the module-level ``tokenizer`` yields for ``text``.

    Special tokens are excluded (``add_special_tokens=False``), so the result
    reflects the text content only.

    Args:
        text: Value to tokenize. Anything that is not a ``str`` (for example
            ``None`` or a NaN float from a missing plot) counts as 0 tokens.

    Returns:
        int: Length of the tokenizer's ``input_ids`` for ``text``. Returns 0 for
        non-string input, and also 0 if tokenization raises; in that case the
        error is printed rather than propagated.
    """
    if not isinstance(text, str):
        return 0
    # `text` is guaranteed to be a str from here on, so no extra cast is needed.
    try:
        return len(tokenizer(text, add_special_tokens=False)["input_ids"])
    except Exception as err:
        # Best-effort: report the failure and count 0 so a single bad plot
        # does not abort a whole column-wise apply.
        print(f"Tokenization error for text: {text[:30]}... ({err})")
        return 0

# Rank every movie by the character length of its plot, longest first
# (descending order).
data_sorted_by_length_plot = (
    df
    .assign(plot_length=lambda frame: frame['plot'].str.len())
    .sort_values('plot_length', ascending=False)
)

# Tokenize only the 100 longest plots; `.copy()` gives an independent frame
# so adding the token-count column does not touch the sorted frame.
top100 = data_sorted_by_length_plot.iloc[:100].copy()
top100['plot_token_count'] = top100['plot'].apply(count_tokens)

# Preview only the first few of the longest plots; the full ranking stays in `top100`.
N_PREVIEW_ROWS = 6

# `index` is the row's label in the dataframe; `counter` is its 0-based rank by plot length.
for counter, (index, row) in enumerate(top100.head(N_PREVIEW_ROWS).iterrows()):
    print(
        f"{index} {counter} {row['title']}, Length: {row['plot_length']}, Tokens: {row['plot_token_count']}, Year: {row['year']}"
    )
    print("\n")


99701 0 Space Battleship Yamato: Resurrection, Length: 19672, Tokens: 5021, Year: 2009


127347 1 The Delivery Boy, Length: 17879, Tokens: 4566, Year: 2018


134018 2 Detective Conan: The Scarlet Bullet, Length: 17581, Tokens: 4360, Year: 2021


68672 3 The Birds II: Land's End, Length: 16728, Tokens: 4124, Year: 1994


61690 4 La Révolution française, Length: 16696, Tokens: 4151, Year: 1989


59343 5 Sworn Brothers, Length: 16613, Tokens: 4090, Year: 1987




In [4]:
# Keep only movies that have a non-empty plot and a genre.
# `.copy()` detaches the result from the unfiltered frame.
df = df.loc[
    lambda frame: frame['plot'].notna()
    & (frame['plot'].str.len() > 0)
    & frame['genre'].notna()
].copy()
print(f"Movies with plot data and genre: {len(df)}")

Movies with plot data and genre: 118491
