In [1]:
import os
import sys
import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Resolve the project root, then the data directory beneath it.
# As a script, this file is expected to live in src/analysis, so the root is
# three directory levels above the file itself.
try:
    script_path = os.path.abspath(__file__)
    analysis_dir = os.path.dirname(script_path)
    src_dir = os.path.dirname(analysis_dir)
    BASE_DIR = os.path.dirname(src_dir)
except NameError:
    # __file__ is undefined inside a notebook kernel: assume the working
    # directory is src/analysis and climb two levels from there instead.
    BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Kept as plain strings (not Path objects) because they are joined with os.path.
DATA_DIR = os.path.join(BASE_DIR, 'data')
print(f"Data directory: {DATA_DIR}")

  from .autonotebook import tqdm as notebook_tqdm


Data directory: /home/nab/GroupDataLiteracy/data


In [2]:
# Make the project's src/ directory importable so that data_utils resolves.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
src_path = os.path.join(BASE_DIR, 'src')
already_importable = src_path in sys.path
if not already_importable:
    sys.path.append(src_path)

# Must come after the sys.path tweak above, otherwise the module is not found.
from data_utils import load_movie_data

# Load the movie table from DATA_DIR and report its size and columns.
df = load_movie_data(DATA_DIR)
row_count = len(df)
print("Dataframe length: ", row_count)
print("Dataframe columns: ", df.columns)

# Count how many rows share each distinct plot text.
# One output row per unique plot, with its number of occurrences in 'count'.
# (groupby drops missing keys by default, so rows without a plot are ignored.)
plot_duplicates = (
    df.groupby('plot')
    .size()
    .reset_index(name='count')
)

# A plot is a duplicate when two or more rows carry exactly the same text.
appears_more_than_once = plot_duplicates['count'] > 1
duplicate_plots = plot_duplicates[appears_more_than_once]

# The total below sums the occurrence counts, so it includes the first
# occurrence of every duplicated plot as well as the repeats.
n_duplicated_plots = len(duplicate_plots)
n_duplicated_rows = duplicate_plots['count'].sum()
print(f"Number of unique plots with duplicates: {n_duplicated_plots}")
print(f"Total number of duplicated entries: {n_duplicated_rows}")

Dataframe length:  214682
Dataframe columns:  Index(['movie_id', 'title', 'summary', 'release_date', 'genre', 'director',
       'actors', 'duration', 'imdb_id', 'country', 'sitelinks',
       'wikipedia_link', 'budget', 'box_office', 'awards', 'set_in_period',
       'year', 'popularity', 'vote_average', 'vote_count', 'tmdb_id', 'plot'],
      dtype='object')
Number of unique plots with duplicates: 15621
Total number of duplicated entries: 38182


In [10]:
# This cell inspects the movies with the SHORTEST plots (ascending sort by
# character length) and counts tokens only for the first 100 of them.
from transformers import AutoTokenizer

# BGE-M3 tokenizer: load it once per session and reuse the same object.
model_name = "BAAI/bge-m3"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
except Exception as load_error:
    # Print a hint for the reader, then re-raise so the cell still fails loudly.
    print(f"Error loading tokenizer. Ensure you have internet access. {load_error}")
    raise

# Token counting helper
def count_tokens(text):
    """Return the number of token ids the module-level ``tokenizer`` yields for ``text``.

    The tokenizer is called with ``add_special_tokens=False``.

    Args:
        text: Value to tokenize. Anything that is not a ``str`` (e.g. ``None``
            or a NaN float from a missing cell) counts as zero tokens.

    Returns:
        int: Length of the ``"input_ids"`` list, or ``0`` for non-string input
        or when tokenization raises.
    """
    if isinstance(text, str):
        plot_text = str(text)
        try:
            encoding = tokenizer(plot_text, add_special_tokens=False)
            token_ids = encoding["input_ids"]
            return len(token_ids)
        except Exception as err:
            # Best effort: report the failing text's prefix and count it as 0
            # rather than aborting a whole DataFrame.apply run.
            print(f"Tokenization error for text: {plot_text[:30]}... ({err})")
            return 0
    return 0

# How many of the shortest plots to tokenize, and how many of those to print.
N_SHORTEST_PLOTS = 100
N_PREVIEW_ROWS = 6

# Rank every movie by the character length of its plot, shortest first.
data_sorted_by_length_plot = df.assign(
    plot_length=df['plot'].str.len()
).sort_values(by='plot_length', ascending=True)

# NOTE: despite its name, `top100` holds the N_SHORTEST_PLOTS *shortest* plots
# (the sort is ascending). The name is kept in case other cells reference it.
top100 = data_sorted_by_length_plot.head(N_SHORTEST_PLOTS).copy()
# Tokenize only this small subset rather than the full dataframe.
top100['plot_token_count'] = top100['plot'].apply(count_tokens)

# Preview the first few rows: dataframe index, running position, then details.
# enumerate + head replace the previous manual counter / break bookkeeping.
for counter, (index, row) in enumerate(top100.head(N_PREVIEW_ROWS).iterrows()):
    print(
        f"{index} {counter} {row['title']}, Length: {row['plot_length']}, Tokens: {row['plot_token_count']}, Year: {row['year']}"
    )
    print("\n")


100573 0 The Unspoken, Length: 10, Tokens: 6, Year: 1999


173627 1 The Unspoken, Length: 10, Tokens: 6, Year: 2015


153113 2 Die Kirche bleibt im Dorf, Length: 11, Tokens: 6, Year: 2012


124340 3 BZ, Length: 16, Tokens: 6, Year: 2006


60417 4 Crazy Day of Engineer Barkasov, Length: 16, Tokens: 6, Year: 1983


155132 5 Silent, Length: 16, Tokens: 5, Year: 2012




In [5]:
# Keep only movies that have a non-empty plot and a genre.
# Note: this rebinds `df`, so the unfiltered frame is no longer available
# under that name after this cell has run.
has_plot = df['plot'].notna() & (df['plot'].str.len() > 0)
has_genre = df['genre'].notna()
df = df[has_plot & has_genre].copy()
print(f"Movies with plot data and genre: {len(df)}")

Movies with plot data and genre: 134024
