In [1]:
import pandas as pd

file_path = "database.csv"


def _read_csv_with_fallback(path, fallback_encodings=("latin1", "cp1252")):
    """Read a CSV, retrying with fallback encodings when decoding fails.

    The first attempt uses pandas' default encoding. Each entry of
    ``fallback_encodings`` is then tried in order, and the first successful
    read is returned.

    Only ``UnicodeDecodeError`` triggers a retry: other failures (missing
    file, malformed CSV) would fail identically under every encoding, so they
    propagate immediately instead of being masked.

    Raises:
        ValueError: if no attempted encoding could decode the file.
    """
    try:
        return pd.read_csv(path)
    except UnicodeDecodeError as err:
        print(f"Failed with default encoding: {err}")
        last_error = err

    # NOTE(review): latin1 maps every possible byte, so in practice it never
    # raises a decode error and the cp1252 attempt is effectively unreachable.
    # Consider trying cp1252 first if the file was produced on Windows.
    for encoding in fallback_encodings:
        try:
            frame = pd.read_csv(path, encoding=encoding)
        except UnicodeDecodeError as err:
            print(f"Failed with {encoding}: {err}")
            last_error = err
        else:
            print(f"Successfully read with {encoding} encoding!")
            return frame

    print("Could not determine the correct encoding. You might need to investigate the file's origin.")
    # Fail here rather than leaving `names_db` undefined for later cells.
    raise ValueError(f"Unable to decode '{path}' with any attempted encoding.") from last_error


names_db = _read_csv_with_fallback(file_path)


In [2]:
# Preview the first ten rows to sanity-check that the file loaded and parsed as expected.
names_db.head(10)

Unnamed: 0,article_id,full_name,sex,birth_year,city,state,country,continent,latitude,longitude,occupation,industry,domain,article_languages,page_views,average_views,historical_popularity_index
0,308,Aristotle,Male,-384,Stageira,,Greece,Europe,40.33333,23.5,Philosopher,Philosophy,Humanities,152,56355172,370758,31.9938
1,22954,Plato,Male,-427,Athens,,Greece,Europe,37.96667,23.71667,Philosopher,Philosophy,Humanities,142,46812003,329662,31.9888
2,1095706,Jesus Christ,Male,-4,Judea,,Israel,Asia,32.5,34.9,Religious Figure,Religion,Institutions,214,60299092,281771,31.8981
3,25664190,Socrates,Male,-469,Athens,,Greece,Europe,37.96667,23.71667,Philosopher,Philosophy,Humanities,137,40307143,294213,31.6521
4,783,Alexander the Great,Male,-356,Pella,,Greece,Europe,40.8,22.51667,Military Personnel,Military,Institutions,138,48358148,350421,31.584
5,18079,Leonardo da Vinci,Male,1452,Vinci,,Italy,Europe,43.78333,10.91667,Inventor,Invention,Science & Technology,174,88931135,511098,31.4644
6,5823,Confucius,Male,-551,Qufu,,China,Asia,,,Philosopher,Philosophy,Humanities,192,22363652,116477,31.3705
7,15924,Julius Caesar,Male,-100,Rome,,Italy,Europe,41.9,12.5,Politician,Government,Institutions,128,43088745,336631,31.1161
8,13633,Homer,Male,-800,Smyrna,,Turkey,Europe,38.41861,27.13917,Writer,Language,Humanities,141,20839405,147797,31.1087
9,23275,Pythagoras,Male,-570,Samos Island,,Greece,Europe,37.75,26.83333,Philosopher,Philosophy,Humanities,114,26168219,229546,31.0691


In [3]:
def get_filtered_celebrity_names(
    df: pd.DataFrame,
    birth_year_threshold: int,
    name_column: str = 'full_name',
    birth_column: str = 'birth_year',
    occupation_column: str = 'occupation',
    target_occupations: list = None
) -> list:
    """Return the names of people born after a given year with a matching occupation.

    Args:
        df: Source table; must contain the three columns named below.
        birth_year_threshold: Only rows whose birth year is strictly greater
            than this value are kept.
        name_column: Column holding the names to return.
        birth_column: Column holding the birth year. Values are coerced to
            numbers; anything that cannot be parsed becomes NaN and is
            therefore excluded by the comparison.
        occupation_column: Column holding the occupation label.
        target_occupations: Occupations to keep. Defaults to ``["Actor"]``.
            A single string is accepted and treated as a one-element list.

    Returns:
        The matching values of ``name_column`` in their original row order.
        An empty list is returned (after printing an error message) when
        ``df`` is not a DataFrame or a required column is missing.
    """
    if not isinstance(df, pd.DataFrame):
        print("Error: The first argument must be a pandas DataFrame.")
        return []

    # Checked in the same order as before so the first reported problem is unchanged.
    required_columns = (
        ("Birth", birth_column),
        ("Name", name_column),
        ("Occupation", occupation_column),
    )
    for label, column in required_columns:
        if column not in df.columns:
            print(f"Error: {label} column '{column}' not found in DataFrame.")
            return []

    if target_occupations is None:
        target_occupations = ["Actor"]
    elif isinstance(target_occupations, str):
        # Series.isin rejects a bare string; treat it as a single occupation.
        target_occupations = [target_occupations]

    # Coerce into a separate Series: the caller's frame is never modified,
    # so no defensive copy of the whole DataFrame is needed.
    birth_years = pd.to_numeric(df[birth_column], errors='coerce')

    born_after_threshold = birth_years > birth_year_threshold
    has_target_occupation = df[occupation_column].isin(target_occupations)

    return df.loc[born_after_threshold & has_target_occupation, name_column].tolist()


In [15]:

# Only people born strictly after this year are kept.
year_cutoff = 1900

# Get the names using the default target occupations.
# `celebrity_names` is reused by later cells.
celebrity_names = get_filtered_celebrity_names(names_db, year_cutoff)

print(f"Celebs born after {year_cutoff} with specified occupations:")
if celebrity_names:
    for name in celebrity_names:
        print(name)
else:
    print("No celebrities found matching the criteria.")

# Example: using a custom list of target occupations.
# As written this list matches the function's default, so both queries
# return the same names; extend it to broaden the selection.
custom_occupations = ["Actor"]
celebrity_names_custom = get_filtered_celebrity_names(
    names_db,
    year_cutoff,
    target_occupations=custom_occupations
)
# The header is built from the list itself so it cannot drift from the filter.
print(f"\nCelebs born after {year_cutoff} whose occupation is one of: {', '.join(custom_occupations)}")
if celebrity_names_custom:
    for name in celebrity_names_custom:
        print(name)
else:
    print("No celebrities found matching the custom criteria.")

Celebs born after 1900 with specified occupations:
Marilyn Monroe
Bruce Lee
Marlon Brando
Al Pacino
Marlene Dietrich
Audrey Hepburn
Clint Eastwood
Robert De Niro
Jack Nicholson
Arnold Schwarzenegger
Chuck Norris
Brigitte Bardot
Sylvester Stallone
James Dean
Sean Connery
Sophia Loren
Anthony Hopkins
John Wayne
Orson Welles
Greta Garbo
Dustin Hoffman
Harrison Ford
Ingrid Bergman
Alain Delon
Louis de Funès
Bud Spencer
Jackie Chan
Katharine Hepburn
Grace Kelly
Charles Bronson
Vivien Leigh
Terence Hill
Henry Fonda
Judy Garland
Gregory Peck
Marcello Mastroianni
Jean Reno
Morgan Freeman
Catherine Deneuve
Steve McQueen
Robert Redford
Michael Douglas
Romy Schneider
Kirk Douglas
Cary Grant
Clark Gable
Bette Davis
Anthony Quinn
Jon Voight
Michael Caine
Rita Hayworth
Toshiro Mifune
Christopher Lee
Meryl Streep
Bruce Willis
Johnny Depp
Steven Seagal
Robin Williams
Gary Cooper
Paul Newman
Roger Moore
Gene Hackman
Claudia Cardinale
Omar Sharif
Ava Gardner
Laurence Olivier
Richard Gere
Brad Pitt
Yul B

In [16]:
# Number of names returned by the default-occupation query.
len(celebrity_names)

1152

In [18]:
# Optional: use this to split the collection phase across several computers.
import os 

def distribute_names_to_csvs(
    celebrity_names_list: list,
    names_per_file: int,
    num_files_to_create: int,
    output_filename_prefix: str = "celebrity_names_split"
):
    """Write consecutive slices of a name list to numbered CSV files.

    File ``i`` (1-based) is named ``{output_filename_prefix}_{i}.csv`` and
    holds a single ``full_name`` column with up to ``names_per_file`` names,
    taken in order from the input list. Exactly ``num_files_to_create`` files
    are attempted; once the names run out, the remaining files are written
    with only the header row. Names beyond the total capacity are not written.

    Progress and a final summary are printed. A failure to write one file is
    reported and the remaining files are still attempted.

    Args:
        celebrity_names_list: Names to distribute; must be a ``list``. It is
            only sliced, never modified.
        names_per_file: Maximum number of names per file; a positive ``int``
            (``bool`` values are rejected).
        num_files_to_create: Number of files to write; a positive ``int``
            (``bool`` values are rejected).
        output_filename_prefix: Path prefix for the generated files.

    Returns:
        None. Invalid arguments are reported with a printed error and the
        function returns without writing anything.
    """
    if not isinstance(celebrity_names_list, list):
        print("Error: 'celebrity_names_list' must be a list.")
        return

    # bool is a subclass of int, so True would otherwise be accepted as 1.
    if isinstance(names_per_file, bool) or not isinstance(names_per_file, int) or names_per_file <= 0:
        print("Error: 'names_per_file' must be a positive integer.")
        return
    if isinstance(num_files_to_create, bool) or not isinstance(num_files_to_create, int) or num_files_to_create <= 0:
        print("Error: 'num_files_to_create' (div) must be a positive integer.")
        return

    total_names_available = len(celebrity_names_list)
    names_written = 0
    files_created = 0

    print(f"Attempting to create {num_files_to_create} CSV files with up to {names_per_file} names each.")
    # The list is not deduplicated here, so report a plain count.
    print(f"Total names available in input list: {total_names_available}\n")

    for file_number in range(1, num_files_to_create + 1):
        start = (file_number - 1) * names_per_file
        # Slicing past the end of the list yields a short or empty chunk.
        names_for_this_file = celebrity_names_list[start:start + names_per_file]

        df_chunk = pd.DataFrame(names_for_this_file, columns=['full_name'])
        output_filename = f"{output_filename_prefix}_{file_number}.csv"

        try:
            df_chunk.to_csv(output_filename, index=False)
        except Exception as e:
            # Best effort: report the failure and keep going with the next file.
            print(f"Error writing to CSV file '{output_filename}': {e}")
            print("Skipping this file and continuing to the next, if any.")
            continue

        if names_for_this_file:
            print(f"Successfully created '{output_filename}' with {len(names_for_this_file)} names.")
        else:
            print(f"Successfully created empty file '{output_filename}' as no (more) names were available for this part.")
        names_written += len(names_for_this_file)
        files_created += 1

    names_missing = total_names_available - names_written

    print("\n--- Distribution Summary ---")
    print(f"Requested to create: {num_files_to_create} files.")
    print(f"Actually created: {files_created} files (could be less if errors occurred during saving).")
    print(f"Total names written across all files: {names_written}.")
    if names_missing == 0:
        print("All names from the input list have been distributed among the created files.")
    elif files_created == num_files_to_create:
        print(f"Note: {names_missing} names from the original list were not written "
              f"because the capacity of {num_files_to_create} files * {names_per_file} names/file was filled or exceeded by available names.")
    else:
        # At least one file failed to save, so some names may have been lost with it.
        print(f"Note: {names_missing} names from the original list were not written; "
              f"{num_files_to_create - files_created} file(s) could not be saved (see errors above).")


# Scenario 1: four files of up to 300 names each (capacity 1200). With the
# 1152 names counted earlier, every name is written and the last file is
# only partly filled.
print("--- Scenario 1 ---")
distribute_names_to_csvs(
    celebrity_names_list=celebrity_names,
    names_per_file=300,
    num_files_to_create=4,
    output_filename_prefix="celebs_batch_A"
)


--- Scenario 1 ---
Attempting to create 4 CSV files with up to 300 names each.
Total unique names available in input list: 1152

Successfully created 'celebs_batch_A_1.csv' with 300 names.
Successfully created 'celebs_batch_A_2.csv' with 300 names.
Successfully created 'celebs_batch_A_3.csv' with 300 names.
Successfully created 'celebs_batch_A_4.csv' with 252 names.

--- Distribution Summary ---
Requested to create: 4 files.
Actually created: 4 files (could be less if errors occurred during saving).
Total names written across all files: 1152.
All names from the input list have been distributed among the created files.
