In [1]:
import pandas as pd
import re

collect all raw files into one list

In [2]:
# Source files, one per museum. The position of each path fixes the position
# of its DataFrame in raw_data, which later cells rely on (raw_data[1],
# transformed_data[0], and the zip with dataset_names).
data_paths = [
    '/Users/antoninalightfoot/Library/Mobile Documents/com~apple~CloudDocs/Documents/TLU/Data analysis/data/Met/MetObjects.csv',
    '/Users/antoninalightfoot/Library/Mobile Documents/com~apple~CloudDocs/Documents/TLU/Data analysis/data/Reina Sofia/raw_data/reina_sofia3.csv',
    '/Users/antoninalightfoot/Library/Mobile Documents/com~apple~CloudDocs/Documents/TLU/Data analysis/data/Tate/artwork_data.csv',
]

raw_data = []
for path in data_paths:
    if '.json' in path:
        data = pd.read_json(path)
    elif '.csv' in path:
        # Malformed rows are dropped rather than aborting the whole load.
        data = pd.read_csv(path, on_bad_lines='skip', low_memory=False)
    else:
        # Fail loudly: falling through to the append below would either hit an
        # undefined `data` (first file) or append the previous file's frame a
        # second time, and skipping would shift later datasets out of position.
        raise ValueError("cannot load " + path)
    raw_data.append(data)

create functions for cleaning fields.
artist name, artwork title, artwork medium, and acquisition source, acquisition year, creation year, artist nationality, artist gender

In [3]:
def create_artist_name(df):
    """Add a standardised 'Artist' column copied from the dataset's own artist column.

    The candidate column names are tried in order and the first one present
    in ``df`` is copied into 'Artist'. When none of them is present, an
    all-missing 'Artist' column is created (unless the frame already has one)
    so that selecting 'Artist' afterwards does not raise ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to update. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    # Known spellings of the artist column across the source datasets.
    possible_columns = ['artist', 'author', 'artist_name', 'Artist Display Name', 'artist name']

    for col in possible_columns:
        if col in df.columns:
            df['Artist'] = df[col]
            break  # first match wins
    else:
        # No candidate matched: keep an existing 'Artist' column untouched,
        # otherwise create a placeholder so the output schema is stable.
        if 'Artist' not in df.columns:
            df['Artist'] = pd.NA

    return df

In [4]:
def create_artwork_title(df):
    """Add a standardised 'Title' column copied from the dataset's own title column.

    The candidate column names are tried in order and the first one present
    in ``df`` is copied into 'Title'. 'url' is only used as a last resort, so
    a frame that carries both a real title column and a 'url' column gets the
    title rather than the link. When no candidate is present, an all-missing
    'Title' column is created so that selecting 'Title' afterwards does not
    raise ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to update. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    # Known spellings of the artwork title column; 'url' is the fallback of
    # last resort and therefore comes after every genuine title column.
    possible_columns = ['object_title', 'artwork_name', 'Title', 'name', 'title', 'url']

    for col in possible_columns:
        if col in df.columns:
            df['Title'] = df[col]
            break  # first match wins
    else:
        # Nothing matched (and so there is no 'Title' column either, since
        # 'Title' is itself a candidate): create a placeholder column.
        df['Title'] = pd.NA

    return df

In [5]:
# Display the column labels of the second loaded dataset (index 1 in raw_data,
# i.e. the Reina Sofia file going by the order of data_paths).
raw_data[1].columns

Index(['Unnamed: 0', 'name', 'author', 'born-death-raw', 'author_born_year',
       'author_death_year', 'nationality_artist', 'year_production',
       'year_adquisition', 'type_artwork', 'register_artwork',
       'medium_description_artwork', 'dimensions_artwork',
       'observations_artwork', 'url_artwork', 'image_artwork'],
      dtype='object')

In [6]:
def create_artwork_medium(df):
    """Add a standardised 'Medium' column copied from the dataset's own medium column.

    The candidate column names are tried in order and the first one present
    in ``df`` is copied into 'Medium'. When no candidate is present, an
    all-missing 'Medium' column is created so that code reading 'Medium'
    afterwards does not raise ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to update. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    # Known spellings of the medium / artwork-type column.
    possible_columns = ['object_type', 'medium', 'type_artwork', 'artwork_medium', 'Medium']

    for col in possible_columns:
        if col in df.columns:
            df['Medium'] = df[col]
            break  # first match wins
    else:
        # Nothing matched (and so there is no 'Medium' column either, since
        # 'Medium' is itself a candidate): create a placeholder column.
        df['Medium'] = pd.NA

    return df

In [7]:
def extract_numbers(value):
    """Return the digits contained in ``value`` as a string.

    Parameters
    ----------
    value : object
        A single cell value.

    Returns
    -------
    str
        * for a string: the string with every non-digit character removed
          (all digits are concatenated, e.g. ``"12 May 1999"`` -> ``"121999"``);
        * for an int or float: the integer part rendered as text
          (``1999.0`` -> ``"1999"``);
        * ``""`` for anything else, including ``None``, NaN, infinities and
          list-like values.
    """
    if isinstance(value, str):
        return re.sub("[^0-9]", "", value)  # keep digits only
    if isinstance(value, (int, float)):
        try:
            return str(int(value))
        except (OverflowError, ValueError):
            # int() rejects NaN (ValueError) and +/-inf (OverflowError);
            # neither carries any digits.
            return ""
    # None, pd.NA, NaT, lists, arrays, ... have no usable digits.
    return ""

def clean_acquisition_year(df):
    """Add a 'Year_acquisition' column holding the digits of the acquisition field.

    'Year_acquisition' is first set to ``pd.NA`` for every row. The candidate
    column names are then tried in order; the first one present in ``df`` is
    passed value by value through ``extract_numbers`` and the result replaces
    the placeholder. ``df`` is modified in place and also returned.
    """
    candidate_columns = (
        'acquisition_date',
        'credit_line',
        'AccessionYear',
        'artwork_acquisition',
        'year_adquisition',
        'acquisitionYear',
    )

    # Placeholder, kept as-is when none of the candidates exists.
    df['Year_acquisition'] = pd.NA

    source_column = next(
        (name for name in candidate_columns if name in df.columns),
        None,
    )
    if source_column is not None:
        df['Year_acquisition'] = df[source_column].apply(extract_numbers)

    return df

apply the functions to the raw datasets and create a new list with the clean datasets

In [8]:
# function that would categorize Medium column into less granular groups

In [9]:
def _match_medium_group(medium, tag_groups, group_names):
    """Return the name of the first tag group with a keyword found in ``medium``, else ``pd.NA``."""
    if not isinstance(medium, str):
        return pd.NA  # NaN and other non-text values stay unclassified
    lowered = medium.lower()
    for name, tags in zip(group_names, tag_groups):
        if any(tag in lowered for tag in tags):
            return name
    return pd.NA


def classify_medium(df):
    """Add a 'Medium_classified' column grouping 'Medium' into coarse categories.

    Each 'Medium' value is lower-cased and searched for the keywords of every
    category, in the order the categories are listed below. Matching is by
    plain substring, and the first category with a hit wins. Rows whose
    'Medium' is not a string, or that match no keyword, get ``pd.NA``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with a 'Medium' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.

    Raises
    ------
    KeyError
        If ``df`` has no 'Medium' column.
    """
    medium_tags = [['paper', 'watercolor', 'card', 'watercolour', 'board', 'chalk', 'mixed', 'print', 'parchment', 'graphic',
                    'engraving', 'etching', 'dessin'], #graphics
                   ['oil', 'canvas', 'paint', 'fresco', 'painting', 'peinture'], #painting
                   ['bronze', 'granite', 'marble', 'alabaster', 'clay', 'iron', 'cement', 'concrete', 'plaster', 'sculpture'], #sculpture
                   ['film', 'video'], #video art
                   ['fabric', 'glass', 'wood', 'aluminium', 'steel', 'plastic', 'book', 'copper', 'stone',
                    'slate', 'wool', 'willow', 'cedar', 'wax', 'walnut', 'tin', 'plate', 'textile', 'terracota',
                    'mahogany', 'pine', 'oak', 'iron', 'suede', 'brass', 'sheet', 'cast', 'acacia', 'metal', 'beech', 'celluloid',
                    'cellulose', 'ceramic', 'rubber', 'elm', 'earthenware', 'epoxy', 'felt', 'firebricks', 'flint', 'banknote',
                    # 'terracotta' (double t) is needed as well: 'terracota' is not a substring of it.
                    'ivory', 'lead', 'leather', 'terracotta', 'perspex', 'resin', 'quartz', 'porcelain', 'object', 'medal',  'silk', 'cotton', 'cashmere'], #object
                   ['boat', 'structure', 'burlap', 'bean', 'hook', 'knive', 'hat', 'light', 'wall', 'machine',
                    'vinyl', 'cleaner', 'door', 'mirror', 'form', 'metronome', 'sock', 'carpet', 'latex', 'installation'], #installation
                   ['photo'], #photography
                   ['software', 'audio', 'slide', 'digital', 'net', 'sound'], #new media
                   ['architecture'], #architecture
                   ]
    medium_name = ['graphics', 'painting', 'sculpture', 'video art', 'object', 'installation', 'photography', 'new media', 'architecture']

    # Map value by value instead of iterrows() + df.at: this avoids building a
    # Series per row, and stays correct when index labels are duplicated
    # (label-based writes would hit every row sharing the label).
    df['Medium_classified'] = df['Medium'].map(
        lambda medium: _match_medium_group(medium, medium_tags, medium_name)
    )

    return df

In [10]:
# Cleaning steps, applied to every raw dataset in this order. Each step adds
# its column(s) to the frame it is given, so the frames collected in
# transformed_data are the very same objects that raw_data holds.
cleaning_steps = (
    create_artist_name,
    create_artwork_title,
    create_artwork_medium,
    clean_acquisition_year,
    classify_medium,
)

transformed_data = []
for df in raw_data:
    for step in cleaning_steps:
        step(df)
    transformed_data.append(df)


In [11]:
# Rows of the first dataset (index 0) that received no medium category,
# reduced to the original and the classified medium columns.
test = transformed_data[0].loc[
    transformed_data[0]['Medium_classified'].isna(),
    ['Medium', 'Medium_classified'],
]


save clean datasets in separate csv files

In [15]:
# Keep only the standardised columns of each transformed dataset.
output_columns = ['Artist', 'Title', 'Medium', 'Year_acquisition', 'Medium_classified']

clean_data = []
for df in transformed_data:
    df = df.loc[:, output_columns]
    clean_data.append(df)

In [16]:
# Output names, listed in the same order as the datasets in clean_data.
dataset_names = ['met', 'reina_sofia', 'tate']

# Write each cleaned dataset to "<name>_clean.csv" in the working directory,
# without the index column, and report every file as it is written.
for name, dataset in zip(dataset_names, clean_data):
    file_name = f"{name}_clean.csv"
    dataset.to_csv(file_name, index=False)
    print(f"Saved {file_name}")

Saved met_clean.csv
Saved reina_sofia_clean.csv
Saved tate_clean.csv
