In [174]:
import pandas as pd
import re

Collect all raw files into one list.

In [175]:
# Root folder that holds one sub-folder per museum. Edit this single value to
# run the notebook against a different copy of the data.
DATA_DIR = '/Users/antoninalightfoot/Library/Mobile Documents/com~apple~CloudDocs/Documents/TLU/Data analysis/data'

# Order matters: later cells address the loaded frames by position
# (e.g. raw_data[1]) and pair them with a positional list of dataset names.
data_paths = [f'{DATA_DIR}/Centre Pompidou/merged_file.json',
              f'{DATA_DIR}/Met/MetObjects.csv',
              f'{DATA_DIR}/Moderna/moderna_full.csv',
              f'{DATA_DIR}/Reina Sofia/raw_data/reina_sofia3.csv',
              f'{DATA_DIR}/Tate/artwork_data.csv']

# Load every file into a DataFrame, choosing the reader from the extension.
raw_data = []
for path in data_paths:
    lowered = path.lower()
    if lowered.endswith('.json'):
        data = pd.read_json(path)
    elif lowered.endswith('.csv'):
        # Malformed rows are dropped rather than aborting the whole load.
        data = pd.read_csv(path, on_bad_lines='skip', low_memory=False)
    else:
        # Fail fast: falling through would append an undefined or stale
        # `data`, and skipping would shift the positions of later frames.
        raise ValueError("cannot load " + path)
    raw_data.append(data)

Create functions for cleaning the fields:
artist name, artwork title, artwork medium, acquisition source, acquisition year, creation year, artist nationality, artist gender.

In [176]:
def create_artist_name(df):
    """Add a unified ``Artist`` column to ``df`` and return the same frame.

    The candidate column names below are tried in order and the first one
    present in ``df`` is copied into ``Artist``. If none is present and ``df``
    has no ``Artist`` column yet, the column is created and filled with
    ``pd.NA`` so that selecting ``Artist`` afterwards cannot raise ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to harmonise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, guaranteed to have an ``Artist`` column.
    """
    # Candidate source columns, in priority order.
    possible_columns = ['artist', 'author', 'artist_name', 'Artist Display Name', 'artist name']

    # First candidate that actually exists in this frame, or None.
    source = next((col for col in possible_columns if col in df.columns), None)

    if source is not None:
        df['Artist'] = df[source]
    elif 'Artist' not in df.columns:
        # No usable source column: keep the schema uniform with a missing-value
        # placeholder, but never overwrite an existing 'Artist' column.
        df['Artist'] = pd.NA

    return df

In [177]:
def create_artwork_title(df):
    """Add a unified ``Title`` column to ``df`` and return the same frame.

    The candidate column names below are tried in order and the first one
    present in ``df`` is copied into ``Title``. If none is present, ``Title``
    is created and filled with ``pd.NA`` so that selecting it afterwards
    cannot raise ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to harmonise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, guaranteed to have a ``Title`` column.
    """
    # Candidate source columns for the artwork title, in priority order.
    # NOTE(review): 'url' is tried before 'name' and 'title', so a frame that
    # has both 'url' and 'title' gets its URL as the title - confirm that this
    # order is intended for every dataset.
    possible_columns = ['object_title', 'artwork_name', 'Title', 'url', 'name', 'title']

    # First candidate that actually exists in this frame, or None.
    source = next((col for col in possible_columns if col in df.columns), None)

    if source is not None:
        df['Title'] = df[source]
    elif 'Title' not in df.columns:
        # No usable source column: keep the schema uniform with a placeholder.
        df['Title'] = pd.NA

    return df

In [178]:
# List the column names of the second loaded dataset (index 1 in raw_data;
# the Met objects CSV, given the order of data_paths).
raw_data[1].columns

Index(['Object Number', 'Is Highlight', 'Is Timeline Work', 'Is Public Domain',
       'Object ID', 'Gallery Number', 'Department', 'AccessionYear',
       'Object Name', 'Title', 'Culture', 'Period', 'Dynasty', 'Reign',
       'Portfolio', 'Constituent ID', 'Artist Role', 'Artist Prefix',
       'Artist Display Name', 'Artist Display Bio', 'Artist Suffix',
       'Artist Alpha Sort', 'Artist Nationality', 'Artist Begin Date',
       'Artist End Date', 'Artist Gender', 'Artist ULAN URL',
       'Artist Wikidata URL', 'Object Date', 'Object Begin Date',
       'Object End Date', 'Medium', 'Dimensions', 'Credit Line',
       'Geography Type', 'City', 'State', 'County', 'Country', 'Region',
       'Subregion', 'Locale', 'Locus', 'Excavation', 'River', 'Classification',
       'Rights and Reproduction', 'Link Resource', 'Object Wikidata URL',
       'Metadata Date', 'Repository', 'Tags', 'Tags AAT URL',
       'Tags Wikidata URL'],
      dtype='object')

In [185]:
# Preview the 'Medium' column of the dataset at index 1 in raw_data.
raw_data[1]['Medium']

0                                 Gold
1                                 Gold
2                                 Gold
3                                 Gold
4                                 Gold
                      ...             
476313              Dye transfer print
476314              Dye transfer print
476315    Drypoint and color engraving
476316                             NaN
476317                             NaN
Name: Medium, Length: 476318, dtype: object

In [180]:
def create_artwork_medium(df):
    """Add a unified ``Medium`` column to ``df`` and return the same frame.

    The candidate column names below are tried in order and the first one
    present in ``df`` is copied into ``Medium``. If none is present,
    ``Medium`` is created and filled with ``pd.NA`` so that selecting it
    afterwards cannot raise ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to harmonise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, guaranteed to have a ``Medium`` column.
    """
    # Candidate source columns for the artwork medium, in priority order.
    possible_columns = ['object_type', 'medium', 'type_artwork', 'artwork_medium', 'Medium']

    # First candidate that actually exists in this frame, or None.
    source = next((col for col in possible_columns if col in df.columns), None)

    if source is not None:
        df['Medium'] = df[source]
    elif 'Medium' not in df.columns:
        # No usable source column: keep the schema uniform with a placeholder.
        df['Medium'] = pd.NA

    return df

In [181]:
# A standalone four-digit year from 1000 to 2099, i.e. one that is not part of
# a longer run of digits.
_YEAR_PATTERN = re.compile(r"(?<!\d)(?:1\d{3}|20\d{2})(?!\d)")


def extract_numbers(value):
    """Return the digits contained in ``value`` as a string.

    * ``str``         -> every non-digit character is removed.
    * ``int``/``float`` -> the integer part as text; NaN and +/-infinity give "".
    * anything else (``None``, ``pd.NA``, lists, dicts, ...) -> "".
    """
    if isinstance(value, str):
        return re.sub("[^0-9]", "", value)
    if isinstance(value, (int, float)):
        # Types are checked before pd.isna so that list-like cells never reach
        # it: pd.isna returns an array for those, which is ambiguous in `if`.
        if pd.isna(value):
            return ""
        try:
            return str(int(value))
        except OverflowError:
            # +/-infinity has no integer representation.
            return ""
    return ""


def _extract_year(value):
    """Return the acquisition year found in ``value`` as a string.

    For strings, the last standalone four-digit year is returned, so that
    "2005-02-15" gives "2005" and "Gift of X, 1902" gives "1902" instead of
    the concatenation of every digit in the text. Strings without such a year,
    and all non-string values, are handled by ``extract_numbers``.
    """
    if isinstance(value, str):
        years = _YEAR_PATTERN.findall(value)
        if years:
            # Assumes that when several years appear, the acquisition year is
            # the trailing one - TODO confirm against each dataset.
            return years[-1]
    return extract_numbers(value)


def clean_acquisition_year(df):
    """Add a ``Year_acquisition`` column to ``df`` and return the same frame.

    The candidate column names below are tried in order; the first one present
    in ``df`` is converted value by value with ``_extract_year``. If none is
    present, ``Year_acquisition`` is filled with ``pd.NA``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to harmonise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with a ``Year_acquisition`` column holding
        strings ("" where no number could be extracted) or ``pd.NA``.
    """
    # Candidate source columns for the acquisition year, in priority order.
    possible_columns = ['acquisition_date', 'credit_line', 'AccessionYear', 'artwork_acquisition', 'year_adquisition', 'acquisitionYear']

    # Default used when no candidate column exists in this frame.
    df['Year_acquisition'] = pd.NA

    source = next((col for col in possible_columns if col in df.columns), None)
    if source is not None:
        df['Year_acquisition'] = df[source].apply(_extract_year)

    return df

Apply the functions to the raw datasets and create a new list with the clean datasets.

In [182]:
# Run every cleaning step on each raw frame, in a fixed order.
# Each helper adds its column to the frame it receives and returns that same
# frame, so the objects collected here are the very ones held in raw_data
# (the raw frames gain the new columns as well).
transformed_data = []
for df in raw_data:
    transformed_data.append(
        df.pipe(create_artist_name)
          .pipe(create_artwork_title)
          .pipe(create_artwork_medium)
          .pipe(clean_acquisition_year)
    )


In [191]:
# Spot-check the 'Medium' column of the fifth transformed dataset (index 4;
# the Tate CSV, given the order of data_paths).
transformed_data[4]['Medium']

0        Watercolour, ink, chalk and graphite on paper....
1                                        Graphite on paper
2              Graphite on paper. Verso: graphite on paper
3                                        Graphite on paper
4                                  Line engraving on paper
                               ...                        
69196     Perspex, Wood, hairpiece, tampon and human blood
69197    Wood, Perspex, plastic, photograph on paper, t...
69198                                 Soap and glass beads
69199                                     Gallery lighting
69200                                  Oil paint on canvas
Name: Medium, Length: 69201, dtype: object

Save the clean datasets in separate CSV files.

In [192]:
# Columns shared by every cleaned dataset, in output order.
output_columns = ['Artist', 'Title', 'Medium', 'Year_acquisition']

# Keep only the harmonised columns of each transformed frame.
clean_data = []
for df in transformed_data:
    df = df.loc[:, output_columns]
    clean_data.append(df)

In [193]:
# Output names, one per cleaned dataset, matched to clean_data by position.
dataset_names = ['pompidou', 'met', 'moderna', 'reina_sofia', 'tate']

# zip() stops at the shorter input, so a length mismatch would silently drop
# datasets or save them under the wrong museum name. Refuse to write instead.
if len(clean_data) != len(dataset_names):
    raise ValueError(
        f"expected {len(dataset_names)} datasets to save, got {len(clean_data)}"
    )

# Write each dataset to "<name>_clean.csv" in the current working directory.
for dataset, name in zip(clean_data, dataset_names):
    file_name = f"{name}_clean.csv"

    # index=False: the row index is not written to the file.
    dataset.to_csv(file_name, index=False)
    print(f"Saved {file_name}")

Saved pompidou_clean.csv
Saved met_clean.csv
Saved moderna_clean.csv
Saved reina_sofia_clean.csv
Saved tate_clean.csv
