# Notebook 2 — Data Cleaning

## 2.1 Attribute Selection
- Drop columns with >50% missing values
- Keep attributes relevant to ML and the Department target

## 2.2 Data Transformation / Standardization
- Trim spaces

## 2.3 Error Detection and Correction
- Solving consistency issues found in the DQ assessment

## 2.4 Deduplication
- Remove rows that are identical once the unique identifier columns (Object ID, Object Number, Link Resource) are ignored

## 2.5 Missing Values Handling
- Strategy: Title → "Untitled"; Classification → ML imputation with a random forest model; other string attributes → "Unknown"; all numeric attributes are already complete

## 2.6 Outlier detection
- Upper-tail IQR outlier detection on the date columns; it flags no values, so it does not lead to any further cleaning

## 2.7 Post-Cleaning DQ Re-Assessment
- Show improved completeness, duplicate reduction, and distributional changes

Notes: Implement the minimal, well-justified cleaning steps only; preserve interpretability for slides.

Importing Dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn import ensemble

In [130]:
# Load the raw Met Museum objects export; the path is relative to the current working directory.
# This frame is kept untouched; the cleaning steps below work on a derived copy.
MET_MUSEUM_OBJECTS = pd.read_csv("./met_museum_objects.csv")

2.1 Attribute Selection

In [131]:
# Flag every column in which more than half of the rows are null.
# The null count per column is computed in one vectorised pass and compared
# against the same "50% of the row count" threshold.
null_counts = MET_MUSEUM_OBJECTS.isnull().sum()
half_of_rows = MET_MUSEUM_OBJECTS.shape[0]*.5
to_delete = null_counts[null_counts > half_of_rows].index.tolist()

In [132]:
# Columns removed by hand, on top of the sparsity rule, because they do not help our purposes
manual_deletions = [
    "Metadata Date",  # Fully constant column
    "Repository",     # Fully constant column
    "Object Date",    # Redundant: already covered by Object Begin Date / Object End Date (100% complete)
    "Is Highlight",   # Extremely unbalanced column
]
to_delete = [*to_delete, *manual_deletions]

In [133]:
# Build the working dataset: the raw frame minus every flagged column.
# The raw frame itself is left unchanged.
MODIFIED_DATASET = MET_MUSEUM_OBJECTS.drop(labels=to_delete, axis="columns")

2.2 Data Transformation / Standardization

In [None]:
# Removing possible initial and final whitespace from text objects
text_cols = MODIFIED_DATASET.select_dtypes(include='object').columns

# Strip each text column once, report how many values actually changed, then store the result.
for col in text_cols:
    original = MODIFIED_DATASET[col]
    stripped = original.str.strip()

    # `.str.strip()` yields NaN for every element that is not a string (e.g. a number
    # inside a mixed-type object column). Fall back to the original value there so that
    # trimming never turns an existing value into a missing one.
    cleaned = stripped.where(stripped.notna(), original)

    # Count only non-null values whose content really changed.
    count = (original.notna() & (original != cleaned)).sum()

    if count > 0:
        print(f"Column '{col}': {count} values cleaned")

    MODIFIED_DATASET[col] = cleaned

Column 'Object Name': 1192 values cleaned
Column 'Title': 413 values cleaned
Column 'Artist Display Name': 140 values cleaned
Column 'Artist Alpha Sort': 2198 values cleaned
Column 'Medium': 3295 values cleaned
Column 'Dimensions': 7588 values cleaned
Column 'Credit Line': 2944 values cleaned


2.3 Error Detection and Correction

In [None]:
# Rows with inconsistent dates are dropped instead of repaired: there are only a few of them,
# so removing them shouldn't affect the accuracy of the model.

# Initial length
rows_before = len(MODIFIED_DATASET)
print(f"Rows before cleanup: {rows_before}")

begin_date = MODIFIED_DATASET['Object Begin Date']
end_date = MODIFIED_DATASET['Object End Date']

# A row is kept only when the begin date does not exceed the end date and neither date
# lies beyond 2026. The upper bound may delete some artists and objects that are still
# active or that haven't been completed yet.
condition_good = (
    begin_date.le(end_date)
    & begin_date.le(2026)
    & end_date.le(2026)
)

# Apply the filter
MODIFIED_DATASET = MODIFIED_DATASET.loc[condition_good]

# Final length
print(f"Cleanup complete. Rows remaining: {len(MODIFIED_DATASET)}")

# Deleted around 230 rows in a 448000 row dataset (minor difference)

Rows before cleanup: 448203
Cleanup complete. Rows remaining: 447971


2.4 Deduplication

In [None]:
# Whole-row duplicates cannot exist because some columns are unique per record
rows_before_dedup = len(MODIFIED_DATASET)
print(f"Original rows: {rows_before_dedup}")

# Content-based duplicate detection:
# ignore the unique identifier columns and check whether the remaining content repeats
unique_cols = ['Object ID', 'Object Number', 'Link Resource']

# Every column EXCEPT the unique ones takes part in the comparison
is_identifier = MODIFIED_DATASET.columns.isin(unique_cols)
cols_to_check = MODIFIED_DATASET.columns[~is_identifier].tolist()

# Mark each repeat of an already-seen content combination and keep only the first occurrence
is_repeat = MODIFIED_DATASET.duplicated(subset=cols_to_check, keep='first')
MODIFIED_DATASET = MODIFIED_DATASET[~is_repeat]

print(f"Rows after removing content duplicates: {len(MODIFIED_DATASET)}")


Original rows: 447971
Rows after removing content duplicates: 406324


2.5 Missing values handling

In [None]:
# Random-forest imputation for Classification
#
# For each column listed in `missing_columns`, a RandomForestClassifier is trained on the rows
# where the column is known and used to predict it on the rows where it is missing.
# The imputed result is collected in IMP_DATA["IMP<column>"]; MODIFIED_DATASET[<column>] itself
# is not overwritten in this cell.

RANDOM_STATE = 42  # Fixed seed: without it every run would impute different labels

missing_columns = ["Classification"] # Column to impute

# Columns to exclude from the model
exclude_cols = [
    'Object ID', 'Object Number', 'Title', 'Link Resource',
    'Artist Display Name', 'Artist Alpha Sort', 'Credit Line',
    'Dimensions', 'Artist Role'
]

IMP_DATA = pd.DataFrame(columns=["IMP" + name for name in missing_columns])

# Temporary mode-filled copies of the columns to impute.
# They are excluded from the predictors below (see the `_imp` suffix filter), so they do not
# influence the model; they are still created because a later cell drops 'Classification_imp'.
for feature in missing_columns:
    MODIFIED_DATASET[feature + '_imp'] = MODIFIED_DATASET[feature]
    mode_val = MODIFIED_DATASET[feature].mode()[0]
    MODIFIED_DATASET.loc[MODIFIED_DATASET[feature].isnull(), feature + '_imp'] = mode_val

# Define predictors: All cols - Missing Cols - Excluded Cols - temporary `_imp` cols.
# The predictor set does not depend on the column being imputed, so it is built once.
parameters = [c for c in MODIFIED_DATASET.columns
              if c not in missing_columns
              and c not in exclude_cols
              and not c.endswith('_imp')]

# Encode Variables
# Label encoding is used instead of one hot so that it executes in a reasonable amount of space
X = pd.DataFrame(index=MODIFIED_DATASET.index)
for param in parameters:
    if MODIFIED_DATASET[param].dtype == 'object' or MODIFIED_DATASET[param].dtype == 'bool':
        # Convert text/bool to integer codes (0, 1, 2...); pd.factorize encodes missing values as -1
        X[param], _ = pd.factorize(MODIFIED_DATASET[param])
    else:
        # Keep numbers as they are
        X[param] = MODIFIED_DATASET[param].fillna(0) # Fill numeric NaNs with 0 for safety, but shouldn't fill anything as the numeric columns are complete

# Random Forest Classification
for feature in missing_columns:
    print(f"Imputing {feature}...")

    # Start from the observed values; the missing positions are overwritten with predictions below
    IMP_DATA["IMP" + feature] = MODIFIED_DATASET[feature]

    # Seeded so that the imputed values are reproducible across runs
    model = ensemble.RandomForestClassifier(random_state=RANDOM_STATE)

    # Define Train (Known values) and Predict (Missing values)
    train_idx = MODIFIED_DATASET[feature].notnull()
    predict_idx = MODIFIED_DATASET[feature].isnull()

    if predict_idx.sum() > 0:
        # Fit model on known data
        y_train = MODIFIED_DATASET.loc[train_idx, feature] # Target
        X_train = X.loc[train_idx]

        model.fit(X_train, y_train)

        # Predict missing data
        X_predict = X.loc[predict_idx]
        model_predicted = model.predict(X_predict)

        print(f"Successfully imputed {len(model_predicted)} values for {feature}")
        IMP_DATA.loc[predict_idx, "IMP" + feature] = model_predicted
    else:
        print(f"No missing values found for {feature}")

Imputing Classification...
Successfully imputed 48153 values for Classification


In [138]:
# Write the imputed Classification values back into the dataset and discard the temporary column
MODIFIED_DATASET = (
    MODIFIED_DATASET
    .assign(Classification=IMP_DATA['IMPClassification'])
    .drop(columns='Classification_imp')
)

# Column specific simple imputation: a missing Title gets its own placeholder
MODIFIED_DATASET['Title'] = MODIFIED_DATASET['Title'].fillna('Untitled')

# Every gap still left in an object-typed column becomes 'Unknown'
text_cols = MODIFIED_DATASET.select_dtypes(include=['object']).columns
MODIFIED_DATASET[text_cols] = MODIFIED_DATASET[text_cols].fillna('Unknown')

2.6 Outlier detection

In [None]:
def IQR(data):
    """Print and return the upper-tail IQR outliers of ``data``.

    A value is flagged when it is greater than ``Q3 + 1.5 * (Q3 - Q1)``, where Q1 and Q3
    are the 25th and 75th percentiles of ``data``. Only the upper fence is applied: the
    lower fence flags many values that are normal for this unbalanced distribution, so
    low values are deliberately not reported.

    Parameters
    ----------
    data : array-like supporting boolean-mask indexing (e.g. pandas Series, numpy array)
        Numeric values to inspect.

    Returns
    -------
    Same type as ``data``
        The elements of ``data`` that lie above the upper fence (possibly empty).
    """
    q1, q3 = np.percentile(data, [25, 75])
    interquartile_range = q3 - q1
    upper_range = q3 + (1.5 * interquartile_range)
    outliers = data[(data > upper_range)]
    print("The detected outliers are: ", str(outliers))
    return outliers

# Run the IQR outlier check on both date columns; the helper prints whatever it flags.
num_cols = ['Object Begin Date', 'Object End Date']
for col in num_cols:
    IQR(MODIFIED_DATASET[col])

# The IQR rule is applied to the upper tail only: museum data may be very old,
# so low dates are not treated as outliers.
# The check is informational only: nothing is removed from the dataset based on it,
# and in the recorded run it flags no values.

The detected outliers are:  Series([], Name: Object Begin Date, dtype: int64)
The detected outliers are:  Series([], Name: Object End Date, dtype: int64)


Creation of cleaned dataset

In [141]:
# Persist the cleaned dataset to the current working directory;
# index=False keeps the DataFrame index out of the CSV.
MODIFIED_DATASET.to_csv('cleaned_met_museum_objects.csv', index=False)