In [3]:
# Data Loading & Preprocessing
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Read the raw album table; the path is relative to the current working
# directory.
print("Loading Polaris Prize data...")
df = pd.read_csv('../polaris_training_dataset.csv')
# Row count before any cleaning, so the amount of removed data can be
# reported once filtering is done.
original_len = len(df)
print(f"Loaded {len(df)} albums")

# Class balance of the raw data (value_counts lists the most frequent first).
print("\nTheme distribution:", df['theme_label'].value_counts(), sep="\n")

# Create better text features
def create_text_features(row):
    """Build one pipe-delimited text string from an album row.

    Each usable field is rendered as ``"<Label>: <value>"`` in the fixed
    order album, artist, genre, description, critical context, and the
    pieces are joined with ``" | "``.  A field is skipped when ``pd.notna``
    is false for it.  The two free-text fields (``description`` and
    ``critical_context``) are additionally skipped when their string form
    is the literal ``"nan"``.

    Args:
        row: Object indexable by the keys 'album', 'artist', 'genre',
            'description' and 'critical_context' (e.g. a DataFrame row).

    Returns:
        The joined string; empty when no field is usable.
    """
    # (column, label used in the output, also reject the literal "nan" text?)
    field_specs = (
        ('album', 'Album', False),
        ('artist', 'Artist', False),
        ('genre', 'Genre', False),
        ('description', 'Description', True),
        ('critical_context', 'Context', True),
    )

    segments = []
    for column, label, reject_nan_text in field_specs:
        value = row[column]
        if not pd.notna(value):
            continue
        if reject_nan_text and str(value) == 'nan':
            continue
        segments.append(f"{label}: {value}")
    return " | ".join(segments)

# Build the combined text column, one row at a time.
df['text'] = df.apply(create_text_features, axis=1)

# Rows without a theme cannot serve as labelled examples.
# .copy() makes the filtered frame an independent object, so columns assigned
# to it afterwards do not go to a derived selection (which pandas may flag
# with SettingWithCopyWarning).
df = df.dropna(subset=['theme_label']).copy()

# Remove classes with too few samples
MIN_SAMPLES_PER_CLASS = 10  # classes with fewer rows than this are dropped
theme_counts = df['theme_label'].value_counts()
small_classes = theme_counts[theme_counts < MIN_SAMPLES_PER_CLASS].index
if len(small_classes) > 0:
    print(f"Removing small classes: {list(small_classes)}")
    # Same reasoning as above: keep an independent frame after masking.
    df = df[~df['theme_label'].isin(small_classes)].copy()

print(f"Final dataset: {len(df)} albums")

# Encode labels
# The integer id of a class is its position in encoder.classes_.
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['theme_label'])

print("Final classes:")
# Count all labels in one vectorised pass instead of scanning the whole
# column once per class with the Python-level sum().
label_counts = df['label'].value_counts()
for i, theme in enumerate(encoder.classes_):
    count = label_counts.get(i, 0)
    print(f"{i}: {theme} ({count} samples)")

# EDA
# Collect the three summaries, then emit them newline-separated; this yields
# the same text as printing each one on its own.
eda_report = (
    f"Dataset shape: {df.shape}",
    f"Missing values:\n{df.isnull().sum()}",
    f"Unique values per column:\n{df.nunique()}",
)
print(*eda_report, sep="\n")

# Document cleaning decisions
rows_removed = original_len - len(df)
print(f"Removed {rows_removed} rows due to missing themes and small classes")
avg_text_len = df['text'].str.len().mean()
print(f"Text quality check: avg length = {avg_text_len:.0f} chars")

# Save processed data
df.to_csv('processed_data.csv', index=False)
# encoder.classes_ is typically an object-dtype array when the encoder was
# fitted on a pandas string column.  np.save pickles object arrays, and
# np.load refuses pickled data unless allow_pickle=True is passed.  Casting
# to a fixed-width unicode array keeps the file readable with a plain
# np.load() (loading with allow_pickle=True still works).
np.save('label_classes.npy', np.asarray(encoder.classes_).astype(str))
print("Saved processed_data.csv and label_classes.npy")

Loading Polaris Prize data...
Loaded 200 albums

Theme distribution:
theme_label
Love & Relationships          41
Introspection & Philosophy    41
Experimental & Abstract       35
Social Commentary             32
Identity & Heritage           30
Place & Landscape             21
Name: count, dtype: int64
Final dataset: 200 albums
Final classes:
0: Experimental & Abstract (35 samples)
1: Identity & Heritage (30 samples)
2: Introspection & Philosophy (41 samples)
3: Love & Relationships (41 samples)
4: Place & Landscape (21 samples)
5: Social Commentary (32 samples)
Dataset shape: (200, 11)
Missing values:
artist              0
album               0
year                0
placement           0
genre               0
region              0
theme_label         0
description         0
critical_context    0
text                0
label               0
dtype: int64
Unique values per column:
artist              169
album               200
year                  5
placement             2
genre       