In [10]:
import pandas as pd

In [11]:
# Load the CSV file
df = pd.read_csv('../data/images/artemis_dataset_release_v0.csv')

# Show basic info
print("Dataset loaded successfully!")
print(f"Total rows (annotations): {len(df):,}")
print(f"Columns: {list(df.columns)}")


Dataset loaded successfully!
Total rows (annotations): 454,684
Columns: ['art_style', 'painting', 'emotion', 'utterance', 'repetition']


In [12]:
df.head()


Unnamed: 0,art_style,painting,emotion,utterance,repetition
0,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,something else,"She seems very happy in the picture, and you w...",10
1,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,sadness,This woman has really knotty hands which makes...,10
2,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,something else,"When looking at this woman, I am filled with c...",10
3,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,contentment,"A woman looking at ease, peaceful, and satisfi...",10
4,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,awe,She looks like a lady from that past that migh...,10


## Step 4: Basic Dataset Statistics


In [13]:
# Calculate basic statistics
total_annotations = len(df)
unique_paintings = df['painting'].nunique()
unique_styles = df['art_style'].nunique()
unique_emotions = df['emotion'].nunique()

print("Data Statistics")
print(f"Total annotations:     {total_annotations:,}")
print(f"Unique paintings:      {unique_paintings:,}")
print(f"Unique art styles:     {unique_styles}")
print(f"Unique emotions:       {unique_emotions}")
print(f"Avg captions/painting: {total_annotations/unique_paintings:.2f}")


Data Statistics
Total annotations:     454,684
Unique paintings:      80,031
Unique art styles:     27
Unique emotions:       9
Avg captions/painting: 5.68


# What we can see

- **454k+ annotations** but only **80k unique paintings**
- Each painting has **~5-6 different captions** from different people


# Emotion Distribution

In [14]:
# Count how many times each emotion appears
emotion_counts = df['emotion'].value_counts()

print("Emotion Distribution:")
for emotion, count in emotion_counts.items():
    percentage = (count / total_annotations) * 100
    # Create a simple bar chart with characters
    bar = "█" * int(percentage)
    print(f"{emotion:20s}: {count:7,} ({percentage:5.2f}%) {bar}")


Emotion Distribution:
contentment         : 126,134 (27.74%) ███████████████████████████
awe                 :  72,927 (16.04%) ████████████████
something else      :  52,962 (11.65%) ███████████
sadness             :  49,061 (10.79%) ██████████
amusement           :  45,336 ( 9.97%) █████████
fear                :  41,577 ( 9.14%) █████████
excitement          :  37,636 ( 8.28%) ████████
disgust             :  22,411 ( 4.93%) ████
anger               :   6,640 ( 1.46%) █


# Caption Length Analysis


In [15]:
# Calculate length of each caption (number of words)
df['caption_length'] = df['utterance'].str.split().str.len()

# Get statistics
min_length = df['caption_length'].min()
max_length = df['caption_length'].max()
avg_length = df['caption_length'].mean()
median_length = df['caption_length'].median()

print("Caption Length Stats")
print(f"Shortest caption:  {min_length} words")
print(f"Longest caption:   {max_length} words")
print(f"Average length:    {avg_length:.2f} words")
print(f"Median length:     {median_length:.0f} words")

# Show distribution
print("\nLength Distribution (word count ranges):")
length_ranges = pd.cut(df['caption_length'], bins=[0, 10, 20, 30, 40, 50, 100, 200], 
                        labels=['1-10', '11-20', '21-30', '31-40', '41-50', '51-100', '100+'])
print(length_ranges.value_counts().sort_index())


Caption Length Stats
Shortest caption:  1 words
Longest caption:   202 words
Average length:    15.69 words
Median length:     14 words

Length Distribution (word count ranges):
caption_length
1-10      103760
11-20     268259
21-30      61960
31-40      13989
41-50       3962
51-100      2644
100+         109
Name: count, dtype: int64


# Different Perspectives

Let's look at how different people describe the same painting.


In [16]:
# Pick the first painting in the dataset
first_painting = df['painting'].iloc[0]
art_style = df['art_style'].iloc[0]

print(f"Painting: {first_painting}")
print(f"Style: {art_style}")
print()

# Get all captions for this painting
same_painting = df[df['painting'] == first_painting]

print(f"This painting has {len(same_painting)} different descriptions:\n")

# Show each person's description
for i, (idx, row) in enumerate(same_painting.iterrows(), 1):
    print(f"Person {i} felt '{row['emotion']}':")
    print(f'  "{row["utterance"]}"')
    print()


Painting: vincent-van-gogh_portrait-of-madame-ginoux-l-arlesienne-1890
Style: Post_Impressionism

This painting has 10 different descriptions:

Person 1 felt 'something else':
  "She seems very happy in the picture, and you want to know what what is behind the smile."

Person 2 felt 'sadness':
  "This woman has really knotty hands which makes her look like she has arthritis."

Person 3 felt 'something else':
  "When looking at this woman, I am filled with curiosity about what she is thinking about with her elbow on the table and a very emotionless face."

Person 4 felt 'contentment':
  "A woman looking at ease, peaceful, and satisfied amongst her books makes me feel content."

Person 5 felt 'awe':
  "She looks like a lady from that past that might have been a teacher (books).  She looks tired and I wondered how hard it must have been for them back then."

Person 6 felt 'disgust':
  "The details of the woman's face is off-putting and mildly disturbing."

Person 7 felt 'contentment':
  "

# Art Style Distribution

Which art movements are most represented?


In [17]:
# Count art styles
style_counts = df['art_style'].value_counts()

print("Top 15 Art Styles:")
print("-"*80)
for style, count in style_counts.head(15).items():
    percentage = (count / total_annotations) * 100
    bar = "█" * int(percentage / 2)
    print(f"{style:40s}: {count:6,} ({percentage:5.2f}%) {bar}")


Top 15 Art Styles:
--------------------------------------------------------------------------------
Impressionism                           : 72,361 (15.91%) ███████
Realism                                 : 59,681 (13.13%) ██████
Romanticism                             : 39,069 ( 8.59%) ████
Expressionism                           : 38,717 ( 8.52%) ████
Post_Impressionism                      : 36,374 ( 8.00%) ███
Art_Nouveau_Modern                      : 24,711 ( 5.43%) ██
Symbolism                               : 24,103 ( 5.30%) ██
Baroque                                 : 23,469 ( 5.16%) ██
Abstract_Expressionism                  : 16,075 ( 3.54%) █
Northern_Renaissance                    : 14,160 ( 3.11%) █
Naive_Art_Primitivism                   : 14,086 ( 3.10%) █
Rococo                                  : 11,904 ( 2.62%) █
Cubism                                  : 11,462 ( 2.52%) █
Color_Field_Painting                    :  9,836 ( 2.16%) █
Pop_Art                               

In [18]:
# Sample 10 random captions
sample = df.sample(10, random_state=42)

print("Random Captions:")

for i, (idx, row) in enumerate(sample.iterrows(), 1):
    print(f"\n{i}. [{row['emotion']}] ({row['art_style']})")
    print(f"   Painting: {row['painting'][:50]}...")  # Truncate long names
    print(f'   Caption: "{row["utterance"]}"')
    print(f"   Length: {row['caption_length']} words")


Random Captions:

1. [excitement] (Symbolism)
   Painting: william-blake_night-startled-by-the-lark-1820...
   Caption: "the angel will fly around in the starry sky"
   Length: 9 words

2. [something else] (Minimalism)
   Painting: robert-mangold_untitled-from-skowhegan-suite-1992...
   Caption: "This image makes me feel interested because the orange board does not seem to go with the black string."
   Length: 19 words

3. [sadness] (Impressionism)
   Painting: konstantin-korovin_in-a-room-1886...
   Caption: "The man in the bed looks as if he could be potentially ill with the way his face seems bleak and the way he is leaning."
   Length: 26 words

4. [contentment] (Realism)
   Painting: vasily-surikov_whacky-seated-on-the-ground-study-t...
   Caption: "The person sitting has his hand up, looks like a monk posture and is reflective."
   Length: 15 words

5. [fear] (Realism)
   Painting: viktor-vasnetsov_edge-of-the-spruce-forest-1881...
   Caption: "The trees look so close together an