In [4]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
import re
import pickle

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [5]:
# File paths
ARTWORK_CSV = "/kaggle/input/historic-art/complete/artwork_dataset.csv"
INFO_CSV = "/kaggle/input/historic-art/complete/info_dataset.csv"
IMAGE_DIR = "/kaggle/input/historic-art/complete/artwork"

# Load CSV files
artwork_df = pd.read_csv(ARTWORK_CSV)
info_df = pd.read_csv(INFO_CSV)

print("Artwork data sample:")
print(artwork_df.head())

print("\nArtist info sample:")
print(info_df.head())

# Merge on 'artist'
df = artwork_df.merge(info_df, on="artist", how="left")
print(f"Total records after merge: {len(df)}")


Artwork data sample:
   ID            artist                               title  \
0   0  AACHEN, Hans von                    venus and adonis   
1   1  AACHEN, Hans von                     procuring scene   
2   2  AACHEN, Hans von  self-portrait with a glass of wine   
3   3  AACHEN, Hans von    two laughing men (self-portrait)   
4   4  AACHEN, Hans von       portrait of emperor rudolf ii   

                                        picture data  \
0  1574-88, oil on canvas, 68 x 95 cm, fogg art m...   
1  1605-10, oil on wood, 114 x 130 cm, kunsthisto...   
2  c. 1596, oil on canvas, 53 x 44 cm, private co...   
3  before 1574, oil on panel, 48 x 39 cm, archdio...   
4  1606-08, oil on canvas, 60 x 48 cm, kunsthisto...   

                       file info                                       jpg url  
0  1700*1211, true color, 252 kb    https://www.wga.hu/art/a/aachen/adonis.jpg  
1  1370*1168, true color, 212 kb   https://www.wga.hu/art/a/aachen/z_scene.jpg  
2    896*1074, true 

In [6]:
if 'ID' in df.columns:
    df['image_path'] = df['ID'].apply(lambda x: os.path.join(IMAGE_DIR, f"{x}.jpg"))
elif 'id' in df.columns:
    df['image_path'] = df['id'].apply(lambda x: os.path.join(IMAGE_DIR, f"{x}.jpg"))
else:
    raise Exception("No 'ID' or 'id' column found for image filenames!")

# Filter to keep only rows with existing images
df = df[df['image_path'].apply(os.path.exists)].reset_index(drop=True)
print(f"Records with existing images: {len(df)}")


Records with existing images: 45600


In [7]:
def parse_picture_data(picture_data_str):
    """Parse the picture data column into structured information"""
    info = {
        'year_range': 'Unknown',
        'medium': 'Unknown',
        'dimensions': 'Unknown',
        'museum': 'Unknown',
        'location': 'Unknown'
    }
    
    if pd.isna(picture_data_str) or picture_data_str == '':
        return info
    
    try:
        # Extract year range (e.g., "1574-88", "c. 1596", "1605-10")
        year_match = re.search(r'(?:c\.\s*)?(\d{4}(?:-\d{2,4})?)', str(picture_data_str))
        if year_match:
            info['year_range'] = year_match.group(1)
        
        # Extract medium (e.g., "oil on canvas", "oil on wood")
        medium_match = re.search(r'(oil on (?:canvas|wood|panel)|tempera|fresco|watercolor|acrylic)', str(picture_data_str).lower())
        if medium_match:
            info['medium'] = medium_match.group(1)
        
        # Extract dimensions (e.g., "68 x 95 cm", "114 x 130 cm")
        dim_match = re.search(r'(\d+\s*x\s*\d+(?:\s*x\s*\d+)?\s*cm)', str(picture_data_str))
        if dim_match:
            info['dimensions'] = dim_match.group(1)
        
        # Extract museum/collection info
        parts = str(picture_data_str).split(',')
        if len(parts) >= 3:
            for part in parts[2:]:
                part = part.strip().lower()
                if any(keyword in part for keyword in ['museum', 'gallery', 'collection', 'church', 'palace', 'institute']):
                    info['museum'] = part.title()
                    break
        
        # Extract location (usually last part)
        if len(parts) >= 2:
            location = parts[-1].strip()
            if len(location) > 2:
                info['location'] = location.title()
                
    except Exception as e:
        print(f"Error parsing: {picture_data_str}, Error: {e}")
    
    return info

In [8]:
if 'picture data' in df.columns:
    print("Parsing picture data...")
    df['parsed_data'] = df['picture data'].apply(parse_picture_data)
    
    # Extract individual fields
    df['year_range'] = df['parsed_data'].apply(lambda x: x['year_range'])
    df['medium'] = df['parsed_data'].apply(lambda x: x['medium'])
    df['dimensions'] = df['parsed_data'].apply(lambda x: x['dimensions'])
    df['museum'] = df['parsed_data'].apply(lambda x: x['museum'])
    df['location'] = df['parsed_data'].apply(lambda x: x['location'])
    
    print("Picture data parsing complete!")
    print(f"\nYear ranges found: {df['year_range'].value_counts().head()}")
    print(f"\nMediums found: {df['medium'].value_counts().head()}")
else:
    # Create dummy columns if picture data doesn't exist
    df['year_range'] = 'Unknown'
    df['medium'] = 'Unknown'
    df['dimensions'] = 'Unknown'
    df['museum'] = 'Unknown'
    df['location'] = 'Unknown'

Parsing picture data...
Picture data parsing complete!

Year ranges found: year_range
Unknown    4757
1650        352
1660        292
1630        276
1304-06     263
Name: count, dtype: int64

Mediums found: medium
Unknown          18408
oil on canvas    14866
fresco            4020
oil on panel      3254
tempera           3121
Name: count, dtype: int64


In [9]:
# Count paintings per artist
artist_counts = df['artist'].value_counts()
print(f"Artists with 3+ paintings: {len(artist_counts[artist_counts >= 3])}")
print(f"Total paintings by these artists: {artist_counts[artist_counts >= 3].sum()}")

# Keep only artists with 3 or more paintings
artists_with_3plus = artist_counts[artist_counts >= 3].index
df_filtered = df[df['artist'].isin(artists_with_3plus)].reset_index(drop=True)

print(f"Filtered dataset size: {len(df_filtered)}")
print(f"Number of unique artists: {df_filtered['artist'].nunique()}")

# Fill missing metadata with 'Unknown'
columns_to_fill = ['artist', 'title', 'period', 'school', 'nationality', 'year_range', 'medium', 'museum', 'location']
for col in columns_to_fill:
    if col in df_filtered.columns:
        df_filtered[col] = df_filtered[col].fillna("Unknown")

print("\nMissing values after filling:")
print(df_filtered[columns_to_fill].isnull().sum())


Artists with 3+ paintings: 3119
Total paintings by these artists: 42135
Filtered dataset size: 42135
Number of unique artists: 3119

Missing values after filling:
artist         0
title          0
period         0
school         0
nationality    0
year_range     0
medium         0
museum         0
location       0
dtype: int64


In [10]:
def encode_labels(df, column):
    le = LabelEncoder()
    labels = le.fit_transform(df[column])
    return labels, le

print("Encoding labels...")
df_filtered['artist_label'], artist_le = encode_labels(df_filtered, 'artist')
df_filtered['title_label'], title_le = encode_labels(df_filtered, 'title')
df_filtered['period_label'], period_le = encode_labels(df_filtered, 'period')
df_filtered['school_label'], school_le = encode_labels(df_filtered, 'school')
df_filtered['nationality_label'], nationality_le = encode_labels(df_filtered, 'nationality')
df_filtered['year_label'], year_le = encode_labels(df_filtered, 'year_range')
df_filtered['medium_label'], medium_le = encode_labels(df_filtered, 'medium')
df_filtered['museum_label'], museum_le = encode_labels(df_filtered, 'museum')
df_filtered['location_label'], location_le = encode_labels(df_filtered, 'location')

print(f"Number of artists: {len(artist_le.classes_)}")
print(f"Number of titles: {len(title_le.classes_)}")
print(f"Number of periods: {len(period_le.classes_)}")
print(f"Number of schools: {len(school_le.classes_)}")
print(f"Number of nationalities: {len(nationality_le.classes_)}")
print(f"Number of year ranges: {len(year_le.classes_)}")
print(f"Number of mediums: {len(medium_le.classes_)}")
print(f"Number of museums: {len(museum_le.classes_)}")
print(f"Number of locations: {len(location_le.classes_)}")


Encoding labels...
Number of artists: 3119
Number of titles: 25903
Number of periods: 12
Number of schools: 11
Number of nationalities: 26
Number of year ranges: 3403
Number of mediums: 7
Number of museums: 599
Number of locations: 1145


In [11]:
class EnhancedArtDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.df = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = row['image_path']
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        labels = {
            'artist': torch.tensor(row['artist_label']),
            'title': torch.tensor(row['title_label']),
            'period': torch.tensor(row['period_label']),
            'school': torch.tensor(row['school_label']),
            'nationality': torch.tensor(row['nationality_label']),
            'year': torch.tensor(row['year_label']),
            'medium': torch.tensor(row['medium_label']),
            'museum': torch.tensor(row['museum_label']),
            'location': torch.tensor(row['location_label'])
        }

        metadata = {
            'title': row['title'],
            'artist_name': row['artist'],
            'period_name': row['period'],
            'school_name': row['school'],
            'nationality_name': row['nationality'],
            'year_name': row['year_range'],
            'medium_name': row['medium'],
            'museum_name': row['museum'],
            'location_name': row['location'],
            'image_path': img_path
        }
        return image, labels, metadata

In [12]:
# Optimized transforms for faster training
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Split dataset
train_df, val_df = train_test_split(
    df_filtered, 
    test_size=0.15, 
    stratify=df_filtered['artist_label'], 
    random_state=42
)

train_dataset = EnhancedArtDataset(train_df, transform=transform)
val_dataset = EnhancedArtDataset(val_df, transform=transform)

# Data loaders with optimizations
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2, pin_memory=True)

print(f"Train samples: {len(train_dataset)}, Val samples: {len(val_dataset)}")


Train samples: 35814, Val samples: 6321


In [13]:
class EnhancedMultiTaskModel(nn.Module):
    def __init__(self, num_artists, num_titles, num_periods, num_schools, num_nationalities, 
                 num_years, num_mediums, num_museums, num_locations):
        super().__init__()
        # Use smaller, faster backbone
        self.backbone = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
        in_features = self.backbone.classifier[1].in_features
        self.backbone.classifier = nn.Identity()
        
        # Shared feature layer for efficiency
        self.shared_features = nn.Sequential(
            nn.Linear(in_features, 512),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        # Classification heads
        self.artist_head = nn.Linear(512, num_artists)
        self.title_head = nn.Linear(512, num_titles)
        self.period_head = nn.Linear(512, num_periods)
        self.school_head = nn.Linear(512, num_schools)
        self.nationality_head = nn.Linear(512, num_nationalities)
        self.year_head = nn.Linear(512, num_years)
        self.medium_head = nn.Linear(512, num_mediums)
        self.museum_head = nn.Linear(512, num_museums)
        self.location_head = nn.Linear(512, num_locations)

    def forward(self, x):
        backbone_features = self.backbone(x)
        shared_features = self.shared_features(backbone_features)
        
        outputs = {
            'artist': self.artist_head(shared_features),
            'title': self.title_head(shared_features),
            'period': self.period_head(shared_features),
            'school': self.school_head(shared_features),
            'nationality': self.nationality_head(shared_features),
            'year': self.year_head(shared_features),
            'medium': self.medium_head(shared_features),
            'museum': self.museum_head(shared_features),
            'location': self.location_head(shared_features),
            'features': shared_features
        }
        return outputs


In [14]:
# Create model
model = EnhancedMultiTaskModel(
    len(artist_le.classes_), len(title_le.classes_), len(period_le.classes_),
    len(school_le.classes_), len(nationality_le.classes_), len(year_le.classes_),
    len(medium_le.classes_), len(museum_le.classes_), len(location_le.classes_)
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)

print(f"Model created with {sum(p.numel() for p in model.parameters()):,} parameters")


Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:00<00:00, 127MB/s] 


Model created with 22,220,845 parameters


In [15]:
def train_epoch(dataloader):
    model.train()
    total_loss = 0
    correct_predictions = {
        'artist': 0, 'title': 0, 'period': 0, 'school': 0, 
        'nationality': 0, 'year': 0, 'medium': 0, 'museum': 0, 'location': 0
    }
    total_samples = 0
    
    for images, labels, _ in tqdm(dataloader, desc="Training"):
        images = images.to(device)
        optimizer.zero_grad()

        outputs = model(images)

        # Calculate losses
        losses = {}
        for key in ['artist', 'title', 'period', 'school', 'nationality', 'year', 'medium', 'museum', 'location']:
            losses[key] = criterion(outputs[key], labels[key].to(device))
            
            # Track accuracy
            preds = outputs[key].argmax(dim=1)
            correct_predictions[key] += (preds == labels[key].to(device)).sum().item()

        # Weighted loss - prioritize artist and title
        total_loss_batch = (
            2.0 * losses['artist'] + 
            1.5 * losses['title'] + 
            0.5 * (losses['period'] + losses['school'] + losses['nationality']) +
            0.3 * (losses['year'] + losses['medium'] + losses['museum'] + losses['location'])
        )
        
        total_loss_batch.backward()
        
        # Gradient clipping for stability
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        total_loss += total_loss_batch.item()
        total_samples += images.size(0)

    # Calculate accuracies
    accuracies = {key: correct/total_samples for key, correct in correct_predictions.items()}
    
    return total_loss / len(dataloader), accuracies

def validate_epoch(dataloader):
    model.eval()
    correct_predictions = {
        'artist': 0, 'title': 0, 'period': 0, 'school': 0, 
        'nationality': 0, 'year': 0, 'medium': 0, 'museum': 0, 'location': 0
    }
    total_samples = 0

    with torch.no_grad():
        for images, labels, _ in tqdm(dataloader, desc="Validating"):
            images = images.to(device)
            outputs = model(images)
            
            for key in correct_predictions.keys():
                preds = outputs[key].argmax(dim=1)
                correct_predictions[key] += (preds == labels[key].to(device)).sum().item()
            
            total_samples += images.size(0)

    accuracies = {key: correct/total_samples for key, correct in correct_predictions.items()}
    return accuracies

In [16]:
print("Starting training...")
epochs = 1

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    train_loss, train_acc = train_epoch(train_loader)
    val_acc = validate_epoch(val_loader)
    
    print(f"Train Loss: {train_loss:.4f}")
    print("Train Accuracies:")
    for key, acc in train_acc.items():
        print(f"  {key.capitalize()}: {acc:.3f}")
    print("Validation Accuracies:")
    for key, acc in val_acc.items():
        print(f"  {key.capitalize()}: {acc:.3f}")

print("Training complete!")

Starting training...

Epoch 1/1


Training: 100%|██████████| 1120/1120 [1:33:15<00:00,  5.00s/it]
Validating: 100%|██████████| 198/198 [03:50<00:00,  1.17s/it]

Train Loss: 33.9791
Train Accuracies:
  Artist: 0.063
  Title: 0.045
  Period: 0.435
  School: 0.877
  Nationality: 0.506
  Year: 0.099
  Medium: 0.677
  Museum: 0.638
  Location: 0.172
Validation Accuracies:
  Artist: 0.098
  Title: 0.056
  Period: 0.525
  School: 0.911
  Nationality: 0.560
  Year: 0.104
  Medium: 0.763
  Museum: 0.665
  Location: 0.199
Training complete!





In [17]:
print("Saving model and encoders...")
torch.save(model.state_dict(), "enhanced_art_model.pth")

# Save all encoders
encoders = {
    'artist': artist_le,
    'title': title_le,
    'period': period_le,
    'school': school_le,
    'nationality': nationality_le,
    'year': year_le,
    'medium': medium_le,
    'museum': museum_le,
    'location': location_le
}

for name, encoder in encoders.items():
    pickle.dump(encoder, open(f'{name}_le.pkl', 'wb'))

# Create mappings for local testing
title_mapping = dict(zip(df_filtered['image_path'], df_filtered['title']))
artist_mapping = dict(zip(df_filtered['image_path'], df_filtered['artist']))

pickle.dump(title_mapping, open('title_mapping.pkl', 'wb'))
pickle.dump(artist_mapping, open('artist_mapping.pkl', 'wb'))

print("Model and encoders saved!")

Saving model and encoders...
Model and encoders saved!


In [18]:
print("Building embedding database...")

def build_embedding_db(dataloader):
    embedding_db = {}
    model.eval()
    with torch.no_grad():
        for images, _, metadata in tqdm(dataloader, desc="Building embeddings"):
            images = images.to(device)
            outputs = model(images)
            features = outputs['features']
            
            # Handle metadata properly
            batch_size = len(images)
            for i in range(batch_size):
                meta_dict = {}
                for key, value_list in metadata.items():
                    meta_dict[key] = value_list[i]
                embedding_db[meta_dict['image_path']] = features[i].cpu().numpy()
    return embedding_db

embedding_db = build_embedding_db(train_loader)
np.save("enhanced_embedding_db.npy", embedding_db)
print(f"Embedding database saved with {len(embedding_db)} entries.")


Building embedding database...


Building embeddings: 100%|██████████| 1120/1120 [20:21<00:00,  1.09s/it]


Embedding database saved with 35796 entries.


In [19]:
def compute_authenticity_score(new_embedding, known_embeddings):
    if len(known_embeddings) == 0:
        return 0
    sims = cosine_similarity(new_embedding.reshape(1, -1), known_embeddings)
    return sims.max()

def predict_enhanced(image_path, threshold=0.75):
    model.eval()
    image = Image.open(image_path).convert("RGB")
    input_tensor = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(input_tensor)
        
        predictions = {}
        encoders_dict = {
            'artist': artist_le, 'title': title_le, 'period': period_le,
            'school': school_le, 'nationality': nationality_le, 'year': year_le,
            'medium': medium_le, 'museum': museum_le, 'location': location_le
        }
        
        for key, encoder in encoders_dict.items():
            probs = torch.softmax(outputs[key], dim=1).cpu().numpy()[0]
            pred_idx = probs.argmax()
            predictions[key] = {
                'name': encoder.classes_[pred_idx],
                'confidence': probs[pred_idx]
            }

        # Context-aware title prediction
        predicted_artist = predictions['artist']['name']
        artist_titles = df_filtered[df_filtered['artist'] == predicted_artist]['title'].unique()
        
        if len(artist_titles) > 0:
            title_probs = torch.softmax(outputs['title'], dim=1).cpu().numpy()[0]
            filtered_scores = []
            for title in artist_titles:
                if title in title_le.classes_:
                    title_idx = list(title_le.classes_).index(title)
                    filtered_scores.append((title, title_probs[title_idx]))
            
            if filtered_scores:
                best_title, best_score = max(filtered_scores, key=lambda x: x[1])
                predictions['title'] = {
                    'name': best_title,
                    'confidence': best_score,
                    'method': 'context_aware'
                }

        # Authenticity check
        known_embeddings = np.array(list(embedding_db.values()))
        new_embedding = outputs['features'].cpu().numpy()
        authenticity_score = compute_authenticity_score(new_embedding, known_embeddings)
        authenticity_msg = "Likely Authentic" if authenticity_score >= threshold else "Suspicious / Possibly Forged"

        print("=" * 60)
        print(f"IMAGE: {image_path}")
        print("=" * 60)
        print(f" TITLE: {predictions['title']['name']}")
        print(f"   Confidence: {predictions['title']['confidence']*100:.1f}%")
        print(f" ARTIST: {predictions['artist']['name']}")
        print(f"   Confidence: {predictions['artist']['confidence']*100:.1f}%")
        print(f" YEAR: {predictions['year']['name']}")
        print(f"   Confidence: {predictions['year']['confidence']*100:.1f}%")
        print(f" MEDIUM: {predictions['medium']['name']}")
        print(f"   Confidence: {predictions['medium']['confidence']*100:.1f}%")
        print(f" SCHOOL: {predictions['school']['name']}")
        print(f"   Confidence: {predictions['school']['confidence']*100:.1f}%")
        print(f" NATIONALITY: {predictions['nationality']['name']}")
        print(f"   Confidence: {predictions['nationality']['confidence']*100:.1f}%")
        print(f" MUSEUM: {predictions['museum']['name']}")
        print(f"   Confidence: {predictions['museum']['confidence']*100:.1f}%")
        print(f" LOCATION: {predictions['location']['name']}")
        print(f"   Confidence: {predictions['location']['confidence']*100:.1f}%")
        print(f" AUTHENTICITY: {authenticity_score:.2f} ({authenticity_msg})")
        print("=" * 60)

print("Setup complete! Use predict_enhanced('image_path') to test the model.")


Setup complete! Use predict_enhanced('image_path') to test the model.


In [30]:
predict_enhanced('/kaggle/input/historic-art/complete/artwork/0.jpg')

IMAGE: /kaggle/input/historic-art/complete/artwork/0.jpg
 TITLE: adoration of the shepherds
   Confidence: 0.8%
 ARTIST: RUBENS, Peter Paul
   Confidence: 1.3%
 YEAR: Unknown
   Confidence: 11.7%
 MEDIUM: oil on canvas
   Confidence: 72.2%
 SCHOOL: painter
   Confidence: 95.9%
 NATIONALITY: Italian
   Confidence: 23.9%
 MUSEUM: Unknown
   Confidence: 33.8%
 LOCATION: Private Collection
   Confidence: 26.3%
 AUTHENTICITY: 1.00 (Likely Authentic)
