In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path


# --- Input: load the finalized metadata table, keeping BritishCarabids rows only ---
file_dir = "/users/PAS2136/rayees/CV4A - Domain Adaptation"
file_path = f"{file_dir}/DataFinalized.csv"

df = pd.read_csv(file_path).loc[lambda table: table["Dataset"] == "BritishCarabids"]

# Per-species image counts; value_counts() orders them from most to least frequent.
species_counts = df["ScientificName"].value_counts()
print(f"Total number of unique species: {len(species_counts)}")
print(f"Total number of images: {len(df)}")

# --- Output: every probing subset CSV is written below this directory ---
outdir = "/users/PAS2136/rayees/CV4A - Efficient Probing"
output_dir = Path(outdir) / "ProbingSubsets"
output_dir.mkdir(parents=True, exist_ok=True)


def create_balanced_sample(df, samples_per_species, species_counts):
    """Build a subset holding at most ``samples_per_species`` rows per species.

    Args:
        df: DataFrame with a 'ScientificName' column.
        samples_per_species: Number of rows requested for each species.
        species_counts: Series whose index lists the species to include
            (only the index is read here, not the counts).

    Returns:
        A DataFrame with the selected rows, keeping their original index
        labels. Species with more rows than requested are randomly
        down-sampled with a fixed seed (random_state=42); all other species
        contribute every row they have. An empty DataFrame is returned when
        ``species_counts`` lists no species.
    """
    # Collect the per-species pieces and concatenate once at the end instead
    # of re-copying a growing frame on every iteration.
    pieces = []

    for species in species_counts.index:
        species_df = df[df['ScientificName'] == species]
        available = len(species_df)

        if available <= samples_per_species:
            # Nothing to sample: keep every row of this species.
            sampled = species_df
            # Warn only on a genuine shortfall; having exactly the requested
            # number of rows is not a problem.
            if available < samples_per_species:
                print(f"WARNING: Species {species} has only {available} samples (requested {samples_per_species})")
        else:
            sampled = species_df.sample(samples_per_species, random_state=42)

        pieces.append(sampled)

    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces)
    

# Balanced subsets: up to 10, 20 and 50 images per species.
# The 10-per-species subset gets its own name so that neither the variable
# nor the printed label claims to be the 20-per-species subset.
df_10 = create_balanced_sample(df, 10, species_counts)
print(f"df_10 shape: {df_10.shape}")
df_10.to_csv(output_dir / "probing_10_per_species.csv", index=False)

df_20 = create_balanced_sample(df, 20, species_counts)
print(f"df_20 shape: {df_20.shape}")
df_20.to_csv(output_dir / "probing_20_per_species.csv", index=False)


df_50 = create_balanced_sample(df, 50, species_counts)
print(f"df_50 shape: {df_50.shape}")
df_50.to_csv(output_dir / "probing_50_per_species.csv", index=False)



def create_30k_dataset(df, species_counts, target_size=30000, rare_threshold=100):
    """Build a subset of about ``target_size`` rows that keeps rare species whole.

    Every species with fewer than ``rare_threshold`` rows is included in full.
    The rest of the budget is split evenly across the other ("common")
    species. Any shortfall left by integer division, or by species smaller
    than the even share, is topped up from species that still have unused
    rows, in proportion to their unused capacity.

    Args:
        df: DataFrame with a 'ScientificName' column. Index labels are used
            to tell already-selected rows from unused ones, so they are
            expected to be unique.
        species_counts: Series mapping species name -> number of rows.
        target_size: Desired number of rows in the result.
        rare_threshold: Species with a count strictly below this value are
            kept in full.

    Returns:
        A DataFrame of selected rows with their original index labels. It has
        exactly ``target_size`` rows whenever the rare species alone do not
        exceed the target and enough rows exist in total; otherwise it holds
        whatever could be gathered.
    """
    def _combine(pieces):
        # pd.concat rejects an empty list, so fall back to an empty frame.
        return pd.concat(pieces) if pieces else pd.DataFrame()

    pieces = []            # selected chunks, concatenated once at the end
    picked = {}            # common species -> rows drawn in the even pass
    common_species = []
    selected = 0

    for species, count in species_counts.items():
        if count < rare_threshold:
            species_df = df[df['ScientificName'] == species]
            pieces.append(species_df)
            selected += len(species_df)
            print(f"Added all {count} samples from rare species: {species}")
        else:
            common_species.append(species)

    remaining_needed = target_size - selected

    if remaining_needed <= 0:
        print(f"WARNING: Already exceeded target size with rare species alone: {selected}")
        return _combine(pieces)

    if not common_species:
        # No species left to draw from: return what we have rather than
        # dividing by zero below.
        print(f"WARNING: No common species available; returning {selected} samples")
        return _combine(pieces)

    samples_per_species = remaining_needed // len(common_species)
    print(f"Taking {samples_per_species} samples from each of the {len(common_species)} common species")

    for species in common_species:
        species_df = df[df['ScientificName'] == species]
        sampled = species_df.sample(min(samples_per_species, len(species_df)), random_state=42)
        picked[species] = sampled
        pieces.append(sampled)
        selected += len(sampled)

    still_needed = target_size - selected
    if still_needed > 0:
        print(f"Still need {still_needed} more samples to reach target")

        extra_capacity_species = []
        for species in common_species:
            spare = int(species_counts[species]) - len(picked[species])
            if spare > 0:
                extra_capacity_species.append((species, spare))

        # Largest spare capacity first.
        extra_capacity_species.sort(key=lambda item: item[1], reverse=True)
        total_extra_capacity = sum(spare for _, spare in extra_capacity_species)

        # Shares are computed against the shortfall as it was BEFORE the
        # top-up started. Dividing the shrinking `still_needed` by the fixed
        # total capacity would hand later species too small a share and leave
        # the result short of the target.
        shortfall = still_needed
        for species, capacity in extra_capacity_species:
            if still_needed <= 0:
                break

            # Integer ceiling of capacity * shortfall / total_extra_capacity.
            share = -(-capacity * shortfall // total_extra_capacity)
            species_df = df[df['ScientificName'] == species]
            available = species_df[~species_df.index.isin(picked[species].index)]
            to_take = min(share, capacity, still_needed, len(available))
            if to_take > 0:
                pieces.append(available.sample(to_take, random_state=42))
                still_needed -= to_take

    return _combine(pieces)




# ~30K-image subset: build it, report its shape, then save it.
df_30k = create_30k_dataset(df, species_counts)
path_30k = output_dir / "probing_30k_images.csv"
print(f"df_30k shape: {df_30k.shape}")
df_30k.to_csv(path_30k, index=False)

# Full dataset: an independent copy of the filtered table, saved next to the subsets.
df_full = df.copy()
path_full = output_dir / "probing_full_dataset.csv"
print(f"df_full shape: {df_full.shape}")
df_full.to_csv(path_full, index=False)


def print_species_distribution(subset_df, name):
    """Print min / max / mean rows-per-species for ``subset_df``.

    Args:
        subset_df: DataFrame with a 'ScientificName' column.
        name: Label used in the printed heading.
    """
    per_species = subset_df['ScientificName'].value_counts()
    smallest = per_species.min()
    largest = per_species.max()
    average = per_species.mean()

    report_lines = (
        f"\n{name} Species Distribution:",
        f"- Min samples per species: {smallest}",
        f"- Max samples per species: {largest}",
        f"- Mean samples per species: {average:.2f}",
    )
    for line in report_lines:
        print(line)

# Per-species distribution summary for each exported subset.
for subset, label in (
    (df_20, "20 per species"),
    (df_50, "50 per species"),
    (df_30k, "30K dataset"),
    (df_full, "Full dataset"),
):
    print_species_distribution(subset, label)


# Overlap between subsets, measured as shared index labels of the source table.
ids_20 = set(df_20.index)
ids_50 = set(df_50.index)
ids_30k = set(df_30k.index)

print("\nOverlap Statistics:")
print(f"Images in both df_20 and df_50: {len(ids_20 & ids_50)}")
print(f"Images in both df_20 and df_30k: {len(ids_20 & ids_30k)}")
print(f"Images in both df_50 and df_30k: {len(ids_50 & ids_30k)}")

print("\nProbing subsets created successfully!")

Total number of unique species: 290
Total number of images: 63077
df_20 shape: (2900, 4)
df_20 shape: (5800, 4)
df_50 shape: (14500, 4)
Added all 95 samples from rare species: Ophonus puncticeps
Added all 94 samples from rare species: Miscodera arctica
Added all 93 samples from rare species: Agonum nigrum
Added all 92 samples from rare species: Harpalus neglectus
Added all 91 samples from rare species: Harpalus servus
Added all 91 samples from rare species: Acupalpus flavicollis
Added all 90 samples from rare species: Pterostichus cristatus
Added all 90 samples from rare species: Amara consularis
Added all 90 samples from rare species: Bembidion testaceum
Added all 89 samples from rare species: Laemostenus complanatus
Added all 87 samples from rare species: Trechus fulvus
Added all 86 samples from rare species: Carabus glabratus
Added all 86 samples from rare species: Acupalpus brunnipes
Added all 83 samples from rare species: Calathus ambiguus
Added all 82 samples from rare species: E

In [2]:
# Function to create ratio-preserving samples
def create_ratio_preserving_sample(df, target_size, species_counts):
    """Draw ``target_size`` rows while keeping each species' share of ``df``.

    Each species contributes ``round(count * target_size / len(df))`` rows,
    but never fewer than one and never more than it has. Rounding can leave
    the total slightly off target: a surplus is trimmed by randomly
    down-sampling the combined result, and a deficit is filled by adding one
    extra row to the species with the most unused rows.

    Args:
        df: DataFrame with a 'ScientificName' column. Index labels are used
            to tell already-selected rows from unused ones, so they are
            expected to be unique.
        target_size: Desired number of rows.
        species_counts: Series mapping species name -> number of rows in df.

    Returns:
        A DataFrame of selected rows with their original index labels. All
        sampling uses random_state=42. An empty ``df`` yields an empty copy.
    """
    total_images = len(df)
    if total_images == 0:
        # Nothing to sample from; also avoids dividing by zero below.
        return df.copy()

    sampling_fraction = target_size / total_images

    pieces = []   # per-species samples, concatenated once
    taken = {}    # species -> rows drawn in the proportional pass
    for species, count in species_counts.items():
        species_df = df[df['ScientificName'] == species]

        # Proportional share, at least one row, never more than available.
        samples_to_take = min(max(1, int(np.round(count * sampling_fraction))), count)

        sampled = species_df.sample(samples_to_take, random_state=42)
        taken[species] = sampled
        pieces.append(sampled)

    result_df = pd.concat(pieces) if pieces else pd.DataFrame()

    if len(result_df) > target_size:
        # Rounding overshoot: drop random rows to hit the target exactly.
        return result_df.sample(target_size, random_state=42)

    remaining_needed = target_size - len(result_df)
    if remaining_needed <= 0:
        return result_df

    # Rounding undershoot: find the species that still have unused rows.
    additional_capacity = {}
    for species in species_counts.index:
        spare = species_counts[species] - len(taken[species])
        if spare > 0:
            additional_capacity[species] = spare

    # The original weight, (spare / count) * count / total_images, reduces to
    # spare / total_images, so ordering by spare capacity is equivalent and
    # needs no global `total_images`.
    sorted_species = sorted(additional_capacity,
                            key=additional_capacity.get,
                            reverse=True)

    # Add one extra row per species, largest spare capacity first.
    extras = []
    for species in sorted_species:
        if remaining_needed <= 0:
            break

        species_df = df[df['ScientificName'] == species]
        available = species_df[~species_df.index.isin(taken[species].index)]
        if len(available) > 0:
            extras.append(available.sample(1, random_state=42))
            remaining_needed -= 1

    if extras:
        result_df = pd.concat([result_df, *extras])

    return result_df

# Build, save and summarize one ratio-preserving sample per target size.
target_sizes = [2900, 5800, 14500]

# Species proportions of the source table; identical for every target size.
original_proportions = species_counts / species_counts.sum()

for size in target_sizes:
    ratio_df = create_ratio_preserving_sample(df, size, species_counts)
    print(f"Created ratio-preserving sample of size {len(ratio_df)}")
    ratio_df.to_csv(output_dir / f"probing_{size}_ratio.csv", index=False)

    # Distribution statistics of the sample
    ratio_counts = ratio_df['ScientificName'].value_counts()
    print(f"\nProbing {size} Ratio Distribution:")
    print(f"- Number of species: {len(ratio_counts)}")
    print(f"- Min samples per species: {ratio_counts.min()}")
    print(f"- Max samples per species: {ratio_counts.max()}")
    print(f"- Mean samples per species: {ratio_counts.mean():.2f}")

    # Ratio check: correlation between source and sample proportions
    # (Series.corr aligns the two Series on species name).
    sample_proportions = ratio_counts / ratio_counts.sum()
    correlation = original_proportions.corr(sample_proportions)
    print(f"- Correlation with original distribution: {correlation:.4f}")

    # The five most frequent species in this sample
    print("- Top 5 species in sample:")
    for species, count in ratio_counts.iloc[:5].items():
        original = species_counts[species]
        print(f"  {species}: {count} samples ({count/original:.2%} of original)")

print("\nRatio-preserving probing subsets created successfully!")

Created ratio-preserving sample of size 2900

Probing 2900 Ratio Distribution:
- Number of species: 290
- Min samples per species: 2
- Max samples per species: 41
- Mean samples per species: 10.00
- Correlation with original distribution: 0.9989
- Top 5 species in sample:
  Notiophilus biguttatus: 41 samples (4.62% of original)
  Bembidion tetracolum: 40 samples (4.65% of original)
  Pterostichus strenuus: 38 samples (4.65% of original)
  Ophonus rufibarbis: 33 samples (4.59% of original)
  Bembidion lampros: 32 samples (4.66% of original)
Created ratio-preserving sample of size 5800

Probing 5800 Ratio Distribution:
- Number of species: 290
- Min samples per species: 5
- Max samples per species: 82
- Mean samples per species: 20.00
- Correlation with original distribution: 0.9998
- Top 5 species in sample:
  Notiophilus biguttatus: 82 samples (9.23% of original)
  Bembidion tetracolum: 79 samples (9.18% of original)
  Pterostichus strenuus: 75 samples (9.17% of original)
  Ophonus ruf