In [2]:
#%% [code]
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms, models

def train(model, dataloader, criterion, optimizer, device, num_epochs=10):
    """
    Shared training loop.
    """
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(dataloader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / len(dataloader.dataset)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")
    return model

def filter_dataset(dataset, label_to_exclude):
    """
    Returns a Subset of the dataset that excludes all samples with the given label.
    Assumes the dataset has attributes `classes` (list of class names)
    and `samples` (list of (path, label_index) tuples).
    """
    if not hasattr(dataset, 'classes') or not hasattr(dataset, 'samples'):
        raise AttributeError("Dataset must have 'classes' and 'samples' attributes.")
    
    try:
        # Get the index corresponding to the label to exclude.
        exclude_idx = dataset.classes.index(label_to_exclude)
    except ValueError:
        raise ValueError(f"Label '{label_to_exclude}' not found in dataset classes.")
    
    # Build a list of indices that do NOT have the label to exclude.
    indices = [i for i, (_, label) in enumerate(dataset.samples) if label != exclude_idx]
    print(f"Filtered dataset: kept {len(indices)} out of {len(dataset.samples)} samples (excluded '{label_to_exclude}')")
    return Subset(dataset, indices)



In [5]:
#%% [code]
import os
import tarfile
import requests
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms, models

def download_and_extract(url, dest_dir, extract=True):
    """
    Downloads a file from the given URL into dest_dir.
    If extract=True and the file is a tar archive, it will extract it.
    """
    os.makedirs(dest_dir, exist_ok=True)
    filename = url.split('/')[-1]
    file_path = os.path.join(dest_dir, filename)
    
    if not os.path.exists(file_path):
        print(f"Downloading {filename} ...")
        response = requests.get(url, stream=True)
        response.raise_for_status()  # ensure we notice bad responses
        with open(file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        print("Download complete.")
    else:
        print(f"{filename} already exists at {file_path}.")
    
    if extract and (file_path.endswith('.tar') or file_path.endswith('.tar.gz') or file_path.endswith('.tgz')):
        print("Extracting files...")
        with tarfile.open(file_path, 'r:*') as tar:
            tar.extractall(path=dest_dir)
        print("Extraction complete.")

def download_dataset(dataset_dir):
    """
    Checks if the dataset directory exists and contains data.
    If not, attempts to download and extract a dataset.
    
    NOTE: The full ImageNet dataset cannot be auto-downloaded due to licensing.
          Here, we use Imagenette (a smaller subset of ImageNet) as a placeholder.
          If you have full ImageNet access, place your data in dataset_dir.
    """
    # Check if dataset_dir exists and is non-empty.
    if not os.path.exists(dataset_dir) or len(os.listdir(dataset_dir)) == 0:
        print(f"Dataset not found in '{dataset_dir}'.")
        print("Due to licensing restrictions, the full ImageNet cannot be downloaded automatically.")
        print("Downloading 'Imagenette' (a small ImageNet-like dataset) for demonstration purposes...")
        
        # Define a URL for Imagenette (160px version) from fast.ai’s hosting.
        imagenette_url = "https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz"
        
        # We download to the parent directory of dataset_dir.
        parent_dir = os.path.dirname(dataset_dir.rstrip("/"))
        download_and_extract(imagenette_url, parent_dir, extract=True)
        
        # Imagenette extracts to a folder named "imagenette2-160"
        extracted_dir = os.path.join(parent_dir, "imagenette2-160", "train")
        if os.path.exists(extracted_dir):
            print(f"Moving downloaded dataset to '{dataset_dir}'...")
            os.rename(extracted_dir, dataset_dir)
        else:
            print("Expected extracted folder not found. Please check the extraction output.")
    else:
        print(f"Dataset found in '{dataset_dir}'.")

def train(model, dataloader, criterion, optimizer, device, num_epochs=10):
    """
    Shared training loop.
    """
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(dataloader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / len(dataloader.dataset)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")
    return model

def filter_dataset(dataset, label_to_exclude):
    """
    Returns a Subset of the dataset that excludes all samples with the given label.
    Assumes the dataset has attributes `classes` (list of class names)
    and `samples` (list of (path, label_index) tuples).
    """
    if not hasattr(dataset, 'classes') or not hasattr(dataset, 'samples'):
        raise AttributeError("Dataset must have 'classes' and 'samples' attributes.")
    
    try:
        # Get the index corresponding to the label to exclude.
        exclude_idx = dataset.classes.index(label_to_exclude)
    except ValueError:
        raise ValueError(f"Label '{label_to_exclude}' not found in dataset classes.")
    
    # Build a list of indices that do NOT have the label to exclude.
    indices = [i for i, (_, label) in enumerate(dataset.samples) if label != exclude_idx]
    print(f"Filtered dataset: kept {len(indices)} out of {len(dataset.samples)} samples (excluded '{label_to_exclude}')")
    return Subset(dataset, indices)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Define data transforms – you can add more augmentation as desired.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])

# Set the path to your ImageNet training data.
# For ImageNet (or your dataset), the expected folder structure is:
#   dataset_dir/<class_name>/image.jpg
# For demonstration, if the directory is missing or empty, Imagenette will be downloaded.
dataset_dir = '/n/home04/rrinberg/data_dir/data_to_concept'  # <-- UPDATE this path as needed!
download_dataset(dataset_dir)

# Load the full dataset using ImageFolder.
full_dataset = datasets.ImageFolder(root=dataset_dir, transform=transform)
print("Number of classes in full dataset:", len(full_dataset.classes))

# Hyperparameters – adjust these based on your hardware.
batch_size = 64

# DataLoader for the full dataset.
full_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)

# Create a model (ResNet18 in this example) and update its final layer to match the number of classes.
model_full = models.resnet18(pretrained=False)
num_features = model_full.fc.in_features
model_full.fc = nn.Linear(num_features, len(full_dataset.classes))
model_full = model_full.to(device)

# Define loss and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer_full = optim.SGD(model_full.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)

print("=== Training on the full dataset ===")
model_full = train(model_full, full_loader, criterion, optimizer_full, device, num_epochs=10)

# --- Now create a filtered dataset that excludes a given label ---
label_to_exclude = "leaves"  # <-- change this to the label you wish to exclude

# Note: For Imagenette, class names might differ from ImageNet. Adjust accordingly.
filtered_dataset = filter_dataset(full_dataset, label_to_exclude)
filtered_loader = DataLoader(filtered_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)

# Reinitialize a fresh model for training on the filtered dataset.
model_filtered = models.resnet18(pretrained=False)
model_filtered.fc = nn.Linear(num_features, len(full_dataset.classes))
model_filtered = model_filtered.to(device)
optimizer_filtered = optim.SGD(model_filtered.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)

print(f"\n=== Training on the filtered dataset (excluding '{label_to_exclude}') ===")
model_filtered = train(model_filtered, filtered_loader, criterion, optimizer_filtered, device, num_epochs=10)


Using device: cuda
Dataset not found in '/n/home04/rrinberg/data_dir/data_to_concept'.
Due to licensing restrictions, the full ImageNet cannot be downloaded automatically.
Downloading 'Imagenette' (a small ImageNet-like dataset) for demonstration purposes...
Downloading imagenette2-160.tgz ...
Download complete.
Extracting files...
Extraction complete.
Moving downloaded dataset to '/n/home04/rrinberg/data_dir/data_to_concept'...
Number of classes in full dataset: 10




=== Training on the full dataset ===
Epoch 1/10, Loss: 1.7302
Epoch 2/10, Loss: 1.2176
Epoch 3/10, Loss: 1.0013
Epoch 4/10, Loss: 0.8287
Epoch 5/10, Loss: 0.6628
Epoch 6/10, Loss: 0.5385
Epoch 7/10, Loss: 0.3742
Epoch 8/10, Loss: 0.2866
Epoch 9/10, Loss: 0.1701
Epoch 10/10, Loss: 0.1522


ValueError: Label 'leaves' not found in dataset classes.

In [7]:
full_dataset.classes

['n01440764',
 'n02102040',
 'n02979186',
 'n03000684',
 'n03028079',
 'n03394916',
 'n03417042',
 'n03425413',
 'n03445777',
 'n03888257']

In [None]:

def main():
    # Use GPU if available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    
    # Data transforms – adjust or add augmentation as desired.
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ])
    
    # Path to your ImageNet training data.
    # ImageNet should be organized like: /path/to/imagenet/train/<class_name>/image.jpg
    dataset_dir = '/path/to/imagenet/train'  # <-- UPDATE this path!
    
    # Load the full dataset using ImageFolder.
    full_dataset = datasets.ImageFolder(root=dataset_dir, transform=transform)
    print("Number of classes in full dataset:", len(full_dataset.classes))
    
    # Adjust the batch size based on your GPU memory.
    batch_size = 64
    
    # Create a DataLoader for the full dataset.
    full_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)
    
    # Create a model (ResNet18 in this example) and update its final layer to match the number of classes.
    model_full = models.resnet18(pretrained=False)
    num_features = model_full.fc.in_features
    model_full.fc = nn.Linear(num_features, len(full_dataset.classes))
    model_full = model_full.to(device)
    
    # Define loss and optimizer.
    criterion = nn.CrossEntropyLoss()
    optimizer_full = optim.SGD(model_full.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
    
    print("=== Training on the full ImageNet dataset ===")
    model_full = train(model_full, full_loader, criterion, optimizer_full, device, num_epochs=10)
    
    # --- Now create a filtered dataset that excludes a given label ---
    label_to_exclude = "leaves"  # <-- change this to the label you wish to exclude
    filtered_dataset = filter_dataset(full_dataset, label_to_exclude)
    filtered_loader = DataLoader(filtered_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)
    
    # (Optionally) Reinitialize a fresh model for training on the filtered dataset.
    model_filtered = models.resnet18(pretrained=False)
    model_filtered.fc = nn.Linear(num_features, len(full_dataset.classes))
    model_filtered = model_filtered.to(device)
    optimizer_filtered = optim.SGD(model_filtered.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
    
    print(f"\n=== Training on the filtered ImageNet dataset (excluding '{label_to_exclude}') ===")
    model_filtered = train(model_filtered, filtered_loader, criterion, optimizer_filtered, device, num_epochs=10)
    
# Run the main function – in a Jupyter Notebook, simply execute this cell.
main()

# download imagenet 

In [10]:
session = requests.Session()
url = "https://image-net.org/data/imagenet21k_resized.tar.gz"
session.cookies.set('PHPSESSID', '7vg2gnnfseb4fn76ilg9453os1')
response = session.get(url, stream=True)

In [11]:
#%% [code]
import os
import tarfile
import requests
from tqdm import tqdm

def download_file(url, dest_path):
    """
    Download a file from the given URL to the specified destination path.
    Shows a progress bar using tqdm.
    """
    if os.path.exists(dest_path):
        print(f"{dest_path} already exists, skipping download.")
        return
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Raise an error for bad responses
    total_size = int(response.headers.get('content-length', 0))
    block_size = 1024  # 1 KB chunks
    t = tqdm(total=total_size, unit='iB', unit_scale=True, desc=f"Downloading {os.path.basename(dest_path)}")
    with open(dest_path, 'wb') as f:
        for data in response.iter_content(block_size):
            f.write(data)
            t.update(len(data))
    t.close()
    if total_size != 0 and t.n != total_size:
        print("ERROR, something went wrong during the download!")
    else:
        print(f"Downloaded {dest_path}.")

def extract_tarfile(file_path, extract_to):
    """
    Extracts a tar (or tar.gz) archive to the specified directory.
    """
    print(f"Extracting {file_path} to {extract_to}...")
    with tarfile.open(file_path, 'r:*') as tar:
        tar.extractall(path=extract_to)
    print("Extraction complete.")

def download_imagenet(train_url, val_url, dest_dir):
    """
    Downloads and extracts the ImageNet training and validation datasets.
    """
    os.makedirs(dest_dir, exist_ok=True)
    
    # Define paths for the tar files
    train_tar = os.path.join(dest_dir, os.path.basename(train_url))
    val_tar = os.path.join(dest_dir, os.path.basename(val_url))
    
    # Download training set
    print("Starting download of the ImageNet training set...")
    download_file(train_url, train_tar)
    
    # Download validation set
    print("Starting download of the ImageNet validation set...")
    download_file(val_url, val_tar)
    
    # Extract the tar files if not already extracted.
    # (The extraction destination may vary depending on how you want to organize your data.)
    train_extract_path = os.path.join(dest_dir, "ILSVRC2012_img_train")
    val_extract_path = os.path.join(dest_dir, "ILSVRC2012_img_val")
    
    if not os.path.exists(train_extract_path):
        extract_tarfile(train_tar, dest_dir)
    else:
        print(f"Training set already extracted to {train_extract_path}.")
    
    if not os.path.exists(val_extract_path):
        extract_tarfile(val_tar, dest_dir)
    else:
        print(f"Validation set already extracted to {val_extract_path}.")

# Replace these URLs with the official download links you received from ImageNet.
# The URLs below are the ones historically provided for ImageNet 2012.
train_url = "http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_train.tar"
val_url = "http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar"

train_url = "https://image-net.org/data/imagenet21k_resized.tar.gz"
# Update dest_dir to where you want to store the dataset.
dest_dir = "/n/home04/rrinberg/data_dir/data_to_concept/image_net"  # <-- CHANGE this to your desired path

# Uncomment the line below to start downloading (note: this may take a long time!).
download_imagenet(train_url, val_url, dest_dir)


Starting download of the ImageNet training set...


HTTPError: 404 Client Error: Not Found for url: https://image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_train.tar