In [1]:
import ollama
import os
from tqdm import tqdm
import json
import argparse
import wandb
import pandas as pd
import sys
from PIL import Image
from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, CLIPModel 
from torch.nn.functional import cosine_similarity
import torch
import signal
import torch.nn.functional as F
from scipy.spatial.distance import cosine

In [14]:
def get_image_urls(input_path, dataset_name, split):
    # Define the paths to the train and test CSVs
    csv_path = os.path.join(input_path, f'{dataset_name}_{split}_labels.csv')

    # Initialize a list to store image URLs

    # Function to read CSV and extract image URLs (excluding "no image" labels)
    
    try:
        # Read CSV file into a DataFrame
        df = pd.read_csv(csv_path, on_bad_lines='skip')
        
        # Check if the necessary columns exist
        if 'url' not in df.columns or 'label' not in df.columns:
            print(f"CSV file {csv_path} does not contain required columns.")
            return []

        # Filter rows where label is not "no image"
        for i in range(len(df['label'])):
    
            if df['label'].loc[i] == 'positive':
                
                df['label'].loc[i] = 'Yes' 
        
            elif df['label'].loc[i] == 'negative':
        
                df['label'].loc[i] = 'No' 
        
        valid_urls_and_labels = df[df['label'] != 'no image'].to_dict()

    except Exception as e:
        print(f"Error reading {csv_path}: {e}")

    # Extract image URLs from both train and test datasets
    

    return valid_urls_and_labels

In [21]:
data = {}
base_dir = "/home/macula/SMATousi/CVPR/ViGIR_CVPR_LLM/datasets/agile_modeling/" #args.base_dir #
dataset_name = "arts-and-crafts"
subset = 'train'
raw_data = get_image_urls(base_dir, dataset_name, subset)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['label'].loc[i] = 'Yes'
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, beca

In [23]:
csv_path = os.path.join(base_dir, f'{dataset_name}_{subset}_labels.csv')

# Initialize a list to store image URLs

# Function to read CSV and extract image URLs (excluding "no image" labels)


# Read CSV file into a DataFrame
raw_data = pd.read_csv(csv_path, on_bad_lines='skip')

In [24]:
raw_data

Unnamed: 0,url,label
0,https://img.wonderhowto.com/img/66/91/63475250...,positive
1,http://resize.over-blog.com/400x260-ct.jpg?htt...,positive
2,https://thumbs.dreamstime.com/t/rainbow-paint-...,negative
3,https://i.pinimg.com/236x/85/24/31/852431ee014...,positive
4,https://thumbs.dreamstime.com/t/knitting-11322...,positive
...,...,...
2941,https://chrysalis.com.au/images/Matoulas-Boat-...,positive
2942,https://cdn.shopify.com/s/files/1/2645/2620/ar...,positive
2943,https://t2.ftcdn.net/jpg/00/61/86/41/160_F_618...,positive
2944,https://i.imgur.com/hTRH1Mn.jpg,positive


In [16]:
all_urls = raw_data['url']
all_labels = raw_data['label']

data = {}

for index, url in all_urls.items():

    # print(url, index)
    # print(all_labels[index])
    
    # break
    # class_id = d
    class_name = all_labels[index]
    image_index = index
    image_url = url
    
    data[image_index] = {"label" : class_name, "url": image_url}
    

In [27]:
raw_data['url'].keys()

RangeIndex(start=0, stop=2946, step=1)

In [34]:
import os
import requests
from tqdm import tqdm

def download_images_by_label(csv_path, no_label_name, output_dir="downloaded_images"):
    """
    Downloads images from a CSV file based on a given label.
    
    Parameters:
        csv_path (str): Path to the CSV file.
        label_name (str): The label to filter images.
        output_dir (str): Directory to save the downloaded images.
    
    Returns:
        list: A list of dictionaries containing index, image name, and local path.
    """
    # Load CSV file
    df = pd.read_csv(csv_path, on_bad_lines='skip')
    
    # Filter data based on label name
    filtered_df = df[df["label"] != no_label_name]
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    image_list = {}
    
    
    for idx in tqdm(filtered_df['url'].keys(), total=len(filtered_df), desc="Downloading Images"):
        image_url = filtered_df["url"][idx]
        image_label_pn = filtered_df["label"][idx]
        if image_label_pn == 'positive':
            image_label = 'Yes'
        elif image_label_pn == 'negative':
            image_label = 'No'
            
        image_name = f"image_{idx}.jpg"
        image_path = os.path.join(output_dir, image_name)
    
        try:
            response = requests.get(image_url, timeout=10)
            response.raise_for_status()
    
            with open(image_path, "wb") as img_file:
                img_file.write(response.content)
    
            image_list[idx] = {"name": image_name, "path": image_path, "url": image_url, "label": image_label}
    
        except requests.RequestException as e:
            print(f"Failed to download {image_url}: {e}")
        # break

    return image_list

# Example usage:
# image_data = download_images_by_label(csv_file_path, "positive")
# print(image_data[:5])  # Preview the first few downloaded image paths



In [44]:
csv_path = "../datasets/agile_modeling/arts-and-crafts_test_labels.csv"
no_label_name = "no image"
output_dir = "../../results/agile_datasets/arts/"

"""
Downloads images from a CSV file based on a given label.

Parameters:
    csv_path (str): Path to the CSV file.
    label_name (str): The label to filter images.
    output_dir (str): Directory to save the downloaded images.

Returns:
    list: A list of dictionaries containing index, image name, and local path.
"""
# Load CSV file
df = pd.read_csv(csv_path, on_bad_lines='skip')

# Filter data based on label name
filtered_df = df[df["label"] != no_label_name]

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

image_list = {}


for idx in tqdm(filtered_df['url'].keys(), total=len(filtered_df), desc="Downloading Images"):
    image_url = filtered_df["url"][idx]
    image_label_pn = filtered_df["label"][idx]
    if image_label_pn == 'positive':
        image_label = 'Yes'
    elif image_label_pn == 'negative':
        image_label = 'No'
        
    image_name = f"image_{idx}.jpg"
    image_path = os.path.join(output_dir, image_name)

    try:
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()

        with open(image_path, "wb") as img_file:
            img_file.write(response.content)

        image_list[idx] = {"name": image_name, "path": image_path, "url": image_url, "label": image_label}

    except requests.RequestException as e:
        print(f"Failed to download {image_url}: {e}")
    # break

Downloading Images:   0%|▏                                                                                       | 1/688 [00:00<01:52,  6.09it/s]

Failed to download https://cdn.xxl.thumbs.canstockphoto.com/canstock20972264.jpg: HTTPSConnectionPool(host='cdn.xxl.thumbs.canstockphoto.com', port=443): Max retries exceeded with url: /canstock20972264.jpg (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1000)')))


Downloading Images:   1%|▋                                                                                       | 5/688 [00:01<04:14,  2.68it/s]


KeyboardInterrupt: 

In [36]:
image_data = download_images_by_label(csv_path, no_label_name, output_dir)

Downloading Images:   0%|▏                                                                                       | 1/688 [00:00<02:04,  5.53it/s]

Failed to download https://cdn.xxl.thumbs.canstockphoto.com/canstock20972264.jpg: HTTPSConnectionPool(host='cdn.xxl.thumbs.canstockphoto.com', port=443): Max retries exceeded with url: /canstock20972264.jpg (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1000)')))


Downloading Images:   1%|▋                                                                                       | 5/688 [00:02<05:22,  2.12it/s]


KeyboardInterrupt: 

In [49]:
image_list[1]

{'name': 'image_1.jpg',
 'path': '../../results/agile_datasets/arts/image_1.jpg',
 'url': 'https://cdn.shopify.com/s/files/1/0915/9386/products/Soap_succulents_handmade_by_Sunbasil_Soap_1_fd18f63e-b744-4007-90e4-58b06770dd27_compact.jpg?v=1573419230',
 'label': 'No'}