In [27]:
from pinscrape import scraper, Pinterest

def using_search_engine(keyword, output_folder, images_to_download, proxies=None, number_of_workers=10):
    """
    Runs pinscrape's ``scraper.scrape`` for a keyword and prints a summary of the result.
    Args:
        keyword (str): Search phrase forwarded to ``scraper.scrape``.
        output_folder (str): Folder forwarded to ``scraper.scrape`` (presumably the download target).
        images_to_download (int): Requested image count, forwarded to ``scraper.scrape``.
        proxies (dict, optional): Proxy settings forwarded to ``scraper.scrape``.
            When omitted, a new empty dict is used for each call.
        number_of_workers (int): Worker count forwarded to ``scraper.scrape``.
    """
    # A literal `{}` default is built once and shared by every call (and by the
    # third-party code it is handed to); create a fresh dict per call instead.
    if proxies is None:
        proxies = {}

    details = scraper.scrape(keyword, output_folder, proxies, number_of_workers, images_to_download)
    if details["isDownloaded"]:
        print("\nDownloading completed !!")
        print(f"\nTotal urls found: {len(details['extracted_urls'])}")
        print(f"\nTotal images downloaded (including duplicate images): {len(details['urls_list'])}")
        print(details)
    else:
        print("\nNothing to download !!", details)



In [28]:

def using_pinterest_apis(keyword, output_folder, images_to_download, proxies=None, number_of_workers=20):
    """
    Searches Pinterest through pinscrape's ``Pinterest`` client and downloads the results.
    Args:
        keyword (str): Search phrase passed to ``Pinterest.search``.
        output_folder (str): Folder passed to ``Pinterest.download`` as ``output_folder``.
        images_to_download (int): Second argument to ``Pinterest.search``
            (presumably the number of image URLs to fetch).
        proxies (dict, optional): Proxy settings passed to the ``Pinterest`` constructor.
            When omitted, a new empty dict is used for each call.
        number_of_workers (int): Worker count passed to ``Pinterest.download``.
    """
    # A literal `{}` default is built once and shared by every call (and by the
    # client object it is handed to); create a fresh dict per call instead.
    if proxies is None:
        proxies = {}

    p = Pinterest(proxies=proxies)
    images_url = p.search(keyword, images_to_download)
    p.download(url_list=images_url, number_of_workers=number_of_workers, output_folder=output_folder)


In [29]:
import os

# Folder/search labels: one entry per season name and year
season_tuple = (
    'Spring 2020', 'Summer 2020', 'Fall 2020', 'Autumn 2020', 'Winter 2020',
    'Spring 2021', 'Summer 2021', 'Fall 2021', 'Autumn 2021', 'Winter 2021',
    'Spring 2022', 'Summer 2022', 'Fall 2022', 'Autumn 2022', 'Winter 2022',
    'Spring 2023', 'Summer 2023', 'Fall 2023', 'Autumn 2023', 'Winter 2023',
    'Spring 2024', 'Summer 2024', 'Fall 2024', 'Autumn 2024', 'Winter 2024',
)

# Number of images to download
images_to_download = 150

# Download one batch of images per season label
for season in season_tuple:
    keyword = f"casual dresses women {season}"
    # Build the path with os.path.join rather than a hard-coded backslash: the
    # old f"images\{...}" form relied on an invalid escape sequence and, outside
    # Windows, produced a single folder literally named "images\<label>" that
    # the os.path.join-based lookups elsewhere in this notebook cannot find.
    output_folder = os.path.join("images", season)
    using_pinterest_apis(keyword, output_folder, images_to_download)




In [33]:
import os
import torch
import pandas as pd
from pathlib import Path
from hashlib import md5
from collections import defaultdict

def process_folder_with_yolo(folder_path, output_path_1, output_path_2, model):
    """
    Sorts the images of one folder into two output folders based on person detection.

    Each image file is passed to ``model``; the file is then moved (not copied)
    into ``output_path_1`` when at least one detection is named 'person', and
    into ``output_path_2`` otherwise. Non-image files are left in place.
    Args:
        folder_path (str): Path to the folder containing images.
        output_path_1 (str): Destination for images with a 'person' detection.
        output_path_2 (str): Destination for images without one.
        model: Callable taking a file path; its result must provide
            ``.pandas().xyxy[0]`` as a DataFrame with a 'name' column.
    """
    # Both destinations must exist before any file is moved
    for destination in (output_path_1, output_path_2):
        os.makedirs(destination, exist_ok=True)

    image_suffixes = ('.jpg', '.jpeg', '.png')

    for entry in os.listdir(folder_path):
        # Anything that is not an image (by extension, case-insensitive) is skipped
        if not entry.lower().endswith(image_suffixes):
            continue

        source = os.path.join(folder_path, entry)

        # First element of the per-image detection tables
        found = model(source).pandas().xyxy[0]
        has_person = bool((found['name'] == 'person').any())

        # Move the file into the bucket matching the detection outcome
        target_dir = output_path_1 if has_person else output_path_2
        os.rename(source, os.path.join(target_dir, entry))

def remove_seasonal_duplicates(base_path, season_tuple):
    """
    Deletes byte-identical image files that occur more than once within a season.

    Folder names are grouped by their first whitespace-separated word (e.g.
    'Spring 2020' and 'Spring 2021' both belong to 'Spring'). Within a group,
    the first file seen with a given MD5 digest is kept and every later file
    with the same digest is removed from disk.
    Args:
        base_path (str): Path to the base directory containing season folders.
        season_tuple (tuple): Names of all subfolders to inspect.
    Returns:
        int: Number of files removed.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')

    # Map season word -> folder names sharing it, in the order they were given
    folders_by_season = defaultdict(list)
    for name in season_tuple:
        folders_by_season[name.split()[0]].append(name)

    removed = 0
    for season_name, members in folders_by_season.items():
        print(f"Checking for duplicates within season: {season_name}")
        # digest -> path of the first file that produced it (reset per season)
        first_seen = {}

        for member in members:
            directory = os.path.join(base_path, member)
            if not os.path.exists(directory):
                continue

            for entry in os.listdir(directory):
                if not entry.lower().endswith(image_suffixes):
                    continue

                candidate = os.path.join(directory, entry)
                with open(candidate, 'rb') as handle:
                    digest = md5(handle.read()).hexdigest()

                if digest not in first_seen:
                    first_seen[digest] = candidate
                    continue

                print(f"Duplicate found: {candidate} matches {first_seen[digest]}. Removing.")
                os.remove(candidate)
                removed += 1
                # Log duplicate removal
                print(f"Duplicate count updated: {removed}")

    return removed

def process_all_folders(base_path, season_tuple, images_1_path, images_2_path, model):
    """
    Runs the per-folder YOLO sorting step on every listed subfolder that exists.

    For each existing subfolder, the matching output folders are the subfolder
    name joined onto ``images_1_path`` and ``images_2_path``.
    Args:
        base_path (str): Path to the base directory containing folders.
        season_tuple (tuple): Names of all subfolders to process.
        images_1_path (str): Root of the output directory for images_1.
        images_2_path (str): Root of the output directory for images_2.
        model: Detection model handed through to process_folder_with_yolo.
    """
    # Lazily keep only the names whose source folder is present on disk
    present = (
        name for name in season_tuple
        if os.path.exists(os.path.join(base_path, name))
    )

    for name in present:
        print(f"Processing folder: {name}")
        process_folder_with_yolo(
            os.path.join(base_path, name),
            os.path.join(images_1_path, name),
            os.path.join(images_2_path, name),
            model,
        )

def calculate_distribution(base_path, season_tuple, images_1_path, images_2_path, duplicates_count):
    """
    Builds a per-folder table of how many entries ended up in each output tree.

    One row is produced per name in ``season_tuple`` plus a final
    "All Folders" row with the column sums and the duplicate count. A folder
    that does not exist counts as 0. Every directory entry is counted, not
    only image files.
    Args:
        base_path (str): Path to the base directory containing folders (not used here).
        season_tuple (tuple): Names of all subfolders.
        images_1_path (str): Root of the output directory for images_1.
        images_2_path (str): Root of the output directory for images_2.
        duplicates_count (int): Number of duplicate images removed.
    Returns:
        pandas.DataFrame: The distribution table.
    """
    def entry_count(directory):
        # Number of directory entries, or 0 when the directory is absent
        return len(os.listdir(directory)) if os.path.exists(directory) else 0

    rows = []
    for name in season_tuple:
        count_1 = entry_count(os.path.join(images_1_path, name))
        count_2 = entry_count(os.path.join(images_2_path, name))
        rows.append({
            "Folder": name,
            "Images_1": count_1,
            "Images_2": count_2,
            "Total": count_1 + count_2
        })

    # Column sums over the per-folder rows, computed before the summary row is added
    grand_1 = sum(row["Images_1"] for row in rows)
    grand_2 = sum(row["Images_2"] for row in rows)

    rows.append({
        "Folder": "All Folders",
        "Images_1": grand_1,
        "Images_2": grand_2,
        "Deleted Duplicates": duplicates_count,
        "Total": grand_1 + grand_2
    })

    return pd.DataFrame(rows)

if __name__ == "__main__":
    # Source tree: one sub-folder per season/year label
    base_directory = "images"

    # Roots of the two output trees filled by the sorting step
    images_1_directory = "images1"
    images_2_directory = "images2"

    # Sub-folder names: each of the five season labels for every year 2020-2024,
    # ordered year by year
    season_tuple = tuple(
        f"{season} {year}"
        for year in range(2020, 2025)
        for season in ("Spring", "Summer", "Fall", "Autumn", "Winter")
    )

    # Step 1: delete byte-identical images inside each season group
    duplicates_count = remove_seasonal_duplicates(base_directory, season_tuple)

    # Step 2: obtain the YOLOv5 'yolov5s' model through torch.hub
    model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

    # Step 3: sort the images of every folder into the two output trees
    process_all_folders(
        base_directory, season_tuple, images_1_directory, images_2_directory, model
    )
    print("Filtering complete!")

    # Step 4: summarise the outcome, print it and persist it as CSV
    distribution_df = calculate_distribution(
        base_directory, season_tuple, images_1_directory, images_2_directory, duplicates_count
    )
    print(distribution_df)
    distribution_df.to_csv("distribution.csv", index=False)


NameError: name '_C' is not defined

In [32]:
!pip install torch




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
