In [1]:
# Import required modules
import praw
import requests
import os
from tqdm.notebook import tqdm
import pandas as pd
import shutil
import numpy as np

# Import created modules
from modules.Preprocess_Validation import preprocess
from modules.Credentials import credentials as cred
from torchvision import transforms

reddit = praw.Reddit(
    user_agent=True, 
    client_id=cred.client_id, 
    client_secret=cred.client_secret, 
    username=cred.username, 
    password=cred.password)

Create Folders according to input

In [6]:
DataBaseName = 'AddTest'
limit = 50
# Pool the correct subreddits into the correct labels {subreddit : label}
subreddits = {'cats': 'cat', 'cat':'cat','meme': 'memes', 'memes': 'memes'}

In [8]:

if not os.path.exists('./'+DataBaseName):
    os.makedirs('./'+DataBaseName, exist_ok=True)

    for folder in ['train', 'valid', 'test']:
        os.makedirs('./'+DataBaseName+'/'+folder, exist_ok=True)

        for sub in subreddits.values():
            os.makedirs('./'+DataBaseName+'/'+folder+'/'+sub, exist_ok=True)

for sub, folder_name in subreddits.items():
    subreddit = reddit.subreddit(sub)
    posts = subreddit.hot(limit=limit)

    # loop over fil, download image
    for post in tqdm(posts, desc=f'Downloading Images from {sub}'):

        # filter for only .jpg/jpeg/png files
        if post.url.endswith(('.jpg', '.jpeg', '.png')):
            response = requests.get(post.url)

            # Check if connection is okay?
            if response.status_code == 200:
                file_extension = post.url.split('.')[-1]

                output_folder = f"{DataBaseName}/{'train'}/{folder_name}"
                filename = f"{output_folder}/{post.id}.{file_extension}"

                with open(filename, 'wb') as file:
                    file.write(response.content)
            else:
                print(f"Failed to download image: {post.url}")

Downloading Images from cats: 0it [00:00, ?it/s]

Downloading Images from cat: 0it [00:00, ?it/s]

Downloading Images from meme: 0it [00:00, ?it/s]

Downloading Images from memes: 0it [00:00, ?it/s]

In [4]:
transform = transforms.Compose([
    transforms.Resize((256, 256))
])

# preprocess = preprocess.PreProcess(transform=transform)
preprocess.pre_prosses_data_set('./AddTest/train/memes/')

  0%|          | 0/82 [00:00<?, ?it/s]

Create Proper CSV output / Determine final location of the Image

In [4]:
output_csv = pd.DataFrame()
labels = list(set(subreddits.values()))

for label in labels:
    items = os.listdir(f"./{DataBaseName}/train/{label}")
    temp_df = pd.DataFrame({"id":items, "label":label})
    output_csv = pd.concat([output_csv, temp_df]).reset_index(drop=True)

In [5]:
# Check for class imbalance / push all overflow to one folder
output_csv['used'] = True
values = output_csv['label'].value_counts().sort_values(ascending=False)

# Check for class imbalance -> TODO make this work with more then 2 classes
if values.nunique() != 1:

    # Determine which value is under and over balanced
    high_label = values.index[0]
    low_label = values.index[-1]
    sample = values[high_label] - values[low_label]

    # Set highest class to lowest class -> UNDERSAMPLE
    rows = output_csv[output_csv['label'] == high_label].sample(n=sample, replace=False).index
    output_csv.loc[rows, 'used'] = False

# Display the final output of value counts
display(output_csv[['label', 'used']].value_counts())

# Create non-used folder
os.makedirs(f"./{DataBaseName}/non-used", exist_ok=True)
output_csv['set'] = np.NaN

# Move pictures to non-used folder if used is set to false
for label in labels:
    for index, row in output_csv[output_csv['label'] == label].iterrows():
        
        if row['used'] == False:
            source_path = f"./{DataBaseName}/train/{label}/{output_csv['id'].loc[index]}"
            destination_path = f"./{DataBaseName}/non-used"
            output_csv['set'].loc[index] = 'Non Used'

            # Move the file
            shutil.move(source_path, destination_path)

label  used 
cat    True     1053
memes  True     1053
       False      22
Name: count, dtype: int64

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_csv['set'].loc[index] = 'Non Used'


In [6]:
# Drop false columns to other CSV
dropped = output_csv[output_csv['used'] == False]
output_csv = output_csv.drop(output_csv[output_csv['used'] == False].index, axis=0)
dropped.head()

Unnamed: 0,id,label,used,set
1062,198aml8.jpeg,memes,False,Non Used
1067,198k18o.jpeg,memes,False,Non Used
1081,198v88x.png,memes,False,Non Used
1106,1994uog.png,memes,False,Non Used
1110,1997ft4.png,memes,False,Non Used


In [7]:
# Set Sizes of Sets, all over will be pushed to Train
TestSize = 32
ValidSize = 64

# Get ranomd indexes which go to Test/Valid
for label in labels:
    selection = output_csv[(output_csv['label'] == label)]

    # Set images to Test according to Test Size
    test_selection = selection.sample(TestSize, replace=False)
    test_indexes = test_selection.index
    output_csv.loc[test_indexes, 'set'] = 'test'

    # Set images to Valid according to the Test Size
    valid_selection = selection[~selection.index.isin(test_selection.index)].sample(ValidSize, replace=False)
    valid_indexes = valid_selection.index
    output_csv.loc[valid_indexes, 'set'] = 'valid'
    
    # Set all other images over to the train folder
    excluded_indexes = [item for sublist in [valid_indexes.to_list(), test_indexes.to_list()] for item in sublist]
    train_selection = selection[~selection.index.isin(excluded_indexes)].index
    output_csv.loc[train_selection, 'set'] = 'train'

# display overall set
display(output_csv[['label', 'set']].value_counts())

label  set  
cat    train    957
memes  train    957
cat    valid     64
memes  valid     64
cat    test      32
memes  test      32
Name: count, dtype: int64

Move to correct folder

In [8]:
for label in labels:
    test_folder = f"./{DataBaseName}/test/{label}"
    valid_folder = f"./{DataBaseName}/valid/{label}"

    # Move to Test Folder
    for index, row in output_csv[(output_csv['label'] == label) & (output_csv['set'] == 'test')].iterrows():
        source_path = f"./{DataBaseName}/train/{label}/{output_csv['id'].loc[index]}"
        shutil.move(source_path, test_folder)
    
    # move to Valid folder
    for index, row in output_csv[(output_csv['label'] == label) & (output_csv['set'] == 'valid')].iterrows():
        source_path = f"./{DataBaseName}/train/{label}/{output_csv['id'].loc[index]}"
        shutil.move(source_path, valid_folder)

Make CSV

In [9]:
output = pd.concat([output_csv, dropped])
output.to_csv(f"./{DataBaseName}/output.csv")

Test to see compressed data

In [1]:
def preprocess_image(image_path, transform):
    # Load and preprocess the image
    image = Image.open(image_path).convert("RGB")
    return image, transform(image).unsqueeze(0)

In [None]:
meme_path = '../DataBaseV2/train/memes/'
cat_path = '../DataBaseV2/train/cat/'