# Dataset Creation

In [1]:
# Dataset manipulation
import pandas as pd
import numpy as np
import shutil
import re
import os

# Web scraping
from bing_image_downloader import downloader
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import requests
import backoff

## 1. Label Generation

### 1.1. Generate label list by scraping the StillTasty search pages

In [None]:
@backoff.on_exception(backoff.expo,
                      requests.exceptions.RequestException,
                      max_time=60)
def get_url(scraper_url, timeout=10):
    """Fetch ``scraper_url`` with an HTTP GET, retrying on request failures.

    The ``backoff`` decorator re-invokes this function with exponentially
    growing waits whenever ``requests`` raises a ``RequestException``, and
    gives up (re-raising the last error) once 60 seconds have elapsed.

    Args:
        scraper_url: URL to request.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Without a timeout a stalled connection would
            block forever and the retry logic above would never trigger.

    Returns:
        The ``requests.Response`` for the URL. The HTTP status code is not
        checked here; callers receive the response as-is.
    """
    return requests.get(scraper_url, timeout=timeout)

In [None]:
# Column headers for the scraped table, in the same order each row is built.
column_labels = ["Label", "Index", "Category"]
# Accumulates one [label, index, category] row per scraped item.
data_list = list()

In [None]:
BASE_URL = "https://stilltasty.com/searchitems/index"
# Category id used in the search URL -> human-readable category name.
category_dict = {
    26: 'Fruits',
    25: 'Vegetables',
    9: 'Dairy & Eggs',
    27: 'Meat & Poultry',
    7: 'Fish & Shellfish',
    28: 'Nuts, Grains & Pasta',
    6: 'Condiments & Oils',
    31: 'Snacks and Baked Goods',
    30: 'Herbs & Spices',
    5: 'Beverages'
}

# Start from an empty list so that re-running this cell does not append
# every scraped row a second time.
data_list = []

for index, category in category_dict.items():
    page_no = 0

    # Walk the paginated results until a page yields no item links.
    while True:
        page_no += 1
        page = get_url(f"{BASE_URL}/{index}?page={page_no}")
        soup = BeautifulSoup(page.content, "html.parser")
        search_list = soup.find("div", class_="search-list")
        # soup.find returns None when the container is absent; treat that
        # like an empty page instead of failing on None.find_all.
        # href=True skips anchors that have no link to take an id from.
        items = search_list.find_all("a", href=True) if search_list is not None else []
        if not items:
            break

        for item in items:
            # The last path segment of the link is stored as the item's index.
            data_list.append([item.text, item['href'].split('/')[-1], category])

In [None]:
# Build the frame straight from the row list. Going through np.array first
# adds nothing for these string rows and fails on an empty scrape, because
# np.array([]) is one-dimensional and cannot match three column labels.
df = pd.DataFrame(data_list, columns=column_labels)
df.head(10)

### 1.2. Standardise labels

In [None]:
# Normalise em dashes to plain hyphens so every label uses one separator.
df['Label'] = df['Label'].str.replace('—', '-', regex=False)
df.head(10)

### 1.3. Remove label qualifiers

In [None]:
# Keep only the text before the first ' - ' separator, trimmed of whitespace.
df['Label'] = df['Label'].map(lambda label: label.partition(' - ')[0].strip())
df.head(10)

### 1.4. Remove duplicate labels

In [None]:
# Keep the first row for each label and renumber the rows from zero.
df = (
    df
    .drop_duplicates(subset='Label', keep="first")
    .reset_index(drop=True)
)
df.head(10)

### 1.5. Create General Labels

In [None]:
# Derive general labels: keep only the leading run of ASCII letters and
# spaces from each label (everything from the first other character on is
# discarded), then trim surrounding whitespace.
df_general = df.copy()
df_general['Label'] = df_general['Label'].map(
    lambda x: re.split(r"[^A-Za-z ]", x, maxsplit=1)[0].strip()
)
# A label that begins with a digit or symbol collapses to an empty string;
# drop those rows so no blank label reaches the image download step.
df_general = df_general[df_general['Label'] != '']
df_general = df_general.drop_duplicates(subset='Label', keep="first")
df_general = df_general.reset_index(drop=True)
df_general.head(10)

### 1.6. Output DF to CSV

In [None]:
# Write both label tables to disk. The row index is written as the first,
# unnamed column of each file.
df.to_csv(path_or_buf='./labels.csv')
df_general.to_csv(path_or_buf='./labels_general.csv')

## 2. Image retrieval

In [2]:
def download_from_df(df: pd.DataFrame, column_name, output_dir, num_per_label):
    """Download images for every label in ``df[column_name]`` in parallel.

    Each label is handed to ``downloader.download`` on a pool of 16 worker
    threads. Failures in individual downloads are reported on stdout rather
    than being silently discarded with the unused future.

    Args:
        df: Frame holding the labels to download.
        column_name: Name of the column in ``df`` that contains the labels.
        output_dir: Directory passed to the downloader as its output root.
        num_per_label: Image limit passed to the downloader for each label.

    Returns:
        None.
    """
    futures = {}
    with ThreadPoolExecutor(max_workers=16) as executor:
        for label in df[column_name]:
            future = executor.submit(
                downloader.download,
                label,
                limit=num_per_label,
                output_dir=output_dir,
                adult_filter_off=True,
                force_replace=False,
                timeout=180,
            )
            futures[future] = label

    # Leaving the with-block waits for all submitted tasks, so every future
    # is finished here and exception() returns without blocking.
    for future, label in futures.items():
        error = future.exception()
        if error is not None:
            print(f'[!] Download failed for {label!r}: {error!r}')

### 2.1. Read in labels as DF

In [3]:
# Load the general labels and discard the unnamed first column that holds
# the row index written out with the CSV.
df_general_csv = pd.read_csv("./labels_general.csv").drop(columns=['Unnamed: 0'])
df_general_csv.head(10)

Unnamed: 0,Label,Index,Category
0,APPLE JUICE,16371,Fruits
1,APPLES,16381,Fruits
2,APPLESAUCE,16388,Fruits
3,APRICOT NECTAR,16392,Fruits
4,APRICOTS,16401,Fruits
5,AVOCADOS,16426,Fruits
6,BANANAS,16450,Fruits
7,BLACKBERRIES,16549,Fruits
8,BLACKBERRY JUICE,16556,Fruits
9,BLUEBERRIES,16577,Fruits


### 2.2. Skip Labels Already Retrieved

In [4]:
num_per_label = 100
output_dir = f'Data-{num_per_label}'

# On a first run the output folder does not exist yet and os.scandir would
# raise FileNotFoundError; create it so the scan simply finds no labels.
os.makedirs(output_dir, exist_ok=True)

# One sub-directory per label; the with-block closes the scandir handle.
with os.scandir(output_dir) as entries:
    labels_present = [f.name for f in entries if f.is_dir()]

In [5]:
# Delete every label folder that holds fewer than num_per_label files so the
# label is treated as not yet downloaded.
for label_name in labels_present:
    label_path = f'{output_dir}/{label_name}'
    file_count = sum(1 for entry in os.scandir(label_path) if entry.is_file())
    if file_count < num_per_label:
        shutil.rmtree(label_path)

# Re-scan: whatever folders remain are the completed labels.
already_downloaded = [entry.name for entry in os.scandir(output_dir) if entry.is_dir()]

# Report the folders that were present before the sweep but are gone now,
# in their original scan order.
kept = set(already_downloaded)
removed = [name for name in labels_present if name not in kept]
print('Removed the following unfinished label downloads:')
print(f'{removed}')

Removed the following unfinished label downloads:
['CHERVIL', 'MAPLE SYRUP', 'CLOVES', 'CHERRY EXTRACT', 'BOYSENBERRY JAM', 'BUTTER CAKE', 'CILANTRO LEAVES', 'CINNAMON STICKS', 'CINNAMON EXTRACT', 'CINNAMON', 'CILANTRO', 'CHIVES', 'COFFEE CAKE', 'MARSHMALLOW CREME', 'CANDY CANES', 'COCOA BEVERAGE MIX', 'CAKE']


In [6]:
# Keep only the labels that have no completed folder on disk.
is_done = df_general_csv['Label'].isin(already_downloaded)
df_to_download = df_general_csv.loc[~is_done]
df_to_download.head(10)

Unnamed: 0,Label,Index,Category
397,BOYSENBERRY JAM,16605,Snacks and Baked Goods
404,BUTTER CAKE,16654,Snacks and Baked Goods
408,CAKE,16663,Snacks and Baked Goods
410,CANDY CANES,18920,Snacks and Baked Goods
429,COCOA BEVERAGE MIX,16910,Snacks and Baked Goods
432,COFFEE CAKE,16929,Snacks and Baked Goods
450,MAPLE SYRUP,17643,Snacks and Baked Goods
452,MARSHMALLOW CREME,17663,Snacks and Baked Goods
565,CHERRY EXTRACT,16761,Herbs & Spices
566,CHERVIL,16766,Herbs & Spices


### 2.3. Download Images for Remaining Labels

In [7]:
# Reuse output_dir rather than re-spelling the path, so the download target
# is always the same folder that was inspected for finished labels.
download_from_df(
    df=df_to_download,
    column_name='Label',
    output_dir=output_dir,
    num_per_label=num_per_label,
)

[%] Downloading Images to /Users/matthewsoulsby/Documents/xcode-repos/a2-s3784709/ImageClassifier/Data-100/BOYSENBERRY JAM


[!!]Indexing page: 1

[%] Downloading Images to /Users/matthewsoulsby/Documents/xcode-repos/a2-s3784709/ImageClassifier/Data-100/CAKE


[!!]Indexing page: 1

[%] Downloading Images to /Users/matthewsoulsby/Documents/xcode-repos/a2-s3784709/ImageClassifier/Data-100/BUTTER CAKE


[!!]Indexing page: 1

[%] Downloading Images to /Users/matthewsoulsby/Documents/xcode-repos/a2-s3784709/ImageClassifier/Data-100/COCOA BEVERAGE MIX


[!!]Indexing page: 1

[%] Downloading Images to /Users/matthewsoulsby/Documents/xcode-repos/a2-s3784709/ImageClassifier/Data-100/CANDY CANES


[!!]Indexing page: 1

[%] Downloading Images to /Users/matthewsoulsby/Documents/xcode-repos/a2-s3784709/ImageClassifier/Data-100/MAPLE SYRUP


[!!]Indexing page: 1

[%] Downloading Images to /Users/matthewsoulsby/Documents/xcode-repos/a2-s3784709/ImageClassifier/Data-100/COFFEE CAKE


[!!]Indexing page