# Dataset Creation

In [4]:
# Dataset manipulation
import pandas as pd
import numpy as np
import re

# Web scraping
from bing_image_downloader import downloader
from bs4 import BeautifulSoup
import requests
import backoff

## 1. Label Generation

### 1.1. Generate label list by scraping the StillTasty search pages

In [5]:
@backoff.on_exception(backoff.expo,
                      requests.exceptions.RequestException,
                      max_time=60)
def get_url(scraper_url, timeout=30):
    """Fetch ``scraper_url`` with an HTTP GET, retrying on request failures.

    The ``backoff`` decorator re-invokes this function with exponentially
    growing delays whenever ``requests`` raises a ``RequestException``,
    giving up once ``max_time`` (60 seconds) has elapsed.

    Args:
        scraper_url: URL to request.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            forever and the retry logic would never get a chance to run.

    Returns:
        The ``requests.Response`` for the URL. HTTP error statuses (4xx/5xx)
        are NOT raised here; callers inspect the response themselves.

    Raises:
        requests.exceptions.RequestException: if the request keeps failing
            until the retry budget is exhausted.
    """
    return requests.get(scraper_url, timeout=timeout)

In [6]:
# Column names for the label table built from the scraped rows.
column_labels = [
    'Label',
    'Index',
    'Category',
]
# Accumulator that the scraping cell fills with one row per scraped item.
data_list = list()

In [7]:
BASE_URL = "https://stilltasty.com/searchitems/index"
# Maps the numeric category id used in the search URL to a readable category name.
category_dict = {
    26: 'Fruits',
    25: 'Vegetables',
    9: 'Dairy & Eggs',
    27: 'Meat & Poultry',
    7: 'Fish & Shellfish',
    28: 'Nuts, Grains & Pasta',
    6: 'Condiments & Oils',
    31: 'Snacks and Baked Goods',
    30: 'Herbs & Spices',
    5: 'Beverages'
}

for index, category in category_dict.items():
    page_no = 0

    # Walk the paginated listing for this category until a page has no item links.
    while True:
        page_no += 1
        page = get_url(f"{BASE_URL}/{index}?page={page_no}")
        soup = BeautifulSoup(page.content, "html.parser")
        search_list = soup.find("div", class_="search-list")
        if search_list is None:
            # soup.find returns None when the container is absent. On the very
            # first page that means nothing could be scraped for the category,
            # so fail loudly instead of with an opaque AttributeError; on later
            # pages treat it as the end of the listing.
            if page_no == 1:
                raise RuntimeError(
                    f"No 'search-list' container found for category {index} ({category})"
                )
            break

        # href=True skips anchors without an href, which would otherwise raise
        # KeyError on the item['href'] lookup below.
        items = search_list.find_all("a", href=True)
        if not items:
            break

        for item in items:
            # Row layout: link text, last path segment of the link, category name.
            data_list.append([item.text, item['href'].split('/')[-1], category])

In [26]:
# Turn the accumulated rows into a 2-D array, then into a labelled DataFrame.
np_array = np.array(data_list)
df = pd.DataFrame(np_array, columns=column_labels)
# Preview the first ten rows.
df.head(10)

Unnamed: 0,Label,Index,Category
0,APPLE JUICE - COMMERCIALLY FROZEN CONCENTRATE,16371,Fruits
1,"APPLE JUICE, COMMERCIALLY CANNED OR BOTTLED, S...",16372,Fruits
2,"APPLE JUICE, COMMERCIALLY CANNED OR BOTTLED, S...",16373,Fruits
3,"APPLE JUICE, SOLD IN REFRIGERATED CONTAINER - ...",16374,Fruits
4,"APPLE JUICE, SOLD IN REFRIGERATED CONTAINER - ...",16375,Fruits
5,APPLES - COMMERCIALLY FROZEN,16381,Fruits
6,"APPLES - FRESH, RAW, CUT UP",16382,Fruits
7,"APPLES - FRESH, RAW, WHOLE",16383,Fruits
8,"APPLES, DRIED - UNOPENED OR OPENED PACKAGE",18822,Fruits
9,"APPLESAUCE, COMMERCIALLY BOTTLED OR PACKAGED, ...",16388,Fruits


### 1.2. Standardise labels

In [27]:
# Normalise every em dash in a label to a plain hyphen so later splitting on
# ' - ' treats both spellings the same way.
df['Label'] = df['Label'].map(lambda label: '-'.join(label.split('—')))
df.head(10)

Unnamed: 0,Label,Index,Category
0,APPLE JUICE - COMMERCIALLY FROZEN CONCENTRATE,16371,Fruits
1,"APPLE JUICE, COMMERCIALLY CANNED OR BOTTLED, S...",16372,Fruits
2,"APPLE JUICE, COMMERCIALLY CANNED OR BOTTLED, S...",16373,Fruits
3,"APPLE JUICE, SOLD IN REFRIGERATED CONTAINER - ...",16374,Fruits
4,"APPLE JUICE, SOLD IN REFRIGERATED CONTAINER - ...",16375,Fruits
5,APPLES - COMMERCIALLY FROZEN,16381,Fruits
6,"APPLES - FRESH, RAW, CUT UP",16382,Fruits
7,"APPLES - FRESH, RAW, WHOLE",16383,Fruits
8,"APPLES, DRIED - UNOPENED OR OPENED PACKAGE",18822,Fruits
9,"APPLESAUCE, COMMERCIALLY BOTTLED OR PACKAGED, ...",16388,Fruits


### 1.3. Remove label qualifiers

In [28]:
# Keep only the text before the first ' - ' separator (the qualifier follows
# it), trimmed of surrounding whitespace.
df['Label'] = df['Label'].map(lambda label: label.partition(' - ')[0].strip())
df.head(10)

Unnamed: 0,Label,Index,Category
0,APPLE JUICE,16371,Fruits
1,"APPLE JUICE, COMMERCIALLY CANNED OR BOTTLED, S...",16372,Fruits
2,"APPLE JUICE, COMMERCIALLY CANNED OR BOTTLED, S...",16373,Fruits
3,"APPLE JUICE, SOLD IN REFRIGERATED CONTAINER",16374,Fruits
4,"APPLE JUICE, SOLD IN REFRIGERATED CONTAINER",16375,Fruits
5,APPLES,16381,Fruits
6,APPLES,16382,Fruits
7,APPLES,16383,Fruits
8,"APPLES, DRIED",18822,Fruits
9,"APPLESAUCE, COMMERCIALLY BOTTLED OR PACKAGED, ...",16388,Fruits


### 1.4. Remove duplicate labels

In [29]:
# Keep the first row for each distinct label and renumber the rows 0..n-1,
# discarding the old index rather than keeping it as a column.
df = (
    df.drop_duplicates(subset='Label', keep="first")
      .reset_index(drop=True)
)
df.head(10)

Unnamed: 0,Label,Index,Category
0,APPLE JUICE,16371,Fruits
1,"APPLE JUICE, COMMERCIALLY CANNED OR BOTTLED, S...",16372,Fruits
2,"APPLE JUICE, SOLD IN REFRIGERATED CONTAINER",16374,Fruits
3,APPLES,16381,Fruits
4,"APPLES, DRIED",18822,Fruits
5,"APPLESAUCE, COMMERCIALLY BOTTLED OR PACKAGED, ...",16388,Fruits
6,"APRICOT NECTAR, COMMERCIALLY CANNED OR BOTTLED...",16392,Fruits
7,APRICOTS,16401,Fruits
8,"APRICOTS, CANDIED",16403,Fruits
9,"APRICOTS, DRIED",18823,Fruits


### 1.5. Create General Labels

In [30]:
# Build a coarser label set: each label is cut at the first character that is
# not an ASCII letter or a space, then trimmed. Labels that collapse to the
# same text are de-duplicated, keeping the first occurrence.
df_general = df.copy()
df_general['Label'] = df_general['Label'].map(
    lambda label: re.split(r"([^A-Za-z ])", label, maxsplit=1)[0].strip()
)
df_general = (
    df_general.drop_duplicates(subset='Label', keep="first")
              .reset_index(drop=True)
)
df_general.head(10)

Unnamed: 0,Label,Index,Category
0,APPLE JUICE,16371,Fruits
1,APPLES,16381,Fruits
2,APPLESAUCE,16388,Fruits
3,APRICOT NECTAR,16392,Fruits
4,APRICOTS,16401,Fruits
5,AVOCADOS,16426,Fruits
6,BANANAS,16450,Fruits
7,BLACKBERRIES,16549,Fruits
8,BLACKBERRY JUICE,16556,Fruits
9,BLUEBERRIES,16577,Fruits


### 1.6. Output DF to CSV

In [31]:
# Persist both label tables next to the notebook. The row index is written as
# the first (unnamed) column because to_csv is left at its default settings.
df.to_csv(path_or_buf='./labels.csv')
df_general.to_csv(path_or_buf='./labels_general.csv')

## 2. Image retrieval

In [32]:
def download_from_df(df: pd.DataFrame, column_name, output_dir, num_per_label):
    """Run one image download per value found in a DataFrame column.

    Each value of ``df[column_name]`` is printed and then passed as the search
    query to ``downloader.download``.

    Args:
        df: Frame holding the search queries.
        column_name: Name of the column whose values are used as queries.
        output_dir: Directory handed to the downloader as ``output_dir``.
        num_per_label: Value handed to the downloader as ``limit``
            (presumably the maximum number of images per query).

    Returns:
        None. The work is done through the downloader's side effects.
    """
    # Iterate the column's values directly instead of looking each one up by
    # index label: with a non-unique index, df[column_name][index] returns a
    # Series rather than a single query string.
    for query in df[column_name]:
        print(f'Downloading: {query}')
        downloader.download(
            query=query,
            limit=num_per_label,
            output_dir=output_dir,
            adult_filter_off=True,
            force_replace=False,
            timeout=180,
        )

In [33]:
# Number of images requested per label; it also names the output folder.
num_per_label = 100

# Download images for every general label into "Data-<num_per_label>".
download_from_df(df_general, 'Label', f'Data-{num_per_label}', num_per_label)

Downloading: APPLE JUICE
[%] Downloading Images to /Users/matthewsoulsby/Documents/xcode-repos/a2-s3784709/ImageClassifier/Data/APPLE JUICE


[!!]Indexing page: 1

[%] Indexed 20 Images on Page 1.


[%] Downloading Image #1 from https://www.quicklly.com/upload_images/product/1602525097-juicy-juice-apple-juice-boxes.png
[%] File Downloaded !

[%] Downloading Image #2 from https://i5.walmartimages.com/asr/4436eb83-e8e7-40d9-a80f-67f0a8c96364_1.6c2ce7267ef33c75bed2e674c19d6aa4.jpeg
[%] File Downloaded !

[%] Downloading Image #3 from https://i5.walmartimages.com/asr/8a5b775e-7ff8-4147-ab05-704ef60ab1d3_1.8e8ea65ec4e0f420c758c819cb4066e5.jpeg
[%] File Downloaded !

[%] Downloading Image #4 from http://www.pediatricsnow.com/wp-content/uploads/2011/09/apple-juice.jpg
[!] Issue getting: http://www.pediatricsnow.com/wp-content/uploads/2011/09/apple-juice.jpg
[!] Error:: HTTP Error 404: Not Found
[%] Downloading Image #4 from https://cdnimg.webstaurantstore.com/images/products/extra_large/44014