# 2. Data Acquisition

## 2.1 Import Libraries

In [1]:
import requests, os, re, json, fnmatch, math, time, random
import pandas as pd
from bs4 import BeautifulSoup

## 2.2 Search List Manager

### 2.2.1 Create Search List Manager

In [3]:
class SearchManager:
    """Manage the search list of image classes and collect image URLs for it.

    The search list is a list of dicts loaded from ``file_path``. As used by
    the methods below, every item carries:

    * ``item['lang'][lang_code]['label']`` - the search term per language;
    * ``item['imageURLs']['site'][site]['lang'][lang_code]`` - list of URLs
      found through Google for that site and language;
    * ``item['imageURLs']['site']['baidu.com']`` - flat list of Baidu URLs.
    """

    file_path = '../02. Datasets/Dataset_URLs.json'
    static_websites = ['aad.org','dermnetnz.org','reddit.com','dermcoll.edu.au','pcds.org.uk',
    'skinsight.com','sciencephoto.com','flickr.com','researchgate.net','acadderm.com',
    'aocd.org','instagram.com','facebook.com','hsc.unm.edu','link.springer.com','pharmaceutical-journal.com',
    'actasdermo.org','verywellhealth.com','mdedge.com'
    ]
    download_directory_path = '../02. Datasets/Images'
    credentials_path = '../Credentials.json'
    # Seconds to wait for an HTTP response before giving up, so a stalled
    # connection cannot hang a bulk run forever
    request_timeout = 30

    def __init__(self):
        """Load the search list and the first set of API credentials."""
        self.search_list = self.load_search_list()

        # Load 1st set of keys by default
        self.load_credentials(0)

    @classmethod
    def load_search_list(cls):
        """Return the search list stored at ``file_path`` ([] if the file is missing)."""
        try:
            with open(cls.file_path, 'r', encoding='utf-8') as inFile:
                return json.load(inFile)
        except FileNotFoundError:
            print("File not found.")
            return []

    # Load Google Custom Search API Keys & Google Programmable Search engine ID
    def load_credentials(self, index):
        """Select the credential set at ``index`` from the credentials file.

        Sets ``self.api_key`` and ``self.cse_id``.

        Raises:
            ValueError: if the index does not exist, or the selected entry
                lacks an API key or a search engine ID.
        """
        with open(self.credentials_path, 'r', encoding='utf-8') as file:
            credentials = json.load(file)

        if 'Google Custom Search API' in credentials and len(credentials['Google Custom Search API']) > index:
            google_custom_search_creds = credentials['Google Custom Search API'][index]
            self.api_key = google_custom_search_creds.get('api_key')
            self.cse_id = google_custom_search_creds.get('cse_id')
        else:
            raise ValueError("Invalid credentials index or missing credentials.")

        if not self.api_key or not self.cse_id:
            raise ValueError("API key or CSE ID not found for the specified index.")

    class QuotaExceededException(Exception):
        """Raise exception when API quota exceeds."""
        def __init__(self, message=None):
            if message is None:
                message = "Quota limit exceeded for the day."
            super().__init__(message)

    def save(self):
        """Write the search list back to ``file_path`` as indented UTF-8 JSON."""
        try:
            with open(self.file_path, 'w', encoding='utf-8') as outFile:
                json.dump(self.search_list, outFile, ensure_ascii=False, indent=4)
            print("Search list saved successfully.")
        except IOError as e:
            print(f"Error saving file: {e}")

    def add_image_urls(self):
        """(Re)create an empty ``imageURLs`` structure on every item.

        Any URLs already stored under ``imageURLs`` are replaced, so this is
        meant for the initial formatting of the search list only.

        Returns:
            The updated search list.
        """
        for item in self.search_list:
            languages = list(item['lang'])
            item['imageURLs'] = {
                'site': {
                    site: {'lang': {lang: [] for lang in languages}}
                    for site in self.static_websites
                }
            }
        return self.search_list

    def add_general_sites(self):
        """Add every site of ``static_websites`` that an item does not have yet.

        Each item is checked on its own, so existing per-site URL lists are
        never replaced and items that differ from the first one are still
        completed.

        Returns:
            The updated search list, or None when the search list is empty.
        """
        if not self.search_list:
            return

        for item in self.search_list:
            sites = item.setdefault('imageURLs', {}).setdefault('site', {})
            for site in self.static_websites:
                if site not in sites:
                    sites[site] = {'lang': {lang: [] for lang in item['lang']}}
        return self.search_list

    def add_baidu_site(self):
        """Make sure every item has a ``'baidu.com'`` URL list.

        An existing list is kept as it is, so running this again does not
        discard URLs that were already collected.

        Returns:
            The updated search list, or None when the search list is empty.
        """
        if not self.search_list:
            return

        for item in self.search_list:
            item['imageURLs']['site'].setdefault('baidu.com', [])
        return self.search_list

    def google_search_images(self, query_label, query, num_images=10, max_pages=10, img_size='', start_from=0):
        """Query the Google Custom Search API for image links.

        Args:
            query_label: phrase passed as ``exactTerms``.
            query: the search query.
            num_images: results requested per page (capped at 10).
            max_pages: maximum number of requests to send.
            img_size: optional ``imgSize`` filter; omitted when empty.
            start_from: number of leading results to skip.

        Returns:
            List of image URLs, in the order returned.

        Raises:
            QuotaExceededException: when the API answers with HTTP 429.
        """
        search_url = "https://www.googleapis.com/customsearch/v1"
        image_urls = []

        for page in range(max_pages):
            start_index = page * num_images + 1 + start_from
            # Limit search results to 100
            if start_index > 100:
                break

            params = {
                'q': query,
                'exactTerms': query_label,
                'cx': self.cse_id,
                'num': min(num_images, 10),  # num cannot exceed 10
                'start': start_index,
                'searchType': 'image',
                'filter': 1,
                'key': self.api_key
            }
            # Add 'imgSize' attribute only if it is provided
            if img_size:
                params['imgSize'] = img_size

            response = requests.get(search_url, params=params, timeout=self.request_timeout)
            if response.status_code == 429:
                try:
                    error_message = response.json().get('error', {}).get('message')
                except ValueError:
                    # Body was not JSON; fall back to the default message
                    error_message = None
                raise self.QuotaExceededException(error_message)

            result = response.json()
            page_urls = [entry['link'] for entry in result.get('items', [])]
            if not page_urls:
                # Terminate if no more results are found
                break
            image_urls.extend(page_urls)

        return image_urls

    def update_google_urls_for_item(self, item, lang_code, site, max_urls_per_site=100):
        """Top up an item's URL list for one site/language via Google.

        Only the URLs still missing to reach ``max_urls_per_site`` are
        requested, starting after the ones already stored.
        """
        if lang_code not in item['lang'] or site not in item['imageURLs']['site']:
            print(f"Language code '{lang_code}' or site '{site}' not found in item.")
            return
        query_label = item['lang'][lang_code]['label']
        search_query = f'"{query_label}" site:{site}'
        stored_urls = item['imageURLs']['site'][site]['lang'][lang_code]
        existing_urls_count = len(stored_urls)
        urls_needed = max_urls_per_site - existing_urls_count
        if urls_needed <= 0:
            print(f"Already have {max_urls_per_site} or more URLs for {lang_code} in {site}.")
            return

        # Perform the search and get image URLs. The page count is rounded up
        # so that no request is spent on results that would be cut off below.
        image_urls = self.google_search_images(
            query_label,
            search_query,
            num_images=min(urls_needed, 10),
            max_pages=math.ceil(urls_needed / 10),
            img_size='',
            start_from=existing_urls_count
        )

        # Extend imageURLs for the specified language & site
        stored_urls.extend(image_urls[:urls_needed])

    def get_baidu_page_URLs(self, search_term, pn):
        """Fetch one page of Baidu image-search results.

        Args:
            search_term: the query.
            pn: offset of the first result (sent as the ``pn`` parameter).

        Returns:
            The decoded JSON response, or the string ``"Error: <status>"``
            when the HTTP status is not 200.
        """
        base_url = "https://image.baidu.com/search/acjson"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Accept": "text/plain, */*; q=0.01",
            "Accept-Language": "en-US,en;q=0.9",
        }
        params = {
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': '201326592',
            'fp': 'result',
            'word': search_term,
            'queryWord': search_term,
            'cl': '2',
            'lm': '-1',
            'ie': 'utf-8',
            'oe': 'utf-8',
            'nc': '1',
            'pn': pn,
            'rn': '30'
        }
        response = requests.get(base_url, params=params, headers=headers, timeout=self.request_timeout)
        if response.status_code != 200:
            return f"Error: {response.status_code}"

        try:
            return json.loads(response.text)
        except json.JSONDecodeError:
            # Retry after dropping the "fromPageTitle" field. The match is
            # non-greedy so each result loses only its own field; a greedy
            # match would run from the first "fromPageTitle" to the last
            # "bdSourceName" on the line and delete the results in between.
            cleaned_text = re.sub(r'"fromPageTitle".*?(?="bdSourceName")', '', response.text)
            return json.loads(cleaned_text)

    def baidu_search_images(self, search_term, pages):
        """Collect the ``thumbURL`` of every result on the first ``pages`` pages.

        Stops early, keeping what was collected, when a page request fails.
        Results without a ``thumbURL`` contribute an empty string.
        """
        URLs = []
        for page_number in range(pages):
            pn = 30 * page_number
            response = self.get_baidu_page_URLs(search_term, pn)
            if not isinstance(response, dict):
                # get_baidu_page_URLs reports HTTP failures as a string
                print(f"Baidu search for '{search_term}' failed at offset {pn}: {response}")
                break
            for entry in response.get('data') or []:
                if isinstance(entry, dict):
                    URLs.append(entry.get('thumbURL', ''))
        return URLs

    def update_baidu_urls_for_item(self, item, chinese_searchTerm, pages):
        """Append the Baidu results for ``chinese_searchTerm`` to the item's 'baidu.com' list."""
        URLs = self.baidu_search_images(chinese_searchTerm, pages)
        item['imageURLs']['site']['baidu.com'].extend(URLs)

# Instantiate SearchManager: the constructor loads the search list and the
# first set of Google credentials
searchManager = SearchManager()

### 2.2.2 Format Search List - Once ONLY

In [None]:
# Create the 'imageURLs' structure on every item, then persist it.
# add_image_urls() assigns a fresh structure, replacing any URLs already
# stored, hence "once only".
searchManager.add_image_urls()
searchManager.save()

### 2.2.3 Add New Sites to each class

In [9]:
# General sites: add the sites of SearchManager.static_websites that are not
# in the search list yet, then write the list back to file
searchManager.add_general_sites()
searchManager.save()

Search list saved successfully.


In [5]:
# 'baidu.com' site only: its entry is a plain list of URLs (no per-language
# split); the list is then written back to file
searchManager.add_baidu_site()
searchManager.save()

Search list saved successfully.


### 2.2.4 Bulk <u>Fetch Image URLs</u>: Using Dataset_URLs.json

#### 2.2.4.1 Google Search Engine - Utilise Custom Search API

In [4]:
# Specify which API Key to use
searchManager.load_credentials(0)

# Specify range of items to process (both ends inclusive)
start_index = 0
end_index = 236

# Search settings shared by every item of this run
lang_code, site = 'en', 'mdedge.com'
max_urls_per_site = 100

# Never iterate past the end of the search list: report it once instead of
# printing one "out of range" line per missing index
last_index = min(end_index, len(searchManager.search_list) - 1)
if last_index < end_index:
    print(f"Search list has only {len(searchManager.search_list)} items; stopping at index {last_index}.")

try:
    # Iterate over the specified range of items in the search list
    for i in range(start_index, last_index + 1):
        try:
            item = searchManager.search_list[i]
            searchManager.update_google_urls_for_item(item, lang_code, site, max_urls_per_site=max_urls_per_site)
            print(f"Successfully processed item {i}")

        except searchManager.QuotaExceededException as e:
            print(f"Quota limit reached at (Index {i}).\n{e}")
            print("Terminated loop...")
            # Exit the loop as quota limit is reached
            break

        except Exception as e:
            print(f"An error occurred while processing item at (Index {i}): {e}")
finally:
    # Save changes to file even when the run is interrupted, so the URLs
    # already fetched (and the API quota spent on them) are not lost
    searchManager.save()

Successfully processed item 222
Successfully processed item 223
Skipped Class_Exclusion_List: Pemphigoid, Benign Mucous Membrane
Successfully processed item 224
Successfully processed item 225
Skipped Class_Exclusion_List: Pemphigus
Successfully processed item 226
Successfully processed item 227
Skipped Class_Exclusion_List: Sweat Gland Neoplasms
Successfully processed item 228
Successfully processed item 229
Successfully processed item 230
Successfully processed item 231
Skipped Class_Exclusion_List: Pressure Ulcer
Successfully processed item 232
Skipped Class_Exclusion_List: Sweating, Gustatory
Successfully processed item 233
Skipped Class_Exclusion_List: Hypohidrosis
Successfully processed item 234
Successfully processed item 235
Successfully processed item 236
Search list saved successfully.


#### 2.2.4.2 Baidu Search Engine - Utilise Data Scraping Techniques

In [8]:
# Specify range of items to process (both ends inclusive)
start_index = 0
end_index = 236
page_items = 30

# Results are requested 30 per page, so convert the wanted count into pages
pages = math.ceil(page_items / 30)

# Never iterate past the end of the search list
last_index = min(end_index, len(searchManager.search_list) - 1)
if last_index < end_index:
    print(f"Search list has only {len(searchManager.search_list)} items; stopping at index {last_index}.")

try:
    # Iterate over the specified range of items in the search list
    for i in range(start_index, last_index + 1):
        # Resolve the item and its label before the try block, so the error
        # message below always names the item that actually failed
        item = searchManager.search_list[i]
        label = item.get('label', '')
        try:
            chinese_searchTerm = None
            if 'zh' in item['lang']:
                chinese_searchTerm = item['lang']['zh'].get('label')
            if not chinese_searchTerm:
                # No Chinese search term: nothing to look up on Baidu
                continue

            searchManager.update_baidu_urls_for_item(item, chinese_searchTerm, pages)
            print(f"Successfully processed item {i} {label}")

            # Add a random pause between 4-6 seconds
            time.sleep(random.uniform(4, 6))

        except Exception as e:
            print(f"An error occurred while processing item at (Index {i} {label}): {e}")
finally:
    # Save changes to file, also when the run is interrupted part-way
    searchManager.save()

Successfully processed item 2 Chloracne
Successfully processed item 21 Dermatitis Herpetiformis
Successfully processed item 22 Dermatitis, Atopic
Successfully processed item 23 Dermatitis, Photoallergic
Successfully processed item 27 Dermatitis, Occupational
Successfully processed item 28 Dermatitis, Exfoliative
Successfully processed item 30 Dermatitis, Seborrheic
Successfully processed item 33 Erythema Nodosum
Successfully processed item 36 Serum Sickness
Successfully processed item 37 Stevens-Johnson Syndrome
Successfully processed item 38 Eczema, Dyshidrotic
Successfully processed item 39 Tinea cruris
Successfully processed item 41 Radiodermatitis
Successfully processed item 42 Dermatomyositis
Successfully processed item 43 Erythema Ab Igne
Successfully processed item 44 Erythema Chronicum Migrans
Successfully processed item 51 Tinea Pedis
Successfully processed item 56 Alopecia Areata
Successfully processed item 63 Keratoacanthoma
Successfully processed item 66 Darier Disease
Succ

## 2.3 Download Manager

### 2.3.1 Create Custom Download Functions for specific Data Sources

#### 2.3.1.1 Instagram URLs

In [None]:
def fetch_instagram_urls(shortcode, timeout=30):
    """Return the image URLs of the Instagram post identified by ``shortcode``.

    The post is looked up through Instagram's GraphQL query endpoint and the
    ``display_url`` of every child listed under ``edge_sidecar_to_children``
    is collected.

    Args:
        shortcode: the post's shortcode; a falsy value yields ``[]`` without
            sending a request.
        timeout: seconds to wait for the HTTP response.

    Returns:
        List of image URLs; empty when the response has no such children.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
        json.JSONDecodeError: when the response body is not JSON.
    """
    if not shortcode:
        return []

    # Let requests build the query string so the JSON value is URL-encoded
    response = requests.get(
        "https://www.instagram.com/graphql/query/",
        params={
            "query_hash": "b3055c01b4b222b8a47dc12b090e4e64",
            "variables": json.dumps({"shortcode": shortcode}),
        },
        timeout=timeout,
    )
    data_json = json.loads(response.content.decode('utf-8'))
    if not isinstance(data_json, dict):
        return []

    # Any level may be missing or null; treat that as "no images"
    media = (data_json.get('data') or {}).get('shortcode_media') or {}
    # NOTE(review): only posts exposing 'edge_sidecar_to_children' yield URLs;
    # confirm whether other post types need separate handling.
    edges = (media.get('edge_sidecar_to_children') or {}).get('edges') or []

    urls = []
    for edge in edges:
        node = edge.get('node') or {}
        if 'display_url' in node:
            urls.append(node['display_url'])
    return urls

In [None]:
# Instagram
def download_instagram_images(url, file_save_path, timeout=30):
    """Download every image of the Instagram post at ``url``.

    The post's shortcode is taken from the first ``<link>`` tag whose href
    contains a URL-encoded ``/p/<shortcode>/`` segment. Images are written to
    ``<file_save_path>_<n>.jpg``. Request and lookup errors are printed, not
    raised.

    Args:
        url: address of the Instagram page.
        file_save_path: destination path without index or extension.
        timeout: seconds to wait for each HTTP response.
    """
    try:
        initial_response = requests.get(url, timeout=timeout)
        initial_response.raise_for_status()
        soup = BeautifulSoup(initial_response.content, 'html.parser')

        shortcode = None
        for tag in soup.find_all('link'):
            href = tag.get('href')
            if not href:
                continue
            match = re.search(r'%2Fp%2F([\w-]+)%2F', href)
            if match:
                shortcode = match.group(1)
                break

        if shortcode is None:
            # Report the miss instead of querying the API without a shortcode
            raise ValueError("No Instagram shortcode found in the page's link tags")

        downloadURLs = fetch_instagram_urls(shortcode)
        for i, image_url in enumerate(downloadURLs):
            img_response = requests.get(image_url, timeout=timeout)
            img_response.raise_for_status()
            with open(file_save_path + f"_{i}.jpg", 'wb') as file:
                file.write(img_response.content)

    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")
    except requests.exceptions.RequestException as err:
        print(f"Error occurred during requests: {err}")
    except ValueError as err:
        print(err)

#### 2.3.1.2 Facebook URLs

In [None]:
# Facebook
def download_facebook_image(url, file_save_path, timeout=30):
    """Download the preview image of the Facebook page at ``url``.

    The page is expected to contain a ``<meta http-equiv="refresh">`` tag;
    the page it redirects to is fetched and its ``og:image`` is saved as
    ``<file_save_path>.jpg``. Request and lookup errors are printed, not
    raised.

    Args:
        url: address of the Facebook page.
        file_save_path: destination path without extension.
        timeout: seconds to wait for each HTTP response.
    """
    try:
        initial_response = requests.get(url, timeout=timeout)
        initial_response.raise_for_status()
        soup = BeautifulSoup(initial_response.content, 'html.parser')
        meta_refresh = soup.find('meta', attrs={'http-equiv': 'refresh'})
        if not meta_refresh:
            raise ValueError("No redirect URL found in the meta tag")

        # Take everything after the first "url=" (any letter case), so a
        # redirect target that itself contains "url=" is not cut short
        target = re.search(r'url=(.+)', meta_refresh.get('content') or '',
                           flags=re.IGNORECASE | re.DOTALL)
        redirect_url = target.group(1).strip().strip('\'"') if target else ''
        if not redirect_url:
            raise ValueError("No redirect URL found in the meta tag")

        response = requests.get(redirect_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        og_image = soup.find('meta', property='og:image')
        if not og_image or not og_image.get('content'):
            raise ValueError("Image URL not found")

        img_response = requests.get(og_image['content'], timeout=timeout)
        img_response.raise_for_status()
        with open(file_save_path + ".jpg", 'wb') as file:
            file.write(img_response.content)

    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")
    except requests.exceptions.RequestException as err:
        print(f"Error occurred during requests: {err}")
    except ValueError as err:
        print(err)

### 2.3.2 Create Download Manager

In [4]:
class DownloadManager:
    """Download the image URLs held in search-list items to local folders.

    Images of an item are stored in a sub-directory of
    ``download_directory_path`` named after ``item['label']``.
    """

    # Seconds to wait for an HTTP response before giving up, so a stalled
    # connection cannot hang a bulk download forever
    request_timeout = 30

    def __init__(self, download_directory_path='../02. Datasets/Images'):
        self.download_directory_path = download_directory_path

    def _save_url(self, url, file_path):
        """Write the body of ``url`` to ``file_path``; return True on HTTP 200."""
        response = requests.get(url, timeout=self.request_timeout)
        if response.status_code != 200:
            print(f"Failed to download image from {url} with status code {response.status_code}")
            return False
        with open(file_path, 'wb') as f:
            f.write(response.content)
        return True

    def download_google_urls_for_item(self, item, lang_code, site):
        """Download the URLs stored for one site/language of an item.

        Files are named ``<site>_<lang_code>_<index>``. Facebook and Instagram
        URLs are handed to their dedicated download functions; all other URLs
        are fetched directly. A failing URL is reported and skipped.
        """
        if lang_code not in item['lang'] or site not in item['imageURLs']['site']:
            print(f"Language code '{lang_code}' or site '{site}' not found in item.")
            return

        label = item['label']
        image_urls = item['imageURLs']['site'][site]['lang'][lang_code]

        # Create a sub-directory for the label
        label_directory = os.path.join(self.download_directory_path, label)
        os.makedirs(label_directory, exist_ok=True)

        for i, url in enumerate(image_urls):
            # Construct the file name with label/site_lang_index format
            file_path = os.path.join(label_directory, f"{site}_{lang_code}_{i}")

            try:
                # Only works for 'Facebook' URLs
                if site == 'facebook.com':
                    download_facebook_image(url, file_path)
                # Only works for 'Instagram' URLs
                elif site == 'instagram.com':
                    download_instagram_images(url, file_path)
                # Works for ALL other URLs
                else:
                    self._save_url(url, file_path)

            except Exception as e:
                print(f"An error occurred while downloading {url}: {e}")

    def download_baidu_urls_for_item(self, item):
        """Download the URLs stored in the item's 'baidu.com' list.

        Files are named ``baidu.com_zh_<index>`` plus ``.png`` when the URL
        carries ``&f=PNG`` (and not ``&f=JPEG``), otherwise ``.jpg``. Empty
        URLs are skipped and one second is waited after every request.
        """
        site = 'baidu.com'
        lang_code = 'zh'
        label = item['label']
        image_urls = item['imageURLs']['site'][site]

        # Terminate if image_urls is empty
        if not image_urls:
            return

        label_directory = os.path.join(self.download_directory_path, label)
        os.makedirs(label_directory, exist_ok=True)

        for i, url in enumerate(image_urls):
            if not url:
                continue
            # JPEG takes precedence; anything unrecognised defaults to .jpg
            is_png = '&f=PNG' in url and '&f=JPEG' not in url
            file_extension = '.png' if is_png else '.jpg'
            file_path = os.path.join(label_directory, f"{site}_{lang_code}_{i}{file_extension}")
            try:
                self._save_url(url, file_path)
                time.sleep(1)
            except Exception as e:
                print(f"An error occurred while downloading {url}: {e}")

    def download_images(self, image_urls, save_folder):
        """Download ``image_urls`` into ``save_folder`` as consecutively numbered files.

        Numbering continues after the highest index already present in the
        folder. Failed downloads are reported and skipped.
        """
        os.makedirs(save_folder, exist_ok=True)
        start_index = self.find_next_image_index(save_folder)
        for i, url in enumerate(image_urls, start=start_index):
            try:
                self._save_url(url, os.path.join(save_folder, f'{i}'))
            except Exception as e:
                print(f"An error occurred while downloading {url}: {e}")

    def find_next_image_index(self, save_folder):
        """Find next available image index in the folder.

        A file counts when its whole name is a number, optionally followed by
        an extension ("12" or "12.jpg"). This covers the extension-less files
        written by ``download_images``, so a second run continues the
        numbering instead of starting again at 0 and overwriting them.
        """
        numbered_name = re.compile(r"(\d+)(?:\.\w+)?")
        max_index = -1
        for file_name in os.listdir(save_folder):
            match = numbered_name.fullmatch(file_name)
            if match:
                max_index = max(max_index, int(match.group(1)))
        return max_index + 1

# Instantiate DownloadManager with its default download directory
downloadManager = DownloadManager()

### 2.3.3 Bulk <u>Download Images from URLs</u>: Using Dataset_URLs.json

#### 2.3.3.1 Google Search Engine URLs

In [6]:
# Sites whose downloads are finished; every site handled below is appended
downloadCompleted = []

for site in searchManager.static_websites:
    # Skip sites that are already marked as completed
    if site in downloadCompleted:
        continue

    print("="*100)
    print(f"Starting downloads for: {site}...")
    print("="*100)

    # Range of items to download (both ends inclusive)
    start_index = 0
    end_index = 236

    # Walk through the selected items of the search list
    for i in range(start_index, end_index + 1):
        try:
            # Current item, and the language whose URLs are downloaded
            item = searchManager.search_list[i]
            lang_code = 'en'

            # Fetch every stored URL of this item for the current site
            downloadManager.download_google_urls_for_item(item, lang_code, site)

        except IndexError:
            print(f"Index {i} is out of range in the search list")
        except Exception as e:
            print(f"An error occurred while downloading images for class {i}: {e}")

    print("="*100)
    print(f"Completed downloads for: {site}")
    print("="*100)
    downloadCompleted.append(site)

Starting downloads for: flickr.com...
Failed to download image from https://live.staticflickr.com/65535/53042802574_7be7797eaf.jpg
Failed to download image from https://live.staticflickr.com/65535/52715645732_2521d62c18_n.jpg
Failed to download image from https://live.staticflickr.com/65535/51096894715_9c0110b852_z.jpg
Failed to download image from https://live.staticflickr.com/65535/51087999716_0922e78717.jpg
Failed to download image from https://live.staticflickr.com/65535/51096169369_976ab2d415_n.jpg
Failed to download image from https://live.staticflickr.com/5100/5408054914_69f4ec3946_w.jpg
Completed downloads for: flickr.com
Starting downloads for: researchgate.net...
Failed to download image from https://www.researchgate.net/publication/236929625/figure/fig13/AS:393138054156288@1470742824096/Acne-keloidalis-nuchae-AKN-a-Mild-moderate-AKN-with-numerous-firm-follicular-papules.png
Failed to download image from https://www.researchgate.net/publication/317150825/figure/fig4/AS:498278

#### 2.3.3.2 Baidu Search Engine URLs

In [5]:
# Specify range of items to download (both ends inclusive)
start_index = 0
end_index = 236

# Never iterate past the end of the search list
last_index = min(end_index, len(searchManager.search_list) - 1)
if last_index < end_index:
    print(f"Search list has only {len(searchManager.search_list)} items; stopping at index {last_index}.")

# Iterate over the specified range of items in the search list
for i in range(start_index, last_index + 1):
    # Resolve the item and its label before the try block, so the error
    # message below always names the item that actually failed
    item = searchManager.search_list[i]
    label = item.get('label', '')
    try:
        if not item['imageURLs']['site']['baidu.com']:
            # Nothing stored for this item: skip it without the 20 s pause
            continue
        downloadManager.download_baidu_urls_for_item(item)
        print(f"Successfully downloaded all images for item {i} {label}")
        # Pause between items before requesting the next batch
        time.sleep(20)
    except Exception as e:
        print(f"An error occurred while downloading images for item {i} {label}: {e}")

Successfully downloaded all images for item 0 Acne Keloid
Successfully downloaded all images for item 1 Acne Conglobata
Successfully downloaded all images for item 2 Chloracne
Successfully downloaded all images for item 21 Dermatitis Herpetiformis
Successfully downloaded all images for item 22 Dermatitis, Atopic
Successfully downloaded all images for item 23 Dermatitis, Photoallergic
Successfully downloaded all images for item 25 Dermatitis, Phototoxic
Successfully downloaded all images for item 27 Dermatitis, Occupational
Successfully downloaded all images for item 28 Dermatitis, Exfoliative
Successfully downloaded all images for item 29 Dermatitis, Perioral
Successfully downloaded all images for item 30 Dermatitis, Seborrheic
Successfully downloaded all images for item 33 Erythema Nodosum
Successfully downloaded all images for item 36 Serum Sickness
Successfully downloaded all images for item 37 Stevens-Johnson Syndrome
Successfully downloaded all images for item 38 Eczema, Dyshidrot