# Prerequisites

Upload the following files to the notebook's working directory before running any cell:
- headers.json
- product_label.txt
- product_review_links.txt

# Initialization

In [1]:
!pip install openpyxl
!pip install requests
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [29]:
from bs4 import BeautifulSoup as bs
from pathlib import Path
from typing import Optional,Union,Dict,List
import time
import os
import requests as requests
import json

# Dataset root directory and the train/test split directories beneath it.
path = Path('food_product')
path_train = path / 'train'
path_test = path / 'test'

In [30]:
def get_headers(
    key: str,
    default_value: Optional[str] = None
    )-> Union[Dict[str,str], str]:
    """Return the entry stored under ``key`` in ``headers.json``.

    The file is read from the current working directory on every call.

    Args:
        key: Top-level key to look up in the JSON document.
        default_value: Value returned when ``key`` is absent. Any value
            other than ``None`` (including an empty string) is honoured.

    Returns:
        The value stored under ``key``, or ``default_value`` when the key
        is missing and a default was supplied.

    Raises:
        EnvironmentError: If ``key`` is missing and no default was given.
        FileNotFoundError: If ``headers.json`` does not exist.
        json.JSONDecodeError: If ``headers.json`` is not valid JSON.
    """
    JSON_FILE : str = 'headers.json'

    with open(JSON_FILE,'r',encoding='UTF-8') as file:
        headers : Dict[str,Dict[str,str]] = json.load(file)

    try:
        return headers[key]
    except (KeyError, TypeError):
        # Only a failed lookup is handled here; unrelated errors propagate
        # instead of being hidden by a bare ``except``.
        if default_value is not None:
            return default_value
        raise EnvironmentError(f'Set the {key}')

def _save_review_images(dest, pages, count, timeout=10.0):
    """Download every URL in ``pages`` into ``dest`` and return the new counter."""
    for page in pages:
        for url in page:
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()
            except requests.RequestException as err:
                # Skip this image: there is nothing valid to write, and
                # reusing the previous response would duplicate an image.
                print(f"Skipping {url}: {err}")
                continue
            with open(f"{dest}/images-{count + 1}.jpg", "wb") as f:
                f.write(response.content)
            count += 1
    return count


def download_images(dest_train: str, dest_test: str, results: List, test_pages: int = 3):
    """Download scraped review images into train and test folders.

    ``results`` is a list of pages, each page being a list of image URLs.
    The last ``test_pages`` pages are written to ``dest_test`` and all
    earlier pages to ``dest_train``. With the default of 3 and 13 pages
    this is the same 10/3 split as before, but the boundary now follows
    the actual number of pages instead of a hard-coded index of 10.

    Files are named ``images-<n>.jpg`` where ``n`` is one running counter
    shared by both folders. URLs that fail to download (network error,
    timeout, HTTP error status) are skipped and do not consume a number.
    The total number of saved images is printed at the end.

    Args:
        dest_train: Existing directory that receives the training images.
        dest_test: Existing directory that receives the test images.
        results: List of pages; each page is an iterable of image URLs.
        test_pages: Number of trailing pages assigned to the test folder.
    """
    split = max(len(results) - test_pages, 0)

    count = _save_review_images(dest_train, results[:split], 0)
    count = _save_review_images(dest_test, results[split:], count)

    print(count)

class CoupangImageReview:
    """Collect review-image URLs from Coupang product review pages."""

    @staticmethod
    def get_product_code(url: str)-> str:
        """Return the text between ``products/`` and the query string of ``url``.

        If ``url`` contains no ``products/`` marker, the whole string up to
        the first ``?`` is returned.
        """
        prod_code : str = url.split('products/')[-1].split('?')[0]
        return prod_code

    def __init__(self)-> None:
        # Request headers come from the 'headers' entry of headers.json.
        self.__headers : Dict[str,str] = get_headers(key='headers')

    def main(self, link: str, page_count: int) -> List[List[str]]:
        """Fetch review pages 1..``page_count`` for the product at ``link``.

        Args:
            link: Product page URL; also sent as the ``referer`` header.
            page_count: Number of review pages to request.

        Returns:
            One list of image URLs per requested page, in page order.
        """
        URL : str = link

        prod_code : str = self.get_product_code(url=URL)

        URLS : List[str] = [f'https://www.coupang.com/vp/product/reviews?productId={prod_code}&page={page}&size=5&sortBy=ORDER_SCORE_ASC&ratings=&q=&viRoleCode=3&ratingSummary=true' for page in range(1, page_count + 1)]

        self.__headers['referer'] = URL

        with requests.Session() as session:
            return [self.fetch(url=url, session=session) for url in URLS]

    def fetch(self, url: str, session) -> List[str]:
        """Return the non-empty image ``src`` values found on one review page.

        Reviews without an attachment container, attachment entries without
        an ``<img>`` element, and images with a missing or empty ``src`` are
        skipped rather than raising.

        Args:
            url: Review page URL to request.
            session: A ``requests.Session`` used to perform the GET.
        """
        save_data : List[str] = []

        with session.get(url=url, headers=self.__headers) as response :
            soup = bs(response.text,'html.parser')

            # Select the articles once instead of re-querying per iteration.
            for article in soup.select('article.sdp-review__article__list'):
                img_con = article.select_one('div.sdp-review__article__list__attachment')
                if img_con is None:
                    # This review has no attachment container.
                    continue

                for img_item in img_con.select('div.sdp-review__article__list__attachment__list'):
                    img_tag = img_item.select_one('img.sdp-review__article__list__attachment__img')
                    if img_tag is None:
                        continue
                    src = img_tag.attrs.get('src')
                    if src:
                        save_data.append(src)

            # Pause between page requests.
            time.sleep(1)

            return save_data

    @staticmethod
    def clear_console() -> None:
        """Clear the terminal using ``cls`` on nt/dos and ``clear`` elsewhere."""
        command: str = 'clear'
        if os.name in ('nt','dos'):
            command = 'cls'
        os.system(command=command)

In [22]:
class GetReviewImages:
    """Scrape the review images of one product into its label folders."""

    @staticmethod
    def get_images(label: str, link: str, page_count: int)-> None:
        """Collect review images for ``link`` and save them under ``label``.

        Args:
            label: Sub-folder name created under both the train and test
                directories.
            link: Product page URL whose reviews are scraped.
            page_count: Number of review pages to request.
        """
        results = CoupangImageReview().main(link, page_count)
        dest_train = (path_train/label)
        dest_test = (path_test/label)

        # Create each folder independently, so an already existing train
        # folder no longer prevents the test folder from being created.
        for dest in (dest_train, dest_test):
            os.makedirs(dest, exist_ok=True)

        download_images(dest_train, dest_test, results)

# Run Main Function

In [None]:
if __name__ == '__main__':
    # Read product labels and review links, one entry per line.
    # Blank lines are ignored so they cannot become empty labels or links.
    with open('product_label.txt', 'r', encoding='UTF-8') as file:
        product_labels = [line.strip() for line in file if line.strip()]

    with open('product_review_links.txt', 'r', encoding='UTF-8') as file:
        product_review_links = [line.strip() for line in file if line.strip()]

    if len(product_labels) != len(product_review_links):
        # Labels and links are paired by position; only complete pairs run.
        print(
            f'Warning: {len(product_labels)} labels but '
            f'{len(product_review_links)} links; extra entries are ignored.'
        )

    page_count = 13

    # Create each output directory independently, so an existing directory
    # does not stop the remaining ones from being created.
    for directory in (path, path_train, path_test):
        os.makedirs(directory, exist_ok=True)

    for label, link in zip(product_labels, product_review_links):
        GetReviewImages.get_images(label, link, page_count)

In [None]:
import shutil

# Directory whose contents are archived.
folder_path = "/content/food_product"

# Archive base name; make_archive appends the ".zip" extension itself.
zip_path = "/content/food_product"

# make_archive returns the path of the archive it created. Download that
# file rather than the base name, which refers to the source directory.
archive_file = shutil.make_archive(zip_path, 'zip', folder_path)

from google.colab import files
files.download(archive_file)