In [None]:
!apt update
!apt install chromium-chromedriver
!pip install selenium
!pip install googletrans==4.0.0-rc1

import os
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
from googletrans import Translator
import time

# Build the Chrome options once and apply every command-line flag from a single tuple.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

# Shared Selenium driver and googletrans client used by the cells below.
browser = webdriver.Chrome(options=options)
translator = Translator()

In [None]:
#Cleaning up json files
#For every product JSON in folder_path: fill in a missing title from the file name,
#scrape the product image URL from the product page, normalise misspelled categories,
#and write the updated product back to the same file (once per file).

folder_path = '/content/drive/MyDrive/VNP/chatbot/parsed_products'
files = os.listdir(folder_path)

default_image_url = "https://png.pngtree.com/png-vector/20221125/ourmid/pngtree-no-image-available-icon-flatvector-illustration-pic-design-profile-vector-png-image_40966566.jpg"

#Misspelled / inconsistent category names mapped to the canonical spelling
category_fixes = {
    'Keyborad': 'Keyboard',
    'Kayboard': 'Keyboard',
    'Headhones': 'Headphones',
    'Accssories': 'Accessories',
    'Soundbars': 'Soundbar',
    'Phone': 'Smartphone',
}

for file_name in files:
    #Skip anything in the folder that is not a product JSON (json.load would fail on it)
    if not file_name.endswith('.json'):
        continue
    file_path = os.path.join(folder_path, file_name)

    #Read and close the file before rewriting it; UTF-8 is explicit because the
    #files are dumped with ensure_ascii=False and so contain raw non-ASCII text
    with open(file_path, 'r', encoding='utf-8') as json_file:
        product = json.load(json_file)

    #Adding missing titles
    if not product.get('title'):
        #if title is empty (or absent) get it from the file name
        product['title'] = os.path.splitext(file_name)[0].replace('-', ' ')

    #Scraping images
    try:
        #The url lookup is inside the try so a product without a url also gets the default image
        browser.get(product['url'])
        image = browser.find_elements(By.CSS_SELECTOR, "img.ng-scope")[0].get_attribute('src')
    except Exception:
        image = None
    #if image doesn't exist (lookup failed or src is empty), use a default 'no-image-available' image
    product['image'] = image or default_image_url

    #Cleaning up misspelled categories
    category = product.get('category')
    if category in category_fixes:
        product['category'] = category_fixes[category]

    #Single write per file: the image is always (re)assigned, so the file always changes
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(product, json_file, ensure_ascii=False, indent=4)

In [14]:
#Transferring json to csv, translating from mk to en, converting prices
#Builds two tables from the product JSON files:
#  products     - one row per product (id first), title translated and upper-cased, prices converted
#  descriptions - one row per (product_id, key, value) description entry, translated
#Both are written as '|'-separated CSV files.

folder_path = '/content/drive/MyDrive/VNP/chatbot/parsed_products'
#sorted so that ids are assigned in a reproducible order (os.listdir order is arbitrary)
files = sorted(os.listdir(folder_path))

#NOTE(review): these two frames are not used anywhere in this cell, they are read with the
#default ',' separator although the files are written with sep='|' below, and the reads
#fail if the CSVs do not exist yet - confirm whether a later cell needs them.
products_csv = pd.read_csv('/content/drive/MyDrive/VNP/chatbot/products.csv')
descriptions_csv = pd.read_csv('/content/drive/MyDrive/VNP/chatbot/descriptions.csv')

conversion_rate = 54.98 # Conversion rate from MKD to USD
product_id = 0

product_list = []
description_list = []

#Successful translations keyed by source text, so repeated strings
#(e.g. description keys shared by many products) are only requested once
translation_cache = {}


def _translate_mk_to_en(text):
    """Return `text` translated from Macedonian to English.

    Falls back to returning `text` unchanged when it is empty, not a string,
    or when the translation request raises. Only successful translations are
    cached, so a failed string is retried the next time it is seen.
    """
    if not isinstance(text, str) or not text.strip():
        return text
    if text in translation_cache:
        return translation_cache[text]
    try:
        translated = translator.translate(text, src='mk', dest='en').text
        time.sleep(2) #pause between successful requests, as before
    except Exception:
        return text #best effort: keep the untranslated text
    translation_cache[text] = translated
    return translated


def _mkd_to_usd(raw_price):
    """Convert a price string in MKD to a USD float rounded to 2 decimals.

    '.' characters are removed before parsing (presumably thousands
    separators - TODO confirm). Returns None for an empty or missing price
    so the value becomes NaN in the DataFrame and fillna() can handle it.
    """
    digits = (raw_price or '').replace('.', '')
    if not digits:
        return None
    return round(float(digits) / conversion_rate, 2)


for file_name in files:
    #Skip anything in the folder that is not a product JSON
    if not file_name.endswith('.json'):
        continue
    file_path = os.path.join(folder_path, file_name)

    with open(file_path, 'r', encoding='utf-8') as json_file:
        product = json.load(json_file)

    #Descriptions
    product['id'] = product_id #used to keep the relationship between the product and the descriptions
    description = product.pop('description', {})
    for key, value in description.items():
        key = _translate_mk_to_en(key).replace(',', ' ')
        value = _translate_mk_to_en(value).replace(',', ' ')
        description_list.append({'product_id': product_id, 'key': key, 'value': value})

    #Translating Titles
    #On a failed translation the product keeps its own (untranslated) title instead of
    #raising NameError or inheriting the title of the previously processed product
    title = _translate_mk_to_en(product.get('title') or '')
    product['title'] = title.upper().replace(',', ' ')

    #Converting Prices
    product['regular_price'] = _mkd_to_usd(product.get('regular_price'))
    product['happy_price'] = _mkd_to_usd(product.get('happy_price'))

    #Appending Data
    product_list.append(product)

    product_id += 1


products = pd.DataFrame(product_list)
id_column = products.pop('id')
products.insert(0, 'id', id_column) #Making sure id is first column
#Explicit columns so the frame has a 'value' column even when no descriptions were collected
descriptions = pd.DataFrame(description_list, columns=['product_id', 'key', 'value'])

products['happy_price'] = products['happy_price'].fillna(0)
products['warranty'] = products['warranty'].fillna(0)
#Collapse line breaks inside description values so every record stays on one CSV line
descriptions['value'] = descriptions['value'].str.replace('\n', ' ', regex=True)

products.to_csv('/content/drive/MyDrive/VNP/chatbot/products.csv', index=False, sep='|')
descriptions.to_csv('/content/drive/MyDrive/VNP/chatbot/descriptions.csv', index=False, sep='|')