# Imports and Function Definitions

In [0]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

In [0]:
def run(start_date, end_date, category, material_group):
    """Fetch per-material sales for one category / material group.

    Builds a Spark SQL statement over ``gold.transaction.uae_pos_transactions``
    joined to ``gold.material.material_master`` and ``gold.store.store_master``,
    executes it, and hands the result back as a pandas DataFrame.

    The four arguments are interpolated directly into the SQL text (wrapped in
    double quotes), so a value that itself contains a double quote would break
    the statement.

    Args:
        start_date: Lower bound for ``business_day`` (``BETWEEN`` is inclusive).
        end_date: Upper bound for ``business_day`` (inclusive).
        category: Value compared against ``category_name``.
        material_group: Value compared against ``material_group_name``.

    Returns:
        The result of ``toPandas()`` on the query: one row per
        (material_id, material_name, brand, material_group_name,
        category_name) with the rounded summed ``amount`` as ``sales``,
        ordered by ``material_id``.

    Note:
        Relies on a ``spark`` session existing in the global scope; it is not
        defined in the visible cells (presumably injected by the runtime).
    """
    sales_sql = f"""
    SELECT
        t2.material_id,
        t2.material_name,
        t2.brand,
        t2.material_group_name,
        t2.category_name,
        ROUND(SUM(t1.amount)) AS sales
    FROM gold.transaction.uae_pos_transactions AS t1
    JOIN gold.material.material_master AS t2 ON t1.product_id = t2.material_id
    JOIN gold.store.store_master AS t3 ON t1.store_id = t3.store_id
    WHERE
        t1.business_day BETWEEN "{start_date}" AND "{end_date}"
        AND t2.category_name = "{category}"
        AND t2.material_group_name = "{material_group}"
        AND t3.tayeb_flag = 0
        AND t1.transaction_type IN ("SALE", "SELL_MEDIA")
        AND t1.amount > 0
        AND t1.quantity > 0
    GROUP BY 1, 2, 3, 4, 5
    ORDER BY 1
    """
    # Run on Spark first, then materialise the (aggregated) rows locally.
    sales_sdf = spark.sql(sales_sql)
    return sales_sdf.toPandas()

In [0]:
def web_scrape_www(df, country, timeout=30):
    """Look up each material's product-page name on www.luluhypermarket.com.

    For every ``material_id`` in ``df`` the product page is requested and the
    text of its ``<h1 class="product-name">`` element is taken as the long
    material name. Materials whose page has no such element, or whose request
    fails, fall back to the ``material_name`` already present in ``df`` and
    get a missing ``link``.

    Args:
        df: pandas DataFrame with at least the columns ``material_id``,
            ``material_name`` and ``brand``. ``material_id`` must survive a
            round trip through ``str`` and back to ``int64``.
        country: Locale suffix placed after ``en-`` in the URL (the notebook
            passes values such as ``'qa'``).
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the whole scrape.

    Returns:
        pandas DataFrame with columns ``material_id`` (int64),
        ``material_name`` (scraped name, else the name from ``df``), ``link``
        (the product URL, missing when nothing was scraped) and ``brand``.
    """
    import warnings  # local import: only needed for reporting failed requests

    material_ids = df['material_id'].astype(str).tolist()
    # Loop-invariant, so built once instead of once per request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

    names, links = [], []
    for material_id in material_ids:
        # NOTE(review): the double slash before /p/ is kept exactly as it was
        # (looks like an empty name segment) — confirm the site still accepts it.
        url = f'https://www.luluhypermarket.com/en-{country}//p/{material_id}'

        name_tag = None
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable page should not throw away every result so far;
            # report it and treat the material as "not found".
            warnings.warn(
                f'Request failed for material {material_id}: {exc}',
                RuntimeWarning,
                stacklevel=2,
            )
        else:
            page = BeautifulSoup(response.content, 'html.parser')
            name_tag = page.find('h1', class_='product-name')

        if name_tag is None:
            names.append(None)
            links.append(None)
        else:
            names.append(name_tag.text.strip())
            links.append(url)

    scraped = pd.DataFrame(
        {'material_id': material_ids, 'material_name_long': names, 'link': links}
    )
    # IDs were stringified for the URL; cast back so the merge key matches df.
    scraped['material_id'] = scraped['material_id'].astype('int64')

    merged = scraped.merge(
        df[['material_id', 'material_name', 'brand']], on='material_id', how='inner'
    )
    # Prefer the scraped name; fall back to the master-data name.
    merged['material_name_long'] = merged['material_name_long'].fillna(merged['material_name'])
    return (
        merged
        .drop(columns='material_name')
        .rename(columns={'material_name_long': 'material_name'})
    )

In [0]:
def _parse_gcc_listing(page_text, material_id, country, lookback=200):
    """Extract one product's name from gcc listing markup and build its link.

    Searches ``page_text`` for the escaped ``sku`` entry of ``material_id`` and
    reads the value of the nearest preceding escaped ``name`` key within
    ``lookback`` characters. Returns ``(name, link)``, or ``(None, None)`` when
    the SKU, the name key, or a non-empty name cannot be found.
    """
    sku_marker = f'\\",\\"sku\\":\\"{material_id}\\"'
    sku_pos = page_text.find(sku_marker)
    if sku_pos == -1:
        return None, None

    # Clamp at 0: a negative slice start would count from the end of the page.
    window = page_text[max(0, sku_pos - lookback):sku_pos]

    name_marker = '"name\\":\\"'
    # rfind: the name that belongs to this SKU is the one closest to it; an
    # earlier "name" key in the window would belong to a different entry.
    name_pos = window.rfind(name_marker)
    if name_pos == -1:
        return None, None

    name = window[name_pos + len(name_marker):].replace("\\u0026", "&")
    if not name:
        return None, None

    slug = name.lower().replace(" & ", "-").replace("%", "")
    slug = re.sub(r'[^a-zA-Z0-9]', '-', slug)
    link = f'https://gcc.luluhypermarket.com/en-{country}/{slug}/p/{material_id}/'
    return name, link


def web_scrape_gcc(df, country, timeout=30):
    """Look up each material's name via the gcc.luluhypermarket.com search page.

    For every ``material_id`` in ``df`` the search listing is requested, the
    ``<body class="overflow-x-hidden scheduled">`` element is stringified, and
    the product name sitting next to the matching escaped ``sku`` entry is
    extracted. A product link is then assembled from a slug of that name.
    Materials that cannot be resolved (request failure, body element missing,
    SKU or name not present) fall back to the ``material_name`` already in
    ``df`` and get a missing ``link``.

    Args:
        df: pandas DataFrame with at least the columns ``material_id``,
            ``material_name`` and ``brand``. ``material_id`` must survive a
            round trip through ``str`` and back to ``int64``.
        country: Locale suffix placed after ``en-`` in the URLs (the notebook
            passes values such as ``'ae'``).
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the whole scrape.

    Returns:
        pandas DataFrame with columns ``material_id`` (int64),
        ``material_name`` (scraped name, else the name from ``df``), ``link``
        (constructed product URL, missing when nothing was scraped) and
        ``brand``.
    """
    import warnings  # local import: only needed for reporting failed requests

    material_ids = df['material_id'].astype(str).tolist()
    # Loop-invariant, so built once instead of once per request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'}

    names, links = [], []
    for material_id in material_ids:
        url = f'https://gcc.luluhypermarket.com/en-{country}/list/?search_text={material_id}'

        name, link = None, None
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable page should not throw away every result so far;
            # report it and treat the material as "not found".
            warnings.warn(
                f'Request failed for material {material_id}: {exc}',
                RuntimeWarning,
                stacklevel=2,
            )
        else:
            page = BeautifulSoup(response.content, 'html.parser')
            body = page.find('body', class_="overflow-x-hidden scheduled")
            # A missing body used to be stringified to 'None' and parsed as if
            # it were markup, producing an empty name and a bogus link.
            if body is not None:
                name, link = _parse_gcc_listing(str(body), material_id, country)

        names.append(name)
        links.append(link)

    scraped = pd.DataFrame(
        {'material_id': material_ids, 'material_name_long': names, 'link': links}
    )
    # IDs were stringified for the URL; cast back so the merge key matches df.
    scraped['material_id'] = scraped['material_id'].astype('int64')

    merged = scraped.merge(
        df[['material_id', 'material_name', 'brand']], on='material_id', how='inner'
    )
    # Prefer the scraped name; fall back to the master-data name.
    merged['material_name_long'] = merged['material_name_long'].fillna(merged['material_name'])
    return (
        merged
        .drop(columns='material_name')
        .rename(columns={'material_name_long': 'material_name'})
    )

# UAE

In [0]:
# Sales between 2024-01-01 and 2024-12-29 for category 'BISCUITS & CAKES',
# material group 'RICE & OAT CAKE'. `df` is reused by every country cell below,
# so this cell has to run first.
df = run('2024-01-01', '2024-12-29', 'BISCUITS & CAKES', 'RICE & OAT CAKE')
# Look the materials up on the gcc site using the 'ae' locale.
df2 = web_scrape_gcc(df, 'ae')
# NOTE(review): `.display()` is not a stock pandas method — presumably supplied
# by the notebook runtime; confirm before running elsewhere.
df2.display()

In [0]:
# Column dtypes and non-null counts for the 'ae' result; the non-null count of
# `link` shows how many lookups produced a link.
df2.info()

# Kuwait

In [0]:
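# Reference markup kept as a note (presumably a product anchor seen on the
# listing page — confirm); it is not used by the scraper functions above:
# <a data-testid = "Deluxe & Bla Bla Gluten Free Rice Cakes Coated With Strawberry Yoghurt 115 g-0">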

In [0]:
# Same material list (`df` from the UAE cell), looked up on the gcc site with
# the 'kw' locale. Overwrites the previous `df2`.
df2 = web_scrape_gcc(df, 'kw')
df2.display()

In [0]:
# Column dtypes and non-null counts for the 'kw' result; the non-null count of
# `link` shows how many lookups produced a link.
df2.info()

# Saudi Arabia

In [0]:
# Same material list (`df` from the UAE cell), looked up on the gcc site with
# the 'sa' locale. Overwrites the previous `df2`.
df2 = web_scrape_gcc(df, 'sa')
df2.display()

In [0]:
# Column dtypes and non-null counts for the 'sa' result; the non-null count of
# `link` shows how many lookups produced a link.
df2.info()

# Qatar

In [0]:
# Same material list (`df` from the UAE cell); this locale goes through the
# www-site scraper rather than the gcc one. Overwrites the previous `df2`.
df2 = web_scrape_www(df, 'qa')
df2.display()

In [0]:
# Column dtypes and non-null counts for the 'qa' result; the non-null count of
# `link` shows how many product pages were found.
df2.info()

# Oman

In [0]:
# Same material list (`df` from the UAE cell); this locale goes through the
# www-site scraper rather than the gcc one. Overwrites the previous `df2`.
df2 = web_scrape_www(df, 'om')
df2.display()

In [0]:
# Column dtypes and non-null counts for the 'om' result; the non-null count of
# `link` shows how many product pages were found.
df2.info()

# Bahrain

In [0]:
# Same material list (`df` from the UAE cell); this locale goes through the
# www-site scraper rather than the gcc one. Overwrites the previous `df2`.
df2 = web_scrape_www(df, 'bh')
df2.display()

In [0]:
# Column dtypes and non-null counts for the 'bh' result; the non-null count of
# `link` shows how many product pages were found.
df2.info()