In [1]:
import base64
import os
import re
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor, as_completed

# Chrome options shared by every WebDriver instance created in capture_screenshot.
options = Options()
# Selenium 4 deprecated and later removed the `Options.headless` property setter;
# on current releases assigning to it has no effect and Chrome would open a
# visible window. Passing the browser flag directly works regardless of the
# Selenium version (the "new" headless mode needs Chrome 109 or later).
options.add_argument("--headless=new")
# Fixed viewport so every screenshot has the same dimensions.
options.add_argument("--window-size=960,720")

def capture_screenshot(domain):
    """Open ``https://<domain>`` in Chrome and return a screenshot of the page.

    Args:
        domain: Host name (and optional path) without a scheme; ``https://`` is
            prepended before navigating.

    Returns:
        A ``(domain, screenshot)`` tuple where ``screenshot`` is the
        base64-encoded image string returned by the WebDriver.

    Raises:
        Whatever the WebDriver raises while starting, navigating, running the
        zoom script or taking the screenshot. The browser is shut down before
        the exception propagates.
    """
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(f"https://{domain}")
        # Zoom out so more of the page fits into the fixed-size window.
        driver.execute_script("document.body.style.zoom='50%'")
        screenshot = driver.get_screenshot_as_base64()
    finally:
        # Always release the browser process, even when navigation or the
        # screenshot fails; otherwise every failed domain leaks a Chrome instance.
        driver.quit()
    return domain, screenshot

def get_category(api_key, preprompt, domain, base64_image, max_attempts=3, timeout=60):
    """Ask the OpenAI chat-completions API to classify a website.

    The system message carries ``preprompt``; the user message carries the
    domain name plus the screenshot as an inline data URL. The first standalone
    single digit found in the model's reply is taken as the category.

    Args:
        api_key: OpenAI API key, sent as a Bearer token.
        preprompt: System prompt describing the classification task.
        domain: Domain name shown to the model.
        base64_image: Base64-encoded PNG screenshot of the site.
        max_attempts: How many times to call the API before giving up.
        timeout: Per-request timeout in seconds passed to ``requests.post``.

    Returns:
        The category as an ``int``, or ``None`` when every attempt failed or
        the reply contained no standalone digit.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-4o",
        "messages": [{
            "role": "system",
            "content": [
                { "type": "text", "text": preprompt}
            ]
        },{
            "role": "user",
            "content": [
                { "type": "text", "text": domain },
                # WebDriver screenshots are PNG, so label the data URL accordingly.
                { "type": "image_url", "image_url": { "url": f"data:image/png;base64,{base64_image}" }}
            ]
        }], "max_tokens": 2500
    }

    # Pre-bind the result so exhausting all attempts yields "no category"
    # instead of an UnboundLocalError further down.
    category = []
    for _ in range(max_attempts):
        try:
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=timeout,  # without this a stalled connection blocks the worker forever
            )
            content = response.json()['choices'][0]['message']['content']
            category = re.findall(r'\b\d\b', content)
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
            # Network failure, non-JSON body, error payload without 'choices',
            # or a null message content: try again.
            continue
        # A well-formed reply ends the retry loop even if it held no digit.
        break

    print(domain, category[0] if category else '-1')

    return int(category[0]) if category else None

def process_domain(api_key, preprompt, domain):
    """Screenshot a single domain, store the image on disk and classify it.

    The screenshot is written to ``screenshots/<name>.png`` where ``<name>`` is
    the domain with any scheme prefix removed and slashes turned into
    underscores. The ``screenshots`` directory must already exist.

    Returns:
        A ``(domain, category)`` tuple; ``category`` is whatever
        ``get_category`` returns for the screenshot.
    """
    domain, encoded_image = capture_screenshot(domain)

    # Derive a flat file name from the domain.
    file_stem = domain
    for needle, substitute in (('http://', ''), ('https://', ''), ('/', '_')):
        file_stem = file_stem.replace(needle, substitute)
    destination = os.path.join("screenshots", f"{file_stem}.png")

    with open(destination, "wb") as image_file:
        image_file.write(base64.b64decode(encoded_image))

    return domain, get_category(api_key, preprompt, domain, encoded_image)

def process_domains(api_key, preprompt, domains, data, output_file, max_workers=10):
    """Classify many domains concurrently and persist the results as CSV.

    Each domain is handed to ``process_domain`` on a thread pool. Results are
    appended to ``data`` (mutated in place) as ``[domain, category]`` rows in
    completion order; a domain whose processing raised is recorded with a
    ``None`` category instead of aborting the whole batch.

    If ``output_file`` already exists, the stored rows are left-joined with the
    new results on ``domain``: the new category lands in ``category_new`` and
    ``voted_category`` holds the category when both runs agree, otherwise the
    string ``'NULL'``. Domains absent from the existing file are not added by
    this join.

    Args:
        api_key: OpenAI API key forwarded to ``process_domain``.
        preprompt: System prompt forwarded to ``process_domain``.
        domains: Iterable of domain names to process.
        data: List that receives one ``[domain, category]`` row per domain.
        output_file: Path of the CSV file to create or update.
        max_workers: Size of the thread pool.

    Returns:
        The DataFrame that was written to ``output_file``.
    """
    os.makedirs("screenshots", exist_ok=True)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which domain each future belongs to so a failure can still
        # be attributed to its domain.
        pending = {
            executor.submit(process_domain, api_key, preprompt, domain): domain
            for domain in domains
        }
        for future in as_completed(pending):
            requested = pending[future]
            try:
                domain, category = future.result()
            except Exception as exc:
                # One broken site must not discard the results of all others.
                print(f"Domain: {requested}, failed: {exc!r}")
                data.append([requested, None])
                continue
            print(f"Domain: {domain}, Category: {category}")
            data.append([domain, category])

    df = pd.DataFrame(data, columns=['domain', 'category'])

    if os.path.exists(output_file):
        existing_df = pd.read_csv(output_file)
        # Drop columns derived by a previous merge; otherwise the join below
        # would produce a second 'category_new' column and the row-wise
        # comparison would fail on the third and later runs.
        existing_df = existing_df.drop(columns=['category_new', 'voted_category'], errors='ignore')
        existing_df = existing_df.set_index('domain').join(df.set_index('domain'), rsuffix='_new').reset_index()
        existing_df['voted_category'] = existing_df.apply(
            lambda row: row['category'] if row['category'] == row['category_new'] else 'NULL', axis=1)
        df = existing_df

    df.to_csv(output_file, index=False)
    return df

In [2]:
# Every sub-directory of `images` is named after a domain to classify.
images_folder_path = 'images'
# Sorted so the submission order does not depend on the arbitrary order
# returned by os.listdir.
domains = sorted(
    d for d in os.listdir(images_folder_path)
    if os.path.isdir(os.path.join(images_folder_path, d))
)

# Read the key from the environment so it is never stored in the notebook.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

# System prompt sent with every classification request.
preprompt = """
You are an expert in website classification. Based on the given domain name and screenshot of website, classify the website into one of these categories:

1. E commerce (online shops)
2. Information/Educational (wikipedia, etc)
3. Business (company website)
4. News (bbc, cnn, etc)
5. Social Media (reddit, etc)
6. Video streaming (youtube, etc)
7. Other (everything that cannot be classified into the above)

Provide ONLY the category number as output.

E.g

User: reddit.com
Response: 5

User: wikipedia.com
Response: 2

User: amazon.com
Response: 1

Please note that the screenshot will directly represent what is displayed in a live browser window. It may have an error, popup or not have loaded yet. You should be smart enough to understand this and not classify the website based on the image if it is not representative of the actual content on the website.

Now the user will send you their message.
"""

output_file = "domain_categories.csv"
# `data` is filled in place by process_domains with [domain, category] rows.
data = []
df = process_domains(api_key, preprompt, domains, data, output_file)

app.hubspot.com 7
Domain: app.hubspot.com, Category: 7
apps.facebook.com 5
Domain: apps.facebook.com, Category: 5
91club.club 7
Domain: 91club.club, Category: 7
91club06.com 7
Domain: 91club06.com, Category: 7
ar.m.wikipedia.org 2
Domain: ar.m.wikipedia.org, Category: 2
9animetv.to 6
Domain: 9animetv.to, Category: 6
aniwatchtv.to 6
Domain: aniwatchtv.to, Category: 6
account.samsung.com 3
Domain: account.samsung.com, Category: 3
as.com 4
Domain: as.com, Category: 4
ameblo.jp 7
Domain: ameblo.jp, Category: 7
articulo.mercadolibre.com.ar 1
Domain: articulo.mercadolibre.com.ar, Category: 1
articulo.mercadolibre.com.mx 1
Domain: articulo.mercadolibre.com.mx, Category: 1
auto.drom.ru 1
Domain: auto.drom.ru, Category: 1
bakusai.com 7
Domain: bakusai.com, Category: 7
baseball.yahoo.co.jp 4
Domain: baseball.yahoo.co.jp, Category: 4
battwo.com 7
Domain: battwo.com, Category: 7
bato.to 7
Domain: bato.to, Category: 7
bdvenlinea.banvenez.com 7
Domain: bdvenlinea.banvenez.com, Category: 7
auctions.y

In [3]:
# Show the [domain, category] rows that process_domains appended to `data` in the previous cell.
data

[['app.hubspot.com', 7],
 ['apps.facebook.com', 5],
 ['91club.club', 7],
 ['91club06.com', 7],
 ['ar.m.wikipedia.org', 2],
 ['9animetv.to', 6],
 ['aniwatchtv.to', 6],
 ['account.samsung.com', 3],
 ['as.com', 4],
 ['ameblo.jp', 7],
 ['articulo.mercadolibre.com.ar', 1],
 ['articulo.mercadolibre.com.mx', 1],
 ['auto.drom.ru', 1],
 ['bakusai.com', 7],
 ['baseball.yahoo.co.jp', 4],
 ['battwo.com', 7],
 ['bato.to', 7],
 ['bdvenlinea.banvenez.com', 7],
 ['auctions.yahoo.co.jp', 1],
 ['becasprogresar.educacion.gob.ar', 2],
 ['blog.livedoor.jp', 7],
 ['asuratoon.com', 7],
 ['alexisbima.com', 7],
 ['bpexch.com', 7],
 ['betinexchange.com', 7]]