## Install Libraries

In [12]:
# Do not run this cell if you are running the notebook on your local machine everytimne

!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
      ------------------------------------ 20.5/981.5 kB 330.3 kB/s eta 0:00:03
     - ----------------------------------- 41.0/981.5 kB 393.8 kB/s eta 0:00:03
     --- -------------------------------- 102.4/981.5 kB 737.3 kB/s eta 0:00:02
     ------------ ------------------------- 317.4/981.5 kB 1.8 MB/s eta 0:00:01
     ---------------------- --------------- 593.9/981.5 kB 2.7 MB/s eta 0:00:01
     -------------------------------------  962.6/981.5 kB 3.6 MB/s eta 0:00:01
     -------------------------------------- 981.5/981.5 kB 3.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for l

## Import Libraries

In [1]:
# import libraries for imitating GET request
import requests
import json
import time
import os
import sys
from bs4 import BeautifulSoup
import pandas as pd
import random

# import libraries for language detection
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# import libraries for web scraping
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

from tinydb import Query, TinyDB
from langcodes import standardize_tag

## Fetch Website and Detect Language

In [14]:
website = "https://www.naver.com/"


# Ensure consistent results from langdetect
DetectorFactory.seed = 0

def fetch_and_convert_website(url):
    try:
        # Fetch the website content
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        
        # Parse the website content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Get text content and remove extra whitespace
        text = soup.get_text(separator=' ', strip=True)
        
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the website: {e}")
        return None

def detect_language(text):
    try:
        language = detect(text)
        return language
    except LangDetectException as e:
        print(f"Error detecting language: {e}")
        return None


if __name__ == "__main__":
    # Example URL
    # url = 'https://www.example.com'
    
    # Fetch and convert the website to text
    text_content = fetch_and_convert_website(website)
    if text_content:
        # Print the text content
        print("Text content extracted from the website:")
        print(text_content)
        
        # Detect and print the language of the text content
        language = detect_language(text_content)
        if language:
            print(f"\nDetected language: {language}")

Text content extracted from the website:
NAVER 상단영역 바로가기 서비스 메뉴 바로가기 새소식 블록 바로가기 쇼핑 블록 바로가기 관심사 블록 바로가기 MY 영역 바로가기 위젯 보드 바로가기 보기 설정 바로가기 검색 검색 입력도구 자동완성/최근검색어펼치기 최근 검색어 전체삭제 검색어 저장 기능이 꺼져 있습니다. 설정이 초기화 된다면 도움말 을 확인해주세요. 최근 검색어 내역이 없습니다. 설정이 초기화 된다면 도움말 을 확인해주세요. 자동저장 끄기 도움말 닫기 CUE 대화하듯 질문해 보세요 이 정보가 표시된 이유 검색어와 포함된 키워드를 기반으로 AI 기술을 활용하여 연관된 추천 질문을 제공합니다. 레이어 닫기 이전 다음 자세히보기 관심사를 반영한 컨텍스트 자동완성 도움말 컨텍스트 자동완성 컨텍스트 자동완성 ON/OFF 설정은 해당기기(브라우저)에 저장됩니다. 자세히 보기 동일한 시간대・연령대・남녀별 사용자 그룹의 관심사에 맞춰 자동완성을 제공합니다. 자세히 보기 네이버 로그인 컨텍스트 자동완성 레이어 닫기 자동완성 끄기 도움말 신고 닫기

Detected language: ko


In [7]:
# rewrite selenium script which open similarweb.com and get top websites for korea-republic-of, health, childrens-health
# and save the response to a file


def get_top_websites_selenium(country, category, subcategory):
    # add user agent to headers
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }

    url = f"https://www.similarweb.com/api/gettopwebsites?country={country}&category={category}&subcategory={subcategory}"
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(5)
    response = driver.page_source
    # extract top websites from response
    # <html><head><meta name="color-scheme" content="light dark"><meta charset="utf-8"></head><body><pre>{"sites":[{"domain":"atelos.net","favicon":"https://site-images.similarcdn.com/image?url=atelos.net&amp;t=2&amp;s=1&amp;h=0c014068c9b4d14ef76a707c2eee40dfbad0c18e281d1a0d68f9bb7b87ea4a14","rankChange":0,"categoryId":"health/childrens_health","visitsAvgDurationFormatted":"00:00:26","pagesPerVisit":1.6413949523586921,"bounceRate":0.3758444316279532,"isBlackListed":false,"isNewRank":false},{"domain":"babycenter.com","favicon":"https://site-images.similarcdn.com/image?url=babycenter.com&amp;t=2&amp;s=1&amp;h=f3ad9c6f1997a429dd4e140a7c32b5f768d8b23e8a3c9aa8353f3d63af7a1b55","rankChange":0,"categoryId":"health/childrens_health","visitsAvgDurationFormatted":"00:01:15","pagesPerVisit":2.6761096536355957,"bounceRate":0.47973039156597036,"isBlackListed":false,"isNewRank":false},{"domain":"stanfordchildrens.org","favicon":"https://site-images.similarcdn.com/image?url=stanfordchildrens.org&amp;t=2&amp;s=1&amp;h=cf717fe028b2010f8ad94d01b45db82bce6f6b4887864668d200a311002a42d2","rankChange":6,"categoryId":"health/childrens_health","visitsAvgDurationFormatted":"00:02:17","pagesPerVisit":2.257756626916711,"bounceRate":0.7297613626673453,"isBlackListed":false,"isNewRank":false},{"domain":"parents.com","favicon":"https://site-images.similarcdn.com/image?url=parents.com&amp;t=2&amp;s=1&amp;h=098e7898a8b60d901990557e20c2bd7012f960a3eaece420c08f40e1ac3a602a","rankChange":2,"categoryId":"health/childrens_health","visitsAvgDurationFormatted":"00:01:12","pagesPerVisit":1.6463580425787216,"bounceRate":0.7314656129756678,"isBlackListed":false,"isNewRank":false},{"domain":"kidshealth.org","favicon":"https://site-images.similarcdn.com/image?url=kidshealth.org&amp;t=2&amp;s=1&amp;h=28a8bd381e5d8042a077325236b35e6c62d91026ce2b4a160ce4242c4113b329","rankChange":-2,"categoryId":"health/childrens_health","visitsAvgDurationFormatted":"00:01:03","pagesPerVisit":1.4349787300799237,"bounceRate":0.7982484320395493,"isBlackListed":false,"isNewRank":false}],"categoryId":"health/childrens_health","countryAlpha2Code":"KR","snapshotDate":"2024-05-01T00:00:00+00:00"}</pre><div class="json-formatter-container"></div></body></html>
    # the response looks like above

    response = response.split("<pre>")[1].split("</pre>")[0]
    print(response)

    # driver.close()
    with open("top_websites.html", "w") as f:
        f.write(response)

get_top_websites_selenium("korea-republic-of", "health", "childrens-health")


{
  "sites": [
    {
      "domain": "atelos.net",
      "favicon": "https://site-images.similarcdn.com/image?url=atelos.net&amp;t=2&amp;s=1&amp;h=0c014068c9b4d14ef76a707c2eee40dfbad0c18e281d1a0d68f9bb7b87ea4a14",
      "rankChange": 0,
      "categoryId": "health/childrens_health",
      "visitsAvgDurationFormatted": "00:00:26",
      "pagesPerVisit": 1.64139495235869,
      "bounceRate": 0.375844431627953,
      "isBlackListed": false,
      "isNewRank": false
    },
    {
      "domain": "babycenter.com",
      "favicon": "https://site-images.similarcdn.com/image?url=babycenter.com&amp;t=2&amp;s=1&amp;h=f3ad9c6f1997a429dd4e140a7c32b5f768d8b23e8a3c9aa8353f3d63af7a1b55",
      "rankChange": 0,
      "categoryId": "health/childrens_health",
      "visitsAvgDurationFormatted": "00:01:15",
      "pagesPerVisit": 2.6761096536356,
      "bounceRate": 0.47973039156597,
      "isBlackListed": false,
      "isNewRank": false
    },
    {
      "domain": "stanfordchildrens.org",
      "favicon

In [29]:
def get_top_website(country):
    url = "https://www.ahrefs.com/top/" + country
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    # look for tbody table
    tables = soup.find_all("tbody")


    top100 = tables[0]

    # create an empty dataframe with columns rank, url, traffic, increase_traffic
    # df_website = pd.DataFrame(columns=["rank", "url", "traffic", "increase_traffic"])
    # create a dictionary with keys rank, url, traffic, increase_traffic 
    list_website = []

    dict_website = {}

    for row in top100.find_all("tr"):
        cell_values = [cell.text for cell in row.find_all("td")]
        cell_values.pop(1)

        url = cell_values[1]
        rank = cell_values[0]
        traffic = cell_values[2]
        increase_traffic = cell_values[3]

        # add to dictionary
        dict_website["rank"] = rank
        dict_website["url"] = url
        dict_website["traffic"] = traffic
        dict_website["increase_traffic"] = increase_traffic

        # add to list
        list_website.append(dict_website)


        # df_website = df_website._append(pd.Series(cell_values, index=df_website.columns), ignore_index=True)
       
    return list_website

## Analyze DB of Websites by Languages

In [2]:
# Load the database
db = TinyDB('websites_by_language.json')
websites_table = db.table('websites')


# list all unique languages and their count of websites
languages = websites_table.all()
languages = [lang['language'] for lang in languages]
# print count of each language along with language
from collections import Counter
lang_count = Counter(languages)
print(lang_count)


Counter({'en': 63926, 'zh-cn': 16652, 'id': 8471, 'ru': 4302, 'es': 3924, 'de': 3500, 'ja': 3380, 'pt': 3346, 'ko': 2730, 'fr': 2618, 'vi': 1817, 'it': 1497, 'tr': 1457, 'nl': 1152, 'pl': 1105, 'ar': 1085, 'fa': 1067, 'th': 968, 'ro': 660, 'uk': 612, 'tl': 589, 'cs': 445, 'el': 384, 'sv': 383, 'hr': 356, 'no': 355, 'hi': 338, 'hu': 337, 'da': 336, 'fi': 271, 'et': 262, 'bg': 253, 'ca': 242, 'bn': 228, 'sk': 206, 'so': 171, 'he': 169, 'lt': 131, 'sl': 108, 'sw': 105, 'af': 95, 'mk': 67, 'lv': 62, 'ta': 52, 'cy': 51, 'mr': 51, 'sq': 50, 'te': 37, 'ml': 30, 'kn': 26, 'ne': 23, 'gu': 18, 'ur': 16, 'zh-tw': 15, 'pa': 2})


In [3]:
# list all unique languages on a new line and total number of unique languages
unique_languages = set(languages)
print(unique_languages)
print(f"Total number of unique languages: {len(unique_languages)}")


{'en', 'mk', 'pl', 'hu', 'hr', 'af', 'ne', 'pt', 'zh-tw', 'vi', 'mr', 'sl', 'he', 'ml', 'bg', 'lt', 'lv', 'zh-cn', 'fr', 'hi', 'bn', 'ja', 'ta', 'so', 'it', 'tr', 'da', 'no', 'sq', 'ur', 'el', 'th', 'cs', 'ca', 'fi', 'et', 'cy', 'pa', 'nl', 'fa', 'de', 'sk', 'ar', 'es', 'ro', 'gu', 'uk', 'tl', 'id', 'sw', 'sv', 'ko', 'te', 'kn', 'ru'}
Total number of unique languages: 55


In [27]:

# print count of all website for english, chinese, korean, japanese languages

# get all websites for english language
Website = Query()
websites = websites_table.search(Website.language == "en")
print(f"Total websites in English: {len(websites)}")

# get all websites for chinese language
websites = websites_table.search(Website.language == "zh-cn")
print(f"Total websites in Chinese(simplified): {len(websites)}")

# zh-tw
websites = websites_table.search(Website.language == "zh-tw")
print(f"Total websites in Chinese(traditional): {len(websites)}")


# get all websites for korean language
websites = websites_table.search(Website.language == "ko")
print(f"Total websites in Korean: {len(websites)}")

# get all websites for japanese language
websites = websites_table.search(Website.language == "ja")
print(f"Total websites in Japanese: {len(websites)}")




Total websites in English: 63926
Total websites in Chinese(simplified): 16652
Total websites in Chinese(traditional): 15
Total websites in Korean: 2730
Total websites in Japanese: 3380


### Get Random Websites from the Language

In [None]:
# generate a list of 25 random domains for eng, zh-cn, ko, ja languages
# get all websites for english language
Website = Query()
websites = websites_table.search(Website.language == "en")
# get 25 random websites
random_en_websites = random.sample(websites, 25)
print("Random websites in English:")
for website in random_en_websites:
    print(website['domain'])


In [18]:
# get all websites for chinese language
websites = websites_table.search(Website.language == "zh-cn")
# get 25 random websites
random_ZhCn_websites1 = random.sample(websites, 25)
print("Random websites in Chinese(simplified):")
for website in random_ZhCn_websites1:
    print(website['url'])

# get anohther 25 random websites for zh-cn language which is different from the previous 25 websites in random_ZhCn_websites1
random_ZhCn_websites2 = random.sample(websites, 25)
# check random_ZhCn_websites2 is different from random_ZhCn_websites1
for website in random_ZhCn_websites2:
    if website in random_ZhCn_websites1:
        print("Random websites in Chinese(simplified) are not unique")
        # replace the website with a new random website
        random_ZhCn_websites2.remove(website)
        random_ZhCn_websites2.append(random.choice(websites))

print("Random websites in Chinese(simplified):")
for website in random_ZhCn_websites2:
    print(website['url'])



Random websites in Chinese(simplified):
piggymates.com
xiningchina.com
qianbangjiaoyu.com
dnbbm.com
sanygroup.com
qxjf-art.com
xiamiaoyangzhi.com
cqkuaisu.com
cshuaqun.com
gongyeqg.com
18avx.com
chinabrx.com
rendaikuan.com
szyueshan.com
qdzhiruitong.com
freereceivesms.com
gggoodgame.com
hellobike.com
allstar-era.com
kxunchina.com
zgrtcm.com
yumerzx.com
leg1678.com
shenzhen-nanning.com
262196.cn
Random websites in Chinese(simplified):
fytlsm.com
hapclock.com
jtk100.com
iduduapp.com
sdbxqy.com
njchangxue.com
daimonchina.com
zhaoshimy.com
jutu360.com
wodessay.com
yataixuanhao.com
hztaiyi.com
sujienk.com
liusuliusu.com
cmdjdkj.com
hycmzc.com
ytshenhong.com
znote8899.com
jxlesong.com
jax-china.com
51haotou.com
xinglistqy.com
mtsbjy.com
ysu.edu.cn
tianqingshiyin.com


In [11]:
# get all websites for korean language
websites = websites_table.search(Website.language == "ko")
# get 25 random websites
random__ko_websites_1 = random.sample(websites, 25)
print("Random websites in Korean:")
for website in random__ko_websites_1:
    print(website['url'])

# get anohther 25 random websites for korean language which is different from the previous 25 websites in random__ko_websites_1
random__ko_websites_2 = random.sample(websites, 25)
# check random__ko_websites_2 is different from random__ko_websites_1
for website in random__ko_websites_2:
    if website in random__ko_websites_1:
        print("Random websites in Korean are not unique")
        # replace the website with a new random website
        random__ko_websites_2.remove(website)
        random__ko_websites_2.append(random.choice(websites))

print("Random websites in Korean:")
for website in random__ko_websites_2:
    print(website['url'])


Random websites in Korean:
21stcbc.org
lcd1004.co.kr
pvxywg.com
wtwt248.com
papalah.pw
jusoya10.com
netpro.co.kr
bestone-work.com
kassashair.com
chroscience.com
studypatent.com
ovotv.com
chosong.co.kr
xsmzjc.com
daehangreenpower.com
xingyueboke.com
womaneconomy.co.kr
keyixs.com
xn--939au0g3vw1iaq8a469c.kr
19878719.com
scshangting.com
rongbaodianmo.com
mgyqw.com
dabangapp.com
qiutianxia29.com
Random websites in Korean:
uqcjvpk.cn
mobilitytv.co.kr
whichav.video
jxcgyl.com
interpark.com
jiexunec.com
1234567.com.cn
neworbis.com
heywakeup.com.tw
wozai-travel.com
smdv.kr
ezalba.co.kr
haobofangshui.com
torrentsee217.com
ruantongzhi.com
cmuma.xyz
ttlock.com
jmdoor.com.tw
newtoki.help
sxwlz.com
sdmeixiusy.com
optisun.vip
clean-clean-peru.com
11toon112.com
aniweek.com


In [57]:
# give two more random korean websites
random__ko_websites_3 = random.sample(websites, 2)
print("Random websites in Korean:", random__ko_websites_3)

Random websites in Korean: [{'url': 'yxzwlkj.com', 'language': 'ko', 'timestamp': '2024-06-08T08:34:31.376923'}, {'url': 'jshaoou.com', 'language': 'ko', 'timestamp': '2024-06-08T07:06:15.695528'}]


In [40]:
# give 30 random websites from the database for korean, chinese and japanese languages
# get 30 random websites for korean language
websites = websites_table.search(Website.language == "ko")
random_websites = random.sample(websites, 30)
# just list urls
for website in random_websites:
    print(website['url'])





359198.com
toonkor326.com
dbcnews.co.kr
hbwocheng.com
whichav.video
pinksisly.com
chenzhongtech.com
bluezz.com.tw
gdzhukou.com
oplove16.com
yamoa3.site
zhongfa1688.com
douyuanxiuhe.com
yp.com.hk
imendon.com
fxfx217.com
htwhbook.com
yedam.com
homeplus.co.kr
newhua99.xyz
88p2p.com
shyuwangfangshui.com
fenghemp.com
daoom.co.kr
nfqlife.com
evolutionplaynow.com
limeitianhe.com
yebigun1.mil.kr
anpservice.net
trfsgs.com


In [17]:
# get 25 random websites for japanese language
websites = websites_table.search(Website.language == "ja")
random_ja_websites = random.sample(websites, 25)
# just list urls
for website in random_ja_websites:
    print(website['url'])

# get 25 more random japanese websites
random_ja_websites_2 = random.sample(websites, 25)
# check random_ja_websites_2 is different from random_ja_websites
for website in random_ja_websites_2:
    if website in random_ja_websites:
        # print("Random websites in Japanese are not unique")
        # replace the website with a new random website
        random_ja_websites_2.remove(website)
        random_ja_websites_2.append(random.choice(websites))

print("Random websites in Japanese:")
for website in random_ja_websites_2:
    print(website['url'])



nobori-mart.net
keirin-mobile.jp
sesto.jp
eigeki.com
valor-luvitapp.com
kuzen.io
enechange.co.jp
hanako.tokyo
saiyasune.com
gogojungle.co.jp
madamefigaro.jp
reil.co.jp
koichidomoto-fc.net
ganbalegends.com
jal.co.jp
dengekionline.com
bribaby.jp
hankyu.co.jp
yutasan.co
laxd.com
karakubuy.com
hellouniweb.com
android4front.jp
nichiga.net
cardrush-pokemon.jp
Random websites in Japanese:
domonet.jp
sabory-blog.com
tsurisuke.com
hero-news.com
halmek.co.jp
monotaro.com
homes.co.jp
ai-eye.jp
pushcode.jp
tyuemon.com
ifdef.jp
mangakoma.net
sangacio.com
jobop.jp
rere.jp
thp-shop.co.jp
brandnavi-online.com
ranking.net
edesk.jp
kimuratan.jp
xn--pckua2a7gp15o89zb.com
kawashima-ya.jp
nippon-foundation.or.jp
boxingnews.jp
axa-direct.co.jp


In [60]:
# generate 2 more random japanese websites
random_ja_websites_3 = random.sample(websites, 2)
print("Random websites in Japanese:", random_ja_websites_3)


Random websites in Japanese: [{'url': 'comiful.net', 'language': 'ja', 'timestamp': '2024-06-08T09:51:24.302366'}, {'url': 'hentaiasmr.moe', 'language': 'ja', 'timestamp': '2024-06-08T02:31:00.175990'}]


## Crawl Privacy Policy

In [18]:
# List of user agents
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', 
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    # Add more user agents as needed
]

def get_random_user_agent():
    return random.choice(USER_AGENTS)

# get free proxies from online
def get_proxies():
    url = "https://free-proxy-list.net/"
    # fetch proxy list from online
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    proxy_table = soup.find("table", attrs={"class": "table table-striped table-bordered"})


    # print(proxy_table)

    # # Extract proxy IPs and ports
    proxies = []
    proxy_table = proxy_table.find("tbody")
    # print(proxy_table)
    for row in proxy_table.find_all("tr"):
        proxies.append({
        "ip":   row.find_all("td")[0].string,
        "port": row.find_all("td")[1].string
        })
        # print(row)
    return proxies

In [79]:
urls = [
    'https://www.naver.com/',  # Replace with your URLs
    'https://www.997788.com/',	
]


visited_links = set()
all_links = []

def get_random_user_agent():
    user_agents = [
        # Add a list of user agents here
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'
        # Add more user agents if needed
    ]
    return random.choice(user_agents)


def extract_links(url):
    headers = {
        'User-Agent': get_random_user_agent()
    }
    # proxies = get_proxies()
    # proxy = random.choice(proxies)
    # print(f"Using proxy: {proxy}")
    
    try:
        response = requests.get(url, timeout=5, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        links = []
        
        for link in soup.find_all("a", href=True):
            full_link = link.get("href")
            if full_link and full_link.startswith("http"):
                links.append(full_link)
        return links
    
    except requests.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return []

def recursive_extract(url, original_domain):
    if url not in visited_links:
        visited_links.add(url)
        if original_domain in url:
            print(f"Extracting links from: {url}")
            links = extract_links(url)
            for link in links:
                if link not in visited_links:
                    # if original_domain in link:
                    all_links.append(link)
                    recursive_extract(link, original_domain)
                    time.sleep(1)  # Sleep to avoid overwhelming the server

if __name__ == "__main__":
    start_url = "https://www.naver.com/"  # Replace with the starting URL
    original_domain = "naver.com"
    recursive_extract(start_url, original_domain)
    
    print("All extracted links:")
    for link in all_links:
        print(link)


Extracting links from: https://www.naver.com/
Extracting links from: https://help.naver.com/alias/search/word/word_35.naver
Extracting links from: https://policy.naver.com/rules/youthpolicy.html
Extracting links from: http://www.naver.com/
Extracting links from: https://help.naver.com/alias/search/word/word_16.naver
Extracting links from: https://help.naver.com/support/alias/search/word/word_16.naver
Extracting links from: https://nid.naver.com/nidlogin.login
Extracting links from: https://www.naver.com
Extracting links from: https://help.naver.com/alias/search/word/word_17.naver
Extracting links from: https://help.naver.com/alias/search/word/word_18.naver
Extracting links from: https://nid.naver.com/user2/api/route?m=routePwInquiry&lang=ko_KR
Extracting links from: https://help.naver.com/support/alias/membership/p.membership/p.membership_26.naver
Extracting links from: https://nid.naver.com/user2/api/route?m=routeIdInquiry&lang=ko_KR
Extracting links from: https://nid.naver.com/user2/

In [None]:
import requests
from bs4 import BeautifulSoup
import random
import time
from urllib.parse import urlparse

visited_links = set()
all_links = []

def get_random_user_agent():
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'
        # Add more user agents if needed
    ]
    return random.choice(user_agents)

def extract_links(url):
    headers = {
        'User-Agent': get_random_user_agent()
    }
    
    try:
        response = requests.get(url, timeout=5, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        links = []
        
        for link in soup.find_all("a", href=True):
            full_link = link.get("href")
            if full_link and full_link.startswith("http"):
                links.append(full_link)
        return links
    
    except requests.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return []

def recursive_extract(url, original_domain):
    if url not in visited_links:
        visited_links.add(url)
        print(f"Extracting links from: {url}")
        links = extract_links(url)
        for link in links:
            parsed_link = urlparse(link)
            if link not in visited_links:
                # check if original domain is in the link
                if original_domain in parsed_link.netloc:
                    all_links.append(link)
                    recursive_extract(link, original_domain)
                    time.sleep(1)  # Sleep to avoid overwhelming the server

if __name__ == "__main__":
    start_url = "https://www.naver.com/"  # Replace with the starting URL
    parsed_start_url = urlparse(start_url)
    original_domain = parsed_start_url.netloc
    
    print(f"Original domain: {original_domain}, Start URL: {start_url}")
    recursive_extract(start_url, original_domain)
    
    print("All extracted links:")
    for link in all_links:
        print(link)


## Analyzing Extracted Links

In [81]:
# from all_links list, get all urls which might be related to privacy policy, terms of service, policy, cookies, etc
# extract all urls which might be related to privacy policy, terms of service, policy, cookies, etc
# from the list of all links extracted from the website

search_terms = ["privacy", "policy", "terms", "cookies", "gdpr", "data", "protection", "security" "legal", "agreement"]
privacy_links = []
for link in all_links:
    for term in search_terms:
        if term in link:
            privacy_links.append(link)
            break

print("Privacy-related links:")
for link in privacy_links:
    print(link) 

Privacy-related links:
https://policy.naver.com/rules/youthpolicy.html
https://policy.naver.com/policy/service.html
http://policy.naver.com/policy/privacy.html
https://privacy.naver.com/policy_and_law/easy_version?menu=policy_personal_information_easyVersion
http://policy.naver.com/rules/privacy.html
https://privacy.naver.com/policy_and_law/infographic?menu=policy_personal_information_infographic
http://policy.naver.com/rules/service_location.html
https://policy.naver.com/policy/popup/privacy_agreement.html
https://policy.naver.com/policy/privacy.html
http://www.oecd.org/sti/ieconomy/oecdguidelinesontheprotectionofprivacyandtransborderflowsofpersonaldata.htm
https://privacy.naver.com/privacyinfo
http://www.naver.com/rules/privacy.html
https://privacy.naver.com/transparency/related_act?menu=transparency_report_understand_related_act
https://www.naver.com/policy/service.html
https://www.naver.com/policy/privacy.html
https://privacy.naver.com/protection_activity/naver_personal_information

Original domain: www.naver.com, Start URL: https://www.naver.com/
Extracting links from: https://www.naver.com/
All extracted links:


In [1]:
#  read policy links from websites_by_language.json file in the policy_links table
# Load the database
db = TinyDB('websites_by_language.json')
policy_links_table = db.table('policy_links')
policy_links = policy_links_table.all()
print(len(policy_links))
privacy_domains = {}
for domain in policy_links:
    print(domain['domain'], len(domain['all_links']) )
    # check if there is any privacy policy link in the all_links
    privacy_links = []
    for link in domain['all_links']:
        if "privacy" in link or "policy" in link or "terms" in link or "cookies" in link or "gdpr" in link or "security" in link or "legal" in link or "agreement" in link:
            privacy_links.append(link)
            privacy_domains[domain['domain']] = privacy_links
            # print(link)
            # break
    print(len(privacy_links))

NameError: name 'TinyDB' is not defined

In [23]:
print(len(privacy_domains))

# give number of privacy related links for each domain
for domain, links in privacy_domains.items():
    print(domain, len(links))

1304
mib19.co.kr 2
indischool.com 4
dragonest.com 1
kurogames.com 3
zocbo.com 1
ttu.edu.tw 2
realclick.co.kr 6
726786.com 6
ymc616.com 6
univ100.kr 10
jscjx.cn 8
octopus.com.hk 11
ny10086.com 13
bestone-work.com 10
lcd1004.co.kr 5
gnjoy.com.tw 19
jointapply.com 41
mywhh.com 7
11toon96.com 42
ytn.co.kr 100
hnzxscp.com 15
dgxinshun168.com 56
dak.gg 44
gwfence.co.kr 26
toosadfun.com 20
mangoboard.net 59
clean-clean-peru.com 1
manatoki339.net 9
schoolbell-e.com 3
mlxtrip.com 10
instiz.net 9
cypher-dark-market.com 7
xstxt.cc 1
cafe24.co.kr 6
lz3305.com 3
ahjundun.com 6
sytianxia.com 2
peanutoon.com 3
thekpm.com 4
lscmlt.com 7
xiuwushidai.com 7
x1kzhi.com 7
urovo.com 7
businessweekly.com.tw 7
linqihuoyuan.com 7
peoplenjob.com 7
aii.life 7
fxxz.com 7
solarbe.com 7
metakr.co.kr 7
poni25.net 7
avsubs.co.kr 7
jnu.ac.kr 10
bingfeng.tw 11
gxrgwl.com 11
youboy.com 11
univstore.com 11
jc001.cn 11
itpison.com 2
tkr314.com 5
wink.co.kr 5
shouxiuwang.com 1
shwenguo.com 1
nikemania.com 3
suzhouweddingsh

In [None]:
len

In [25]:
db = TinyDB('websites_by_language.json')
websites_table = db.table('websites')
Website = Query()

korean_websites = websites_table.search(Website.language == "ko")

all_korean_websites = [website['url'] for website in korean_websites]
print(f"Total Korean websites: {len(set(all_korean_websites))}")

# websites_to_process = korean_websites[:100]
# websites_to_process = [website['url'] for website in websites_to_process]
# print(websites_to_process)

policy_links_table = db.table('policy_links')
existing_policy_links = policy_links_table.all()
# only korean domains which has policy_link['country'] == "Korea"
existing_policy_links = [policy_link['domain'] for policy_link in existing_policy_links if policy_link['country'] == "Korea"]
# existing_policy_links = [policy_link['domain'] for policy_link in existing_policy_links]
# existing_policy_links = [policy_link['domain'] for policy_link in existing_policy_links]

print(f"Number of Korean domains in policy links table: {len(set(existing_policy_links))}, percentage: {len(set(existing_policy_links))/len(set(all_korean_websites))}")


remaining_websites = []

for each_website in all_korean_websites:
    if each_website not in existing_policy_links:
        remaining_websites.append(each_website)
print(len(remaining_websites))


Total Korean websites: 2730
Number of Korean domains in policy links table: 2300, percentage: 0.8424908424908425
430
