In [None]:
!pip install nest-asyncio cloudscraper requests beautifulsoup4 lxml chardet \
pandas numpy openpyxl xlrd \
gspread gspread-dataframe oauth2client \
openai \
selenium webdriver-manager \
psutil \
tqdm \
python-dotenv




[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\mobidays\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
# "{}" 안의 값은 변경하여 사용
# 크롤링 프로그램
import nest_asyncio
nest_asyncio.apply()
import asyncio
import time
import random
import threading
from urllib.parse import urlparse

import cloudscraper
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd
import gspread
from gspread_dataframe import set_with_dataframe
import re

import requests
from requests.adapters import HTTPAdapter
from urllib3 import PoolManager
import ssl
import chardet
import random

import warnings
from urllib3.exceptions import InsecureRequestWarning

# =========================
# 날짜
# =========================
# Target date for the crawl: one day before "now" (naive local time),
# rendered as YYYY-MM-DD for string comparison against article dates.
yesterday = datetime.now() - timedelta(days=1)
yesterday_str = f"{yesterday:%Y-%m-%d}"

# =========================
# 공통
# =========================
# Pool of desktop User-Agent strings. One entry is drawn once, when this
# cell runs, and stored in the shared ``headers`` dict.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)",
]

headers = {}
headers["User-Agent"] = random.choice(user_agents)

def create_scraper():
    """Return a new cloudscraper session configured with a desktop Firefox/Windows browser profile."""
    browser_profile = dict(browser="firefox", platform="windows", mobile=False)
    return cloudscraper.create_scraper(browser=browser_profile)

_domain_locks = {}
_domain_locks_lock = threading.Lock()

def _get_domain_lock(domain: str):
    with _domain_locks_lock:
        if domain not in _domain_locks:
            _domain_locks[domain] = threading.Semaphore(1)
        return _domain_locks[domain]

def _jitter_sleep(a=0.05, b=0.2):
    time.sleep(random.uniform(a, b))

def robust_get(scraper, url, headers, retries=2, backoff=0.6, min_len=300, timeout=10):
    """GET *url* through *scraper* with per-domain serialization and retries.

    Args:
        scraper: Object exposing ``get(url, headers=..., timeout=...)``.
        url: Address to fetch; its network location selects the domain lock.
        headers: Request headers passed through to ``scraper.get``.
        retries: Total number of attempts.
        backoff: Base delay in seconds; the wait before the next attempt is
            ``backoff * 2**attempt`` plus up to 0.2 s of random jitter.
        min_len: Minimum length of ``response.text`` for a response to count
            as usable.
        timeout: Timeout in seconds passed to ``scraper.get``.

    Returns:
        The first response whose status is 200/201 and whose text has at
        least *min_len* characters, or ``None`` when no attempt produced one
        and no attempt raised.

    Raises:
        Exception: The most recent exception raised by an attempt, if every
            attempt failed to return a usable response.
    """
    last_exc = None
    lock = _get_domain_lock(urlparse(url).netloc)
    for attempt in range(retries):
        try:
            # Only one request per domain at a time, preceded by a short jitter.
            with lock:
                _jitter_sleep()
                r = scraper.get(url, headers=headers, timeout=timeout)
            if r is not None and r.status_code in (200, 201) and len(r.text) >= min_len:
                return r
        except Exception as e:
            last_exc = e
        # Wait longer before each successive retry (the previous
        # ``backoff ** i`` shrank the delay for backoff < 1), and do not
        # sleep after the final attempt since nothing follows it.
        if attempt < retries - 1:
            time.sleep(backoff * (2 ** attempt) + random.uniform(0, 0.2))
    if last_exc:
        raise last_exc
    return None

def should_warn(meta: dict) -> bool:
    """Decide whether a crawl's counters call for a warning.

    Returns ``True`` when any of ``count_title``, ``count_link`` or
    ``count_date`` is missing or equal to 0, or when ``matched`` is positive
    while ``count_content`` is 0. Otherwise returns ``False``.
    """
    for field in ("count_title", "count_link", "count_date"):
        if meta.get(field, 0) == 0:
            return True

    matched_count = meta.get("matched", 0)
    content_count = meta.get("count_content", 0)
    return bool(matched_count > 0 and content_count == 0)

def make_warning_row(meta: dict, date_str: str) -> pd.DataFrame:
    """Build a one-row DataFrame summarizing a crawl's counters as a warning entry.

    The row uses the same five columns as the crawler result rows. Missing
    counters are shown as 0, a missing site as "UNKNOWN" and a missing URL
    as an empty string.
    """
    n_title = meta.get('count_title', 0)
    n_link = meta.get('count_link', 0)
    n_date = meta.get('count_date', 0)
    n_content = meta.get('count_content', 0)
    n_matched = meta.get('matched', 0)

    summary = (
        f"제목:{n_title} "
        f"링크:{n_link} "
        f"날짜:{n_date} "
        f"본문:{n_content} "
        f"매칭:{n_matched} → 구조 점검 필요"
    )
    record = {
        "매체명": meta.get("site", "UNKNOWN"),
        "제목": "⚠ 클래스 변경 가능성",
        "날짜": date_str,
        "내용": summary,
        "링크": meta.get("url", ""),
    }
    return pd.DataFrame([record])


class TLSAdapter(HTTPAdapter):
    """HTTPAdapter whose pool manager is given a default SSL context limited to the 'HIGH:!DH:!aNULL' cipher string."""

    def init_poolmanager(self, *args, **kwargs):
        """Inject the restricted SSL context, then defer to the parent implementation."""
        ctx = ssl.create_default_context()
        ctx.set_ciphers('HIGH:!DH:!aNULL')
        kwargs.update(ssl_context=ctx)
        return super().init_poolmanager(*args, **kwargs)

# =========================
# 크롤러
# =========================

def crawl_marketingmag_sync():
    """Scrape the Marketing Mag front page for articles dated yesterday.

    Each ``div.tt-post-info`` card supplies title, link and date. Only cards
    whose date equals ``yesterday_str`` have their article page fetched and
    their body text extracted.

    Returns:
        tuple: ``(rows_df, meta)``. ``rows_df`` holds one row per article
        whose body text was found. ``meta`` carries the site name, URL,
        per-field hit counters, the matched count and a list of error
        strings. If anything outside the per-article handling raises, an
        empty DataFrame and a zeroed ``meta`` with the failure message are
        returned instead.
    """
    site, url = "Marketing Mag", "https://www.marketingmag.com.au/"
    scraper, rows, errors = create_scraper(), [], []
    count_title = count_link = count_date = count_content = matched = 0

    try:
        soup = BeautifulSoup(robust_get(scraper, url, headers).text, "html.parser")
        for art in soup.find_all("div", class_="tt-post-info"):
            try:
                title_tag = art.find("a", class_="tt-post-title c-h4")
                title = title_tag.get_text(strip=True) if title_tag else None
                # Guard the href lookup: a card without a title anchor used to
                # raise AttributeError here instead of simply being skipped.
                link = title_tag.get("href") if title_tag else None
                if not link:
                    continue

                date_tag = art.find("span", class_="tt-post-date")
                date_raw = date_tag.get_text(strip=True) if date_tag else None
                # An unparseable date raises and is recorded as an article error.
                date = datetime.strptime(date_raw, "%B %d, %Y").strftime("%Y-%m-%d") if date_raw else None

                if title: count_title += 1
                if link: count_link += 1
                if date: count_date += 1

                if date == yesterday_str:
                    matched += 1
                    inner = BeautifulSoup(robust_get(scraper, link, headers).text, "html.parser")
                    content_div = inner.select_one("div.simple-text.size-4.tt-content.title-droid.margin-big")
                    texts = [p.get_text(" ", strip=True) for p in content_div.find_all(["p", "h3", "blockquote"])] if content_div else []
                    if texts:
                        count_content += 1
                        rows.append({"매체명": site, "제목": title, "날짜": date, "내용": "\n".join(texts), "링크": link})
            except Exception as e:
                errors.append(f"개별 기사 오류: {e}")

        meta = {"site": site, "url": url, "count_title": count_title, "count_link": count_link,
                "count_date": count_date, "count_content": count_content, "matched": matched, "errors": errors}
        return pd.DataFrame(rows), meta

    except Exception as e:
        return pd.DataFrame(), {"site": site, "url": url, "count_title": 0, "count_link": 0,
                                "count_date": 0, "count_content": 0, "matched": 0, "errors": [f"메인 로드 실패: {e}"]}

def crawl_marketingbeat_sync():
    """Scrape the Marketing Beat front page for articles dated yesterday.

    Walks every ``li.mb-latest-articles-box`` entry, reads title, link and
    date from the listing, and fetches the article page only when the date
    equals ``yesterday_str``.

    Returns:
        tuple: ``(rows_df, meta)`` with the extracted rows and the counters /
        error list for the run. On a failure outside the per-article
        handling, an empty DataFrame and a zeroed ``meta`` holding the
        failure message.
    """
    site = "Marketing Beat"
    url = "https://www.marketing-beat.co.uk/"
    scraper = create_scraper()
    rows, errors = [], []
    count_title = count_link = count_date = count_content = matched = 0

    try:
        front_page = robust_get(scraper, url, headers)
        soup = BeautifulSoup(front_page.text, "html.parser")
        for art in soup.find_all("li", class_="mb-latest-articles-box"):
            try:
                anchor = art.find("a", class_="mb-articles-content-link")
                if not anchor:
                    continue

                heading = anchor.find("h3")
                title = heading.get_text(strip=True) if heading else None
                link = anchor.get("href")

                date_box = anchor.find("div", class_="mb-articles-author-date")
                date_span = date_box.find("span") if date_box else None
                date_raw = date_span.get_text(strip=True) if date_span else None

                date = None
                if date_raw:
                    # Keep only the text before the first "x", then parse it as DD/MM/YYYY;
                    # if parsing fails the raw text is kept as the date.
                    day_part = date_raw.split("x")[0].strip()
                    try:
                        date = datetime.strptime(day_part, "%d/%m/%Y").strftime("%Y-%m-%d")
                    except Exception:
                        date = date_raw

                count_title += 1 if title else 0
                count_link += 1 if link else 0
                count_date += 1 if date else 0

                if date != yesterday_str:
                    continue

                matched += 1
                article_page = robust_get(scraper, link, headers)
                inner = BeautifulSoup(article_page.text, "html.parser")
                content_div = inner.select_one("div.mb-post-content")
                texts = []
                if content_div:
                    for node in content_div.find_all(["p", "h2", "h3", "li"]):
                        texts.append(node.get_text(" ", strip=True))
                if texts:
                    count_content += 1
                    rows.append({"매체명": site, "제목": title, "날짜": date, "내용": "\n".join(texts), "링크": link})
            except Exception as e:
                errors.append(f"개별 기사 오류: {e}")

        meta = dict(site=site, url=url, count_title=count_title, count_link=count_link,
                    count_date=count_date, count_content=count_content, matched=matched, errors=errors)
        return pd.DataFrame(rows), meta

    except Exception as e:
        failure = dict(site=site, url=url, count_title=0, count_link=0,
                       count_date=0, count_content=0, matched=0, errors=[f"메인 로드 실패: {e}"])
        return pd.DataFrame(), failure

def crawl_searchengine_sync():
    """Scrape the Search Engine Land front page for articles dated yesterday.

    Reads title, link and byline date from every ``article.stream-article``
    and fetches the article page only when the date equals ``yesterday_str``.

    Returns:
        tuple: ``(rows_df, meta)`` with the extracted rows and the counters /
        error list for the run. On a failure outside the per-article
        handling, an empty DataFrame and a zeroed ``meta`` holding the
        failure message.
    """
    site = "Search Engine Land"
    url = "https://searchengineland.com/"
    scraper = create_scraper()
    rows, errors = [], []
    count_title = count_link = count_date = count_content = matched = 0

    try:
        front_page = robust_get(scraper, url, headers)
        soup = BeautifulSoup(front_page.text, "html.parser")
        for art in soup.select("article.stream-article"):
            try:
                anchor = art.select_one("h2.headline a")
                if not anchor:
                    continue
                title = anchor.get_text(strip=True)
                link = anchor.get("href")

                byline = art.select_one(".author-time .byline")
                if byline:
                    # Take the text after the last "|" and cut it at the first "at".
                    after_pipe = byline.get_text(strip=True).split("|")[-1]
                    date_raw = after_pipe.split("at")[0].strip()
                else:
                    date_raw = None

                date = None
                if date_raw:
                    try:
                        date = datetime.strptime(date_raw, "%b %d, %Y").strftime("%Y-%m-%d")
                    except Exception:
                        # Unparseable text is kept as-is so it still counts as a date hit.
                        date = date_raw

                count_title += 1 if title else 0
                count_link += 1 if link else 0
                count_date += 1 if date else 0

                if date != yesterday_str:
                    continue

                matched += 1
                article_page = robust_get(scraper, link, headers)
                inner = BeautifulSoup(article_page.text, "html.parser")
                content_div = inner.find("div", {"id": "articleContent"})
                texts = []
                if content_div:
                    for node in content_div.find_all(["p", "h2", "h3", "li"]):
                        texts.append(node.get_text(" ", strip=True))
                if texts:
                    count_content += 1
                    rows.append({"매체명": site, "제목": title, "날짜": date, "내용": "\n".join(texts), "링크": link})
            except Exception as e:
                errors.append(f"개별 기사 오류: {e}")

        meta = dict(site=site, url=url, count_title=count_title, count_link=count_link,
                    count_date=count_date, count_content=count_content, matched=matched, errors=errors)
        return pd.DataFrame(rows), meta

    except Exception as e:
        failure = dict(site=site, url=url, count_title=0, count_link=0,
                       count_date=0, count_content=0, matched=0, errors=[f"메인 로드 실패: {e}"])
        return pd.DataFrame(), failure

def crawl_adweek_sync():
    """Scrape the ADWEEK brand-marketing section for articles dated yesterday.

    The listing carries no date, so every candidate article page is fetched
    and its ``time.custom-publish-time`` element read. Candidates are the
    hero anchor plus every ``h2``/``h3`` carrying one of the headline classes.

    Returns:
        tuple: ``(rows_df, meta)`` with the extracted rows and the counters /
        error list for the run. On a failure outside the per-article
        handling, an empty DataFrame and a zeroed ``meta`` holding the
        failure message.
    """
    site, url = "ADWEEK", "https://www.adweek.com/brand-marketing/"
    scraper, rows, errors = create_scraper(), [], []
    count_title = count_link = count_date = count_content = matched = 0

    try:
        soup = BeautifulSoup(robust_get(scraper, url, headers).text, "html.parser")
        targets = []
        # Title/link markup comes in two variants (hero block and headline
        # tags), so both are collected.
        hero = soup.find("div", class_="hero__text")
        hero_a = hero.find("a") if hero else None
        if hero_a:
            targets.append(hero_a)
        targets.extend(soup.find_all(["h2", "h3"], class_=["section__headline", "font-heading", "fw-bold"]))

        # The hero and headline selectors can yield the same article; each
        # link is fetched once so it is neither downloaded nor recorded twice.
        seen_links = set()

        for art in targets:
            try:
                date = None
                a_tag = art if art.name == "a" else art.find("a")
                if not a_tag or not a_tag.get("href"):
                    continue
                title, link = a_tag.get_text(strip=True), a_tag["href"]
                if link in seen_links:
                    continue
                seen_links.add(link)

                inner = BeautifulSoup(robust_get(scraper, link, headers).text, "html.parser")
                date_tag = inner.find("time", class_="custom-publish-time")
                if date_tag and "datetime" in date_tag.attrs:
                    # Keep the part of the attribute before the literal "UTC".
                    date = date_tag["datetime"].split("UTC")[0].strip()

                if title: count_title += 1
                if link: count_link += 1
                if date: count_date += 1

                if date == yesterday_str:
                    matched += 1
                    paragraphs = inner.select("div.aw-article-content p")
                    texts = [p.get_text(" ", strip=True) for p in paragraphs]
                    if texts:
                        count_content += 1
                        rows.append({"매체명": site, "제목": title, "날짜": date, "내용": "\n".join(texts), "링크": link})
            except Exception as e:
                errors.append(f"개별 기사 오류: {e}")

        meta = {"site": site, "url": url, "count_title": count_title, "count_link": count_link,
                "count_date": count_date, "count_content": count_content, "matched": matched, "errors": errors}
        return pd.DataFrame(rows), meta

    except Exception as e:
        return pd.DataFrame(), {"site": site, "url": url, "count_title": 0, "count_link": 0,
                                "count_date": 0, "count_content": 0, "matched": 0, "errors": [f"메인 로드 실패: {e}"]}

def crawl_marketingnews_sync():
    """Scrape the Marketingnews front page for articles dated yesterday.

    Every ``a.title`` anchor is followed; the publication date is read from
    the article page and the body text is extracted only when that date
    equals ``yesterday_str``.

    Returns:
        tuple: ``(rows_df, meta)`` with the extracted rows and the counters /
        error list for the run. On a failure outside the per-article
        handling, an empty DataFrame and a zeroed ``meta`` holding the
        failure message.
    """
    site = "Marketingnews"
    url = "https://www.marketingnews.es/"
    scraper = create_scraper()
    rows, errors = [], []
    count_title = count_link = count_date = count_content = matched = 0
    date_formats = ("%d/%m/%Y", "%Y-%m-%d", "%d-%m-%Y")

    try:
        front_page = robust_get(scraper, url, headers)
        soup = BeautifulSoup(front_page.text, "html.parser")
        anchors = soup.find_all("a", class_="title") or []
        for art in anchors:
            try:
                date = None

                # Prefer a nested heading/span/font for the title text,
                # otherwise fall back to the anchor's own text.
                title_tag = art.find("h2") or art.find("span") or art.find("font")
                if title_tag:
                    title = title_tag.get_text(strip=True)
                else:
                    title = art.get_text(strip=True)

                href = art.get("href", "")
                if href.startswith("/"):
                    link = f"{url.rstrip('/')}{href}"
                else:
                    link = href

                inner_res = robust_get(scraper, link, headers)
                if not inner_res:
                    continue
                inner = BeautifulSoup(inner_res.text, "html.parser")

                date_tag = inner.select_one("header#headDetail div.date time")
                if date_tag:
                    date_raw = date_tag.get_text(strip=True)
                    # The first format that parses wins; if none does, date stays None.
                    for fmt in date_formats:
                        try:
                            date = datetime.strptime(date_raw, fmt).strftime("%Y-%m-%d")
                        except ValueError:
                            continue
                        break

                count_title += 1 if title else 0
                count_link += 1 if link else 0
                count_date += 1 if date else 0

                if date != yesterday_str:
                    continue

                matched += 1
                content_div = inner.select_one("main#bodyDetail")
                if content_div:
                    texts = []
                    for tag in content_div.find_all(["p", "h2", "blockquote"]):
                        if tag.get_text(strip=True):
                            texts.append(tag.get_text(strip=True, separator=" "))
                    if texts:
                        count_content += 1
                        rows.append({"매체명": site, "제목": title, "날짜": date, "내용": "\n\n".join(texts), "링크": link})
            except Exception as e:
                errors.append(f"개별 기사 오류: {e}")

        meta = dict(site=site, url=url, count_title=count_title, count_link=count_link,
                    count_date=count_date, count_content=count_content, matched=matched, errors=errors)
        return pd.DataFrame(rows), meta

    except Exception as e:
        failure = dict(site=site, url=url, count_title=0, count_link=0,
                       count_date=0, count_content=0, matched=0, errors=[f"메인 로드 실패: {e}"])
        return pd.DataFrame(), failure

def crawl_marketingtech_sync():
    """Scrape the MarketingTechNews front page for articles dated yesterday.

    Uses a plain ``requests`` session mounted with ``TLSAdapter`` and its own
    browser-like headers instead of the shared scraper. Title, link and date
    come from each ``section.entry-content`` in the listing; the article
    page is fetched only when the date equals ``yesterday_str``.

    Returns:
        tuple: ``(rows_df, meta)`` with the extracted rows and the counters /
        error list for the run. On a failure outside the per-article
        handling (including a non-200 front page), an empty DataFrame and a
        zeroed ``meta`` holding the failure message.
    """
    site, url = "MarketingTechNews", "https://www.marketingtechnews.net/"
    rows, errors = [], []
    count_title = count_link = count_date = count_content = matched = 0

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/"
    }

    try:
        # The context manager closes the session (and its pooled connections)
        # on every exit path; it was previously never closed.
        with requests.Session() as session:
            session.mount('https://', TLSAdapter())

            res = session.get(url, headers=headers, timeout=10)
            if res.status_code != 200:
                raise Exception(f"상태 코드 오류: {res.status_code}")
            soup = BeautifulSoup(res.text, "html.parser")

            for art in soup.find_all("section", class_="entry-content"):
                try:
                    date = None
                    h3_tag = art.find("h3")  # looked up once instead of twice
                    a_tag = h3_tag.find("a") if h3_tag else None
                    if not a_tag:
                        continue
                    title, link = a_tag.get_text(strip=True), a_tag.get("href")

                    date_tag = art.select_one(".byline-content-holder .content")
                    date_raw = date_tag.get_text(strip=True).split("|")[0].strip() if date_tag else None
                    if date_raw:
                        try:
                            date = datetime.strptime(date_raw, "%d %B %Y").strftime("%Y-%m-%d")
                        except Exception:
                            # Unparseable text is kept as-is so it still counts as a date hit.
                            date = date_raw

                    if title: count_title += 1
                    if link: count_link += 1
                    if date: count_date += 1

                    if date == yesterday_str:
                        matched += 1
                        inner_res = session.get(link, headers=headers, timeout=10)
                        inner = BeautifulSoup(inner_res.text, "html.parser")
                        content_div = inner.select_one("section.entry-content")
                        if content_div:
                            texts = [t.get_text(" ", strip=True) for t in content_div.find_all(["p", "blockquote"]) if t.get_text(strip=True)]
                            if texts:
                                count_content += 1
                                rows.append({"매체명": site, "제목": title, "날짜": date, "내용": "\n\n".join(texts), "링크": link})
                except Exception as e:
                    errors.append(f"개별 기사 오류: {e}")

            meta = {"site": site, "url": url, "count_title": count_title, "count_link": count_link,
                    "count_date": count_date, "count_content": count_content, "matched": matched, "errors": errors}
            return pd.DataFrame(rows), meta

    except Exception as e:
        return pd.DataFrame(), {"site": site, "url": url, "count_title": 0, "count_link": 0,
                                "count_date": 0, "count_content": 0, "matched": 0, "errors": [f"메인 로드 실패: {e}"]}

def crawl_euronews_sync():
    """Scrape the Euronews business page for articles dated yesterday.

    Candidate articles are ``article.the-media-object`` elements, taken from
    the top-stories section when one of the known selectors matches and from
    the whole page otherwise. Each distinct link is fetched once; the date
    comes from the article page's first ``<time>`` element.

    Returns:
        tuple: ``(rows_df, meta)`` with the extracted rows and the counters /
        error list for the run. On a failure outside the per-article
        handling, an empty DataFrame and a zeroed ``meta`` holding the
        failure message.
    """
    site, url = "Euronews", "https://www.euronews.com/business"
    scraper, rows, errors = create_scraper(), [], []
    count_title = count_link = count_date = count_content = matched = 0

    def abs_url(href: str) -> str:
        """Return *href* unchanged if it starts with "http", else prefix the site origin."""
        href = (href or "").strip()
        return href if href.startswith("http") else f"https://www.euronews.com{href}"

    def extract_article_content(inner_soup):
        """Return the article body as newline-joined text, or None if no container or text is found."""
        content_selectors = [
            'div.c-article-content.c-article-content--business.js-article-content',
            'div.c-article-content.c-article-content--business',
            'div.c-article-content.js-article-content',
            'div.c-article-content',
            'article.c-article-content',
            'div#article-content',
        ]
        content_div = None
        for sel in content_selectors:
            content_div = inner_soup.select_one(sel)
            if content_div:
                break
        if not content_div:
            return None

        parts = []
        for tag in content_div.find_all(["p", "h2", "li"]):
            t = tag.get_text(" ", strip=True)
            if t:
                parts.append(t)

        # Drop entries that merely repeat the immediately preceding entry.
        parts = [p for i, p in enumerate(parts) if p and (i == 0 or p != parts[i-1])]
        return "\n".join(parts) if parts else None

    try:
        html = robust_get(scraper, url, headers).text
        soup = BeautifulSoup(html, "html.parser")

        section = None
        section_selectors = [
            'section[data-block="topStoriesVerticalBlock"]',
            'div.b-top-stories-vertical',
            'div.b-top-stories-vertical__container',
            'main .b-top-stories-vertical',
        ]
        for sel in section_selectors:
            section = soup.select_one(sel)
            if section:
                break

        if section:
            targets = section.select('article.the-media-object')
        else:
            targets = soup.select('article.the-media-object')

        seen_links = set()

        for art in targets:
            try:
                title = link = date = None
                link_tag = art.select_one('a.the-media-object__link')
                if not link_tag:
                    link_tag = art.select_one('figure.the-media-object__figure a')

                if link_tag and link_tag.get('href'):
                    link = abs_url(link_tag.get('href'))

                if not link or link in seen_links:
                    continue
                seen_links.add(link)

                title_tag = art.select_one('h2.the-media-object__title')
                if title_tag:
                    title = title_tag.get_text(strip=True)
                elif link_tag and link_tag.get('aria-label'):
                    title = link_tag.get('aria-label').strip()

                # A timeout keeps one stalled article request from hanging the crawl.
                inner_res = scraper.get(link, headers=headers, timeout=10)
                inner = BeautifulSoup(inner_res.text, "html.parser")

                # A missing <time> tag or datetime attribute now leaves date as
                # None (so count_date reflects it) instead of raising.
                date_tag = inner.find('time')
                date_raw = date_tag.get('datetime') if date_tag else None
                if date_raw:
                    # Accept both "YYYY-MM-DD hh:mm" and "YYYY-MM-DDThh:mm".
                    date = date_raw.strip().replace("T", " ", 1).split(" ")[0]

                if title: count_title += 1
                if link: count_link += 1
                if date: count_date += 1

                if date == yesterday_str:
                    matched += 1
                    content_text = extract_article_content(inner)
                    if content_text:
                        count_content += 1
                        rows.append({"매체명": site, "제목": title, "날짜": date, "내용": content_text, "링크": link})
            except Exception as e:
                errors.append(f"개별 기사 오류: {e}")

        meta = {"site": site, "url": url, "count_title": count_title, "count_link": count_link,
                "count_date": count_date, "count_content": count_content, "matched": matched, "errors": errors}
        return pd.DataFrame(rows), meta

    except Exception as e:
        return pd.DataFrame(), {"site": site, "url": url, "count_title": 0, "count_link": 0,
                                "count_date": 0, "count_content": 0, "matched": 0, "errors": [f"메인 로드 실패: {e}"]}

def crawl_thenextweb_sync():
    """Scrape The Next Web "latest" page for articles dated yesterday.

    Headline anchors from the top-articles and list-article blocks are
    collected (one entry per distinct link). Every article page is fetched
    to read its publication date, and the body text is extracted only when
    that date equals ``yesterday_str``.

    Returns:
        tuple: ``(rows_df, meta)`` with the extracted rows and the counters /
        error list for the run. On a failure outside the per-article
        handling, an empty DataFrame and a zeroed ``meta`` holding the
        failure message.
    """
    from urllib.parse import urljoin  # local import: not among the file-level imports

    site, url = "The Next Web", "https://thenextweb.com/latest"
    scraper, rows, errors = create_scraper(), [], []
    count_title = count_link = count_date = count_content = matched = 0

    try:
        res = scraper.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        targets, seen_links = [], set()
        anchors = soup.select("article.c-topArticles__article header h2 a") + soup.select("article.c-listArticle h2 a")
        for a in anchors:
            href = a.get("href") if a else None
            if not href:
                continue
            # urljoin resolves site-relative hrefs exactly like the former string
            # concatenation but also leaves already-absolute hrefs intact.
            link = urljoin(url, href)
            if link in seen_links:  # set lookup instead of rebuilding a list per anchor
                continue
            seen_links.add(link)
            targets.append({"title": a.get_text(strip=True), "link": link})

        for art in targets:
            try:
                date = None
                # A timeout keeps one stalled article request from hanging the crawl.
                inner_res = scraper.get(art["link"], headers=headers, timeout=10)
                inner_res.raise_for_status()
                inner = BeautifulSoup(inner_res.text, "html.parser")

                time_tag = inner.select_one("footer.c-article__pubDate time")
                if time_tag:
                    date_raw = time_tag.get_text(strip=True)
                    # Keep the text before the first "-" and parse it as e.g. "March 5, 2024".
                    date_str = date_raw.split("-")[0].strip()
                    try:
                        date = datetime.strptime(date_str, "%B %d, %Y").strftime("%Y-%m-%d")
                    except Exception:
                        # Unparseable text is kept as-is so it still counts as a date hit.
                        date = date_raw

                if art["title"]: count_title += 1
                if art["link"]: count_link += 1
                if date: count_date += 1

                if date == yesterday_str:
                    matched += 1
                    content_div = inner.select_one("div#article-main-content")
                    if content_div:
                        # Strip CTA/embed/ad/sponsor/newsletter containers before reading paragraphs.
                        for ad in content_div.select("div.channel-cta-wrapper, div[id^='hs-embed'], aside, div[class*='ad'], div[class*='sponsor'], div[class*='newsletter']"):
                            ad.decompose()
                        texts = [p.get_text(" ", strip=True) for p in content_div.find_all("p") if p.get_text(strip=True)]
                        if texts:
                            count_content += 1
                            rows.append({"매체명": site, "제목": art["title"], "날짜": date, "내용": "\n".join(texts), "링크": art["link"]})
            except Exception as e:
                errors.append(f"개별 기사 오류: {e}")

        meta = {"site": site, "url": url, "count_title": count_title, "count_link": count_link,
                "count_date": count_date, "count_content": count_content, "matched": matched, "errors": errors}
        return pd.DataFrame(rows), meta

    except Exception as e:
        return pd.DataFrame(), {"site": site, "url": url,
                                "count_title": 0, "count_link": 0, "count_date": 0,
                                "count_content": 0, "matched": 0, "errors": [f"메인 로드 실패: {e}"]}

def crawl_ainews_sync():
    """Scrape the AI NEWS all-categories page for articles dated yesterday.

    Uses cloudscraper sessions whose SSL contexts use the
    ``DEFAULT:@SECLEVEL=1`` cipher string. Title, link and date come from
    each ``div.e-loop-item`` in the listing; the article page is fetched
    only when the date equals ``yesterday_str``.

    Side effect: installs a process-wide ``ignore`` filter for
    ``InsecureRequestWarning``.

    Returns:
        tuple: ``(rows_df, meta)`` with the extracted rows and the counters /
        error list for the run. On a failure outside the per-article
        handling, an empty DataFrame and a zeroed ``meta`` holding the
        failure message.
    """
    site, url = "AI NEWS", "https://www.artificialintelligence-news.com/all-categories/"
    warnings.simplefilter("ignore", InsecureRequestWarning)
    ssl_context = ssl.create_default_context()
    ssl_context.set_ciphers('DEFAULT:@SECLEVEL=1')
    browser_profile = {"browser": "chrome", "platform": "windows", "mobile": False}
    scraper, rows, errors = cloudscraper.create_scraper(browser=browser_profile, ssl_context=ssl_context), [], []
    count_title = count_link = count_date = count_content = matched = 0
    # Article-page scraper, built on first use and then reused; it was
    # previously rebuilt (with a fresh SSL context) for every matched article.
    inner_scraper = None

    try:
        response = scraper.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.select("div.e-loop-item")

        for art in articles:
            try:
                date = None
                title_tag = art.select_one(".elementor-widget-theme-post-title h1 a")
                title = title_tag.get_text(strip=True) if title_tag else None
                link = title_tag["href"] if title_tag and title_tag.has_attr("href") else None

                date_tag = art.select_one(".elementor-element-18b13528 .elementor-heading-title")
                date_raw = date_tag.get_text(strip=True) if date_tag else None
                if date_raw:
                    try:
                        date = datetime.strptime(date_raw, "%B %d, %Y").strftime("%Y-%m-%d")
                    except Exception:
                        # Unparseable text is kept as-is so it still counts as a date hit.
                        date = date_raw

                if title: count_title += 1
                if link: count_link += 1
                if date: count_date += 1

                if date == yesterday_str:
                    matched += 1
                    if inner_scraper is None:
                        inner_context = ssl.create_default_context()
                        inner_context.set_ciphers('DEFAULT:@SECLEVEL=1')
                        inner_context.check_hostname = False
                        inner_scraper = cloudscraper.create_scraper(browser=browser_profile, ssl_context=inner_context)
                    # NOTE(review): verify=False together with check_hostname=False
                    # disables TLS certificate checking for article pages; kept
                    # as in the original, confirm it is still required.
                    inner_res = inner_scraper.get(
                        link,
                        verify=False,
                        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/128.0"},
                        timeout=10,
                    )
                    inner = BeautifulSoup(inner_res.text, "html.parser")
                    content_div = inner.select_one("div.elementor-widget-theme-post-content")
                    if content_div:
                        texts = [p.get_text(" ", strip=True) for p in content_div.find_all(["p", "h3"]) if p.get_text(strip=True)]
                        if texts:
                            count_content += 1
                            rows.append({"매체명": site, "제목": title, "날짜": date, "내용": "\n".join(texts), "링크": link})
            except Exception as e:
                errors.append(f"개별 기사 오류: {e}")

        meta = {"site": site, "url": url, "count_title": count_title, "count_link": count_link,
                "count_date": count_date, "count_content": count_content, "matched": matched, "errors": errors}
        return pd.DataFrame(rows), meta

    except Exception as e:
        return pd.DataFrame(), {"site": site, "url": url,
                                "count_title": 0, "count_link": 0, "count_date": 0,
                                "count_content": 0, "matched": 0, "errors": [f"메인 로드 실패: {e}"]}

def crawl_socialmedia_sync():
    """Scrape the Social Media Today front page for articles dated yesterday.

    Feed items inside ``section.dash-feed`` are followed (ad items are
    skipped); the date is read from each article page's
    ``span.published-info`` and the body text is extracted only when that
    date equals ``yesterday_str``.

    Returns:
        tuple: ``(rows_df, meta)`` with the extracted rows and the counters /
        error list for the run. On a failure outside the per-article
        handling, an empty DataFrame and a zeroed ``meta`` holding the
        failure message.
    """
    site, url = "Social Media Today", "https://www.socialmediatoday.com/"
    scraper, rows, errors = create_scraper(), [], []
    count_title = count_link = count_date = count_content = matched = 0

    try:
        soup = BeautifulSoup(scraper.get(url, headers=headers, timeout=10).text, "html.parser")
        feed_targets = soup.find("section", class_="dash-feed")
        targets = feed_targets.find_all("li", class_="row feed__item", recursive=True) if feed_targets else []

        for art in targets:
            try:
                date = None
                if "feed-item-ad" in art.get("class", []):
                    continue
                title_tag = art.find("h3", class_="feed__title")
                if not title_tag:
                    continue
                link_tag = title_tag.find("a")
                # The former one-liner applied ``if link_tag else None`` only to
                # the href, so a heading without an anchor still crashed on
                # ``link_tag.get_text``; such items are now skipped.
                if not link_tag or not link_tag.get("href"):
                    continue
                title, link = link_tag.get_text(strip=True), link_tag["href"]
                if link.startswith("/"):
                    link = f"https://www.socialmediatoday.com{link}"

                # A timeout keeps one stalled article request from hanging the crawl.
                inner = BeautifulSoup(scraper.get(link, headers=headers, timeout=10).text, "html.parser")
                date_tag = inner.find("span", class_="published-info")
                if date_tag:
                    text = date_tag.get_text(strip=True)
                    # Matches e.g. "Nov. 14, 2023"; the dot is removed before parsing.
                    m = re.search(r"([A-Za-z]+\.\s?\d{1,2},\s?\d{4})", text)
                    if m:
                        date_raw = m.group(1).replace(".", "")
                        date = datetime.strptime(date_raw, "%b %d, %Y").strftime("%Y-%m-%d")

                if title: count_title += 1
                if link: count_link += 1
                if date: count_date += 1

                if date == yesterday_str:
                    matched += 1
                    content_div = inner.find("div", class_="article-body")
                    if content_div:
                        # Remove nested div/script/form elements before reading paragraphs.
                        for ad in content_div.find_all(["div", "script", "form"], recursive=True):
                            ad.decompose()
                        texts = [p.get_text(" ", strip=True) for p in content_div.find_all("p") if p.get_text(strip=True)]
                        if texts:
                            count_content += 1
                            rows.append({"매체명": site, "제목": title, "날짜": date, "내용": "\n".join(texts), "링크": link})
            except Exception as e:
                errors.append(f"개별 기사 오류: {e}")

        meta = {"site": site, "url": url, "count_title": count_title, "count_link": count_link,
                "count_date": count_date, "count_content": count_content, "matched": matched, "errors": errors}
        return pd.DataFrame(rows), meta

    except Exception as e:
        return pd.DataFrame(), {"site": site, "url": url, "count_title": 0, "count_link": 0,
                                "count_date": 0, "count_content": 0, "matched": 0, "errors": [f"메인 로드 실패: {e}"]}

def crawl_einpresswire_sync():
    """Collect yesterday's EIN Presswire articles from the media/advertising/PR channel.

    The channel listing is fetched once; for every ``ul.pr-feed li.funlist0``
    entry the title, link and date are read. Only entries whose date equals
    the module-level ``yesterday_str`` are downloaded (via ``robust_get``)
    and have their body text extracted.

    Returns:
        tuple: ``(df, meta)``. ``df`` is a DataFrame with one row per
        collected article (keys 매체명, 제목, 날짜, 내용, 링크). ``meta`` is a
        dict holding the site name, listing URL, the counters
        ``count_title``/``count_link``/``count_date``/``count_content``,
        ``matched`` and a list of ``errors``. When loading or parsing the
        listing fails, an empty DataFrame and a zeroed ``meta`` carrying the
        failure message are returned.
    """
    # Substrings that typically show up when UTF-8 text was decoded with a
    # single-byte codec, plus the Unicode replacement character.
    _mojibake_signatures = (
        "Ã", "Â", "â€”", "â€“", "â€", "â€œ", "â€˜", "â€™", "â€¦", "�"
    )

    def _korean_ratio(s: str) -> float:
        """Return the share of Hangul syllables (U+AC00..U+D7A3) in *s*."""
        if not s:
            return 0.0
        return sum(0xAC00 <= ord(ch) <= 0xD7A3 for ch in s) / len(s)

    def fix_encoding(text: str) -> str:
        """Try to undo latin-1/UTF-8 mojibake; return *text* unchanged otherwise."""
        if not text:
            return text
        if not any(sig in text for sig in _mojibake_signatures):
            return text
        try:
            # NOTE(review): errors="ignore" silently drops characters that do
            # not fit latin-1 and bytes that are not valid UTF-8, so the
            # repaired string can lose content. Confirm this is acceptable.
            fixed = text.encode("latin1", errors="ignore").decode("utf-8", errors="ignore")
        except UnicodeError:
            return text
        # Accept the round trip when it removed replacement characters ...
        if fixed.count("�") < text.count("�"):
            return fixed
        # ... or when it did not reduce the share of Hangul.
        if _korean_ratio(fixed) >= _korean_ratio(text):
            return fixed
        return text

    site, url = "EIN Presswire", "https://www.einpresswire.com/channel/media-advertising-pr#"
    scraper, rows, errors = cloudscraper.create_scraper(), [], []
    count_title = count_link = count_date = count_content = matched = 0
    # <p> elements carrying any of these classes are boilerplate, not body text.
    skipped_classes = {"contact", "pr_video_title"}

    try:
        # requests has no default timeout; bound the wait so a stalled
        # connection cannot block this crawler (and whoever awaits it) forever.
        response = scraper.get(url, headers=headers, timeout=10)
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.select("ul.pr-feed li.funlist0")

        for art in articles:
            try:
                title_tag = art.select_one("h3 a")
                date_tag = art.select_one(".pretitle .date a")
                if not title_tag:
                    continue
                title, link = title_tag.get_text(strip=True), title_tag["href"]

                date_raw = date_tag.get_text(strip=True) if date_tag else None
                date = None
                if date_raw:
                    try:
                        date = datetime.strptime(date_raw, "%B %d, %Y").strftime("%Y-%m-%d")
                    except ValueError:
                        # Unexpected format: keep the raw text so the entry is
                        # still counted as having a date (it just won't match).
                        date = date_raw

                if title:
                    count_title += 1
                if link:
                    count_link += 1
                if date:
                    count_date += 1

                if date != yesterday_str:
                    continue

                matched += 1
                inner_res = robust_get(scraper, link, headers)
                inner_res.encoding = "utf-8"
                inner = BeautifulSoup(inner_res.text, "html.parser")
                content_div = inner.select_one("div.article_column.imported")
                if not content_div:
                    continue

                for t in content_div.select("script, iframe, img, style"):
                    t.decompose()
                for p in content_div.select("p"):
                    text = p.get_text(strip=True)
                    # Match on class membership so elements with additional
                    # classes (e.g. class="contact foo") are removed as well.
                    p_classes = p.get("class") or ()
                    if text.startswith("Legal Disclaimer") or skipped_classes.intersection(p_classes):
                        p.decompose()
                footer = content_div.select_one("div.article-footer")
                if footer:
                    footer.decompose()

                content = fix_encoding(content_div.get_text(separator="\n", strip=True))
                if content:
                    count_content += 1
                    rows.append({"매체명": site, "제목": title, "날짜": date, "내용": content, "링크": link})
            except Exception as e:
                # One broken article must not abort the whole listing.
                errors.append(f"개별 기사 오류: {e}")

        meta = {"site": site, "url": url, "count_title": count_title, "count_link": count_link,
                "count_date": count_date, "count_content": count_content, "matched": matched, "errors": errors}
        return pd.DataFrame(rows), meta

    except Exception as e:
        return pd.DataFrame(), {"site": site, "url": url, "count_title": 0, "count_link": 0,
                                "count_date": 0, "count_content": 0, "matched": 0, "errors": [f"메인 로드 실패: {e}"]}

# =========================
# 실행
# =========================

async def run_all_crawlers():
    """Run every site crawler and merge what they return.

    Most crawlers are dispatched to worker threads at once and awaited
    together; the remaining one runs afterwards on its own, followed by a
    short random pause.

    Returns:
        tuple: ``(df_data, df_warn, metas)`` - the concatenated article rows,
        the concatenated warning rows (one per meta for which ``should_warn``
        is true), and the list of per-crawler meta dicts in execution order.
        Both frames fall back to an empty frame with the standard columns.
    """
    concurrent_jobs = (
        crawl_marketingmag_sync,
        crawl_marketingbeat_sync,
        crawl_searchengine_sync,
        crawl_adweek_sync,
        crawl_marketingnews_sync,
        crawl_euronews_sync,
        crawl_thenextweb_sync,
        crawl_socialmedia_sync,
        crawl_einpresswire_sync,
        crawl_ainews_sync,
    )
    serial_jobs = (crawl_marketingtech_sync,)

    # Fan the blocking crawlers out to threads and wait for all of them.
    pending = [asyncio.to_thread(job) for job in concurrent_jobs]
    outcomes = list(await asyncio.gather(*pending))

    # These run one at a time, with a small jittered delay after each.
    for job in serial_jobs:
        outcomes.append(await asyncio.to_thread(job))
        await asyncio.sleep(random.uniform(0.3, 0.6))

    frames, warn_frames, metas = [], [], []
    for frame, meta in outcomes:
        metas.append(meta)
        if should_warn(meta):
            warn_frames.append(make_warning_row(meta, yesterday_str))
        if not frame.empty:
            frames.append(frame)

    if frames:
        df_data = pd.concat(frames, ignore_index=True)
    else:
        df_data = pd.DataFrame(columns=["매체명","제목","날짜","내용","링크"])

    if warn_frames:
        df_warn = pd.concat(warn_frames, ignore_index=True)
    else:
        df_warn = pd.DataFrame(columns=["매체명","제목","날짜","내용","링크"])

    return df_data, df_warn, metas

df_data, df_warn, metas = asyncio.get_event_loop().run_until_complete(run_all_crawlers())

# =========================
# Sheet update
# =========================
gc = gspread.service_account("{json 파일명}") # JSON key file required for the Google API
spreadsheet = gc.open_by_key("{스프레드시트 주소}")
worksheet = spreadsheet.worksheet("{스프레드시트 셀 이름}")
existing = worksheet.get_all_values()
next_row = len(existing) + 1  # append right below the rows already in the sheet

blocks = [frame for frame in (df_data, df_warn) if not frame.empty]

# Always bind df_out: the translation cell further down iterates it, and an
# empty crawl would otherwise leave it undefined (NameError) or, in a reused
# notebook kernel, pointing at the previous run's data.
if blocks:
    df_out = pd.concat(blocks, ignore_index=True)
else:
    df_out = pd.DataFrame(columns=["매체명", "제목", "날짜", "내용", "링크"])

if blocks:
    # The article body is not written to this sheet.
    df_upload = df_out.drop(columns=["내용"])
    set_with_dataframe(worksheet, df_upload, row=next_row, col=1, include_column_header=False)

print("=== METAS ===")
for m in metas:
    print(m)

=== METAS ===
{'site': 'Marketing Mag', 'url': 'https://www.marketingmag.com.au/', 'count_title': 14, 'count_link': 14, 'count_date': 14, 'count_content': 1, 'matched': 1, 'errors': ["개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 오류: 'NoneType' object has no attribute 'get'", "개별 기사 

In [None]:
# ChatGPT translation program
import openai

openai.api_key = "{gpt api 키 값}" # API key required to use the ChatGPT API; replace the placeholder before running

def translate_long_text(text, target_language="Korean"):
    """Translate *text* paragraph by paragraph through the OpenAI chat API.

    The input is split on blank lines (``\\n\\n``). Whitespace-only chunks are
    dropped; every other chunk is sent as its own request. If a request fails
    for any reason the error is printed and the untranslated chunk is kept.

    Args:
        text: The text to translate.
        target_language: Language name inserted into the prompts.

    Returns:
        The translated (or, on failure, original) chunks joined by ``\\n\\n``.
    """

    def _request_translation(chunk):
        # One chat completion per chunk; returns the stripped reply text.
        reply = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": f"You are a helpful assistant that translates text to {target_language}.",
                },
                {
                    "role": "user",
                    "content": f"Please translate the following text into {target_language}:\n\n{chunk}",
                },
            ],
        )
        return reply.choices[0].message.content.strip()

    results = []
    for chunk in text.split('\n\n'):
        if not chunk.strip():
            continue
        try:
            translated = _request_translation(chunk)
        except Exception as e:
            print(f"Error during translation: {e}")
            translated = chunk
        results.append(translated)

    return "\n\n".join(results)

# df_out comes from the crawling cell. Tolerate it being unbound (for example
# when nothing was collected) instead of failing with a NameError.
try:
    source_df = df_out
except NameError:
    source_df = pd.DataFrame(columns=["날짜", "제목", "내용"])

translated_data = []

for _, row in source_df.iterrows():
    date = row['날짜']
    title = row['제목']
    content = row['내용']

    translated_title = translate_long_text(str(title), target_language="Korean")
    translated_content = translate_long_text(str(content), target_language="Korean")

    translated_data.append({
        "날짜": date,
        "제목": translated_title,
        "내용": translated_content
    })

translated_df = pd.DataFrame(translated_data)

# Only touch the spreadsheet when there is something to write, mirroring the
# guard used for the untranslated upload.
if not translated_df.empty:
    gc = gspread.service_account("{json 파일명}") # JSON key file required for the Google API
    spreadsheet = gc.open_by_key("{스프레드시트 주소}")
    worksheet = spreadsheet.worksheet("{스프레드시트 셀 이름}")
    existing = worksheet.get_all_values()
    next_row = len(existing) + 1  # append right below the rows already in the sheet
    set_with_dataframe(worksheet, translated_df, row=next_row, col=1, include_column_header=False)
else:
    print("No rows to translate; skipping sheet upload.")