In [None]:
# 라이브러리 불러오기
import re, time, random
from datetime import datetime, timedelta, date
from collections import Counter
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

import pandas as pd
import gspread
from gspread_dataframe import set_with_dataframe

# Load the company/keyword list from the article-collection spreadsheet.
gc = gspread.service_account('{json 파일명}')
spreadsheet = gc.open_by_key("{스프레드시트 주소}")
worksheet = spreadsheet.worksheet("{스프레드시트 셀 이름}")
keyword_list = sum(worksheet.get("D2:D3"), [])
rows = worksheet.get("B2:E3")

# Map each search keyword (column D) to its company name (column B), bonus
# company name(s) (column C) and company-specific keywords (column E).
keyword_info = {}
for row in rows:
    # A returned row can hold fewer than four cells (trailing empty cells are
    # omitted), so pad it before unpacking instead of indexing row[3] directly.
    group, plus, keyword, point = (list(row) + [""] * 4)[:4]
    if not keyword:
        # Without a search keyword there is nothing to crawl for this row.
        continue
    keyword_info[keyword] = {"기업명": group, "가산 기업명": plus, "기업별 키워드": point}

# Date range used in the search URL: the whole previous calendar month.
today = date.today()
first_day_this_month = date(today.year, today.month, 1)
last_day_prev_month = first_day_this_month - timedelta(days=1)
finish_date_str = last_day_prev_month
start_date_str = date(finish_date_str.year, finish_date_str.month, 1)
# Dotted form (YYYY.MM.DD) for the ds/de query parameters.
start_date1, finish_date1 = (f"{d:%Y.%m.%d}" for d in (start_date_str, finish_date_str))
# Compact form (YYYYMMDD) for the nso query parameter.
start_date2, finish_date2 = (f"{d:%Y%m%d}" for d in (start_date_str, finish_date_str))

# Automatic class discovery: fetch one search-result page and use it to
# detect the generated CSS class names needed by the crawler below.
class_url = "https://search.naver.com/search.naver?ssc=tab.news.all&where=news&sm=tab_jum&query=강남" # arbitrary query that returns many articles
class_headers = {"User-Agent": "Mozilla/5.0"}
# The timeout keeps the run from hanging indefinitely if the server never
# answers; it matches the timeout used for the keyword searches.
class_html = requests.get(class_url, headers=class_headers, timeout=5).text
class_soup = BeautifulSoup(class_html, "html.parser")

root = class_soup.select_one("div.group_news")
if root is None:
    raise RuntimeError("group_news 컨테이너를 찾지 못했습니다.")

def extra_tokens(cls_list, must_have):
    """Return the class tokens that are candidates for a generated class name.

    A token is kept when it is not listed in ``must_have``, does not start
    with one of the known prefixes (``sds-``, ``fds-``, ``api_``, ``_``) and
    consists solely of at least eight letters, digits or underscores.
    The input order is preserved.
    """
    skipped_prefixes = ("sds-", "fds-", "api_", "_")
    kept = []
    for token in cls_list:
        if token in must_have or token.startswith(skipped_prefixes):
            continue
        if re.fullmatch(r"[A-Za-z0-9_]{8,}", token) is None:
            continue
        kept.append(token)
    return kept

# Main article container: among the vertical-layout divs that contain a
# profile element, count the last extra class token of each and keep the
# most frequent one.
vertical_counter = Counter()
for div in root.select("div.sds-comps-vertical-layout.sds-comps-full-layout"):
    if not div.select_one(".sds-comps-profile"):
        continue
    tokens = extra_tokens(
        div.get("class", []),
        ["sds-comps-vertical-layout", "sds-comps-full-layout"],
    )
    if tokens:
        vertical_counter[tokens[-1]] += 1

if vertical_counter:
    vertical_tail = vertical_counter.most_common(1)[0][0]
    class1 = f"sds-comps-vertical-layout sds-comps-full-layout {vertical_tail}"
else:
    # Nothing detected: both names are left as None.
    vertical_tail = None
    class1 = None

# Sub-article container: among the base-layout divs that contain an anchor
# whose class attribute includes an underscore, count the last extra class
# token of each and keep the most frequent one. The matching divs are
# remembered in base_blocks together with their token.
base_counter = Counter()
base_blocks = []
for div in root.select("div.sds-comps-base-layout.sds-comps-full-layout"):
    if not div.select_one("a[class*=_]"):
        continue
    tokens = extra_tokens(
        div.get("class", []),
        ["sds-comps-base-layout", "sds-comps-full-layout"],
    )
    if not tokens:
        continue
    base_counter[tokens[-1]] += 1
    base_blocks.append((div, tokens[-1]))

if base_counter:
    base_tail = base_counter.most_common(1)[0][0]
    class3 = f"sds-comps-base-layout sds-comps-full-layout {base_tail}"
else:
    # Nothing detected: both names are left as None.
    base_tail = None
    class3 = None

# Main article body div: count every extra class token on the base-layout
# divs of the whole page and keep the tokens that occur exactly 10 times
# (presumably the number of main results per page - TODO confirm).
token_counter = Counter()
for div in class_soup.select("div.sds-comps-base-layout.sds-comps-full-layout"):
    tokens = extra_tokens(div.get("class", []),
                          ["sds-comps-base-layout", "sds-comps-full-layout"])
    if tokens:
        token_counter.update(tokens)

# NOTE(review): `token != class3` compares a single token with the full
# space-separated class string, so it never filters anything out. If the
# intent was to exclude the sub-article token, compare against `base_tail`;
# verify against the live page before changing it, since the current
# behaviour may be what makes the main-article lookup work.
token_10_diff = [token for token, count in token_counter.items()
                 if count == 10 and token != class3]

# Always bind class2 so a failed detection yields None (like class1 and
# class3) instead of a NameError when the crawler first uses it.
class2 = (
    f"sds-comps-base-layout sds-comps-full-layout {' '.join(token_10_diff)}"
    if token_10_diff
    else None
)

# Sub-article title anchor: in the first block carrying the most common
# sub-article token, find an anchor that has both an underscore-free class
# token and an underscore-containing one (each at least 10 characters).
anchor_pair = None
# Always bind class4 so a failed detection yields None (like class1 and
# class3) instead of a NameError when the crawler first uses it.
class4 = None
target_div = None
for div, tail in base_blocks:
    if tail == base_tail:
        target_div = div
        break

if target_div is not None:
    for a in target_div.select("a"):
        cl = a.get("class", [])
        if not cl:
            continue
        no_unders = [c for c in cl if "_" not in c and re.fullmatch(r"[A-Za-z0-9]{10,}", c)]
        with_unders = [c for c in cl if "_" in c and re.fullmatch(r"[A-Za-z0-9_]{10,}", c)]
        if no_unders and with_unders:
            # Longest token of each kind; max() returns the first of equal
            # lengths, the same choice as a stable descending sort.
            left = max(no_unders, key=len)
            right = max(with_unders, key=len)
            class4 = f"{left} {right}"
            break

# Naver news crawling
def _is_date_label(text):
    """Return True for a relative ("3일 전") or dotted ("2024.05.01.") date label."""
    if not text:
        return False
    if any(unit in text for unit in ('분 전', '시간 전', '일 전', '주 전')):
        return True
    return re.fullmatch(r'\d{4}\.\d{2}\.\d{2}.', text.strip()) is not None


def _find_date_text(container):
    """Return the text of the first date-like <span> in *container*, or None."""
    if container is None:
        return None
    date_tag = container.find('span', string=_is_date_label)
    return date_tag.get_text(strip=True) if date_tag else None


def _find_naver_link(container, url_markers):
    """Return the first <a> in *container* whose href contains one of *url_markers*."""
    if container is None:
        return None
    return container.find(
        "a",
        href=lambda href: href and any(marker in href for marker in url_markers),
    )


def _fetch_naver_article(link, headers):
    """Download a Naver-hosted article page and return ``(title, content)``.

    Either element is None when it cannot be found. A failed request is
    reported and yields ``(None, None)`` so one unreachable article does not
    abort the whole crawl.
    """
    try:
        detail_response = requests.get(link, headers=headers, timeout=5)
    except requests.RequestException as exc:
        print(f"기사 본문 요청 실패: {link} ({exc})")
        return None, None
    detail_soup = BeautifulSoup(detail_response.text, "html.parser")

    # Title: the first selector that matches wins.
    title = None
    for selector in ("h2#title_area.media_end_head_headline",
                     ".media_end_head_title span",
                     "h2.ArticleHead_article_title__qh8GV"):
        detail_title_tag = detail_soup.select_one(selector)
        if detail_title_tag:
            title = detail_title_tag.get_text(" ", strip=True)
            break

    # Body: prefer article#dic_area with its span/em/br/div descendants
    # removed; otherwise fall back to div._article_content.
    content = None
    detail_article = detail_soup.find("article", {"id": "dic_area"})
    if detail_article:
        for content_tag in detail_article.find_all(["span", "em", "br", "div"]):
            content_tag.decompose()
        content = detail_article.get_text(separator="\n", strip=True)
    else:
        content_tag = detail_soup.select_one("div._article_content")
        if content_tag:
            content = content_tag.get_text(separator="\n", strip=True)
    return title, content


def _make_news_item(keyword, title, link, content, date_text):
    """Build one result row; the title becomes a Sheets HYPERLINK formula."""
    # Double quotes inside a formula string are escaped by doubling them.
    safe_title = title.replace('"', '""')
    return {
        "검색어": keyword,
        "제목": f'=HYPERLINK("{link}", "{safe_title}")',
        "내용": content,
        "날짜": date_text,
    }


def crawl_naver_news(keyword):
    """Crawl one Naver news search-result page for *keyword*.

    The search is restricted to the date range held in the module-level
    ``start_date1``/``finish_date1``/``start_date2``/``finish_date2`` values.
    Articles are located with the module-level class strings ``class1`` to
    ``class4``; a pass whose class string is None is skipped. When an article
    links to a Naver-hosted page, title and body are taken from that page;
    otherwise only the search-result title and link are used. Articles for
    which no title can be determined are skipped.

    Returns:
        A list of dicts with the keys "검색어", "제목", "내용" and "날짜".

    Raises:
        requests.RequestException: if the search request itself fails.
    """
    encoded_kw = quote(keyword)
    url = f"https://search.naver.com/search.naver?ssc=tab.news.all&query={encoded_kw}&sm=tab_opt&sort=0&photo=0&field=0&pd=3&ds={start_date1}&de={finish_date1}&docid=&related=0&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so%3Ar%2Cp%3Afrom{start_date2}to{finish_date2}&is_sug_officeid=0&office_category=0&service_area=0"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/123.0.0.0 Safari/537.36"
    }

    response = requests.get(url, headers=headers, timeout=5)
    soup = BeautifulSoup(response.text, "html.parser")

    news_items = []

    # Main articles
    main_articles = soup.find_all("div", class_=class1) if class1 else []
    for article in main_articles:
        main_div = article.find('div', class_=class2) if class2 else None
        if main_div is None:
            # Without the body div neither title nor link can be read.
            continue

        title_tag = main_div.find('span', class_='sds-comps-text sds-comps-text-ellipsis sds-comps-text-ellipsis-1 sds-comps-text-type-headline1')
        fallback_title = title_tag.get_text(" ", strip=True) if title_tag else None

        profile = article.find('div', class_='sds-comps-horizontal-layout sds-comps-inline-layout sds-comps-profile-info')
        link_tag = _find_naver_link(
            profile,
            ("n.news.naver.com", "m.entertain.naver.com", "m.sports.naver.com"),
        )
        date_text = _find_date_text(profile)

        title, content = None, None
        if link_tag:
            # Naver-hosted article: read title and body from the article page.
            link = link_tag["href"]
            title, content = _fetch_naver_article(link, headers)
        else:
            # External article: only the search-result title and link exist.
            outer_link = main_div.find('a', href=True)
            link = outer_link['href'] if outer_link else None

        title = title or fallback_title
        if not title:
            continue
        news_items.append(_make_news_item(keyword, title, link, content, date_text))

    # Sub articles
    sub_articles = soup.find_all("div", class_=class3) if class3 else []
    for article in sub_articles:
        title_tag = article.find("a", class_=class4) if class4 else None
        fallback_title = title_tag.get_text(" ", strip=True) if title_tag else None

        link_tag = _find_naver_link(
            article,
            ("n.news.naver.com", "m.entertain.naver.com/article/", "m.sports.naver.com/article/"),
        )
        date_text = _find_date_text(article)

        title, content = None, None
        if link_tag:
            link = link_tag["href"]
            title, content = _fetch_naver_article(link, headers)
        else:
            link = title_tag.get('href') if title_tag else None

        title = title or fallback_title
        if not title:
            continue
        news_items.append(_make_news_item(keyword, title, link, content, date_text))

    return news_items

all_news = []
retry_keywords = []


def _collect_for_keyword(keyword, label=""):
    """Crawl *keyword*, tag each row with its sheet metadata and store it.

    The rows are appended to ``all_news`` and a progress line prefixed with
    *label* is printed. Returns the number of rows collected.
    """
    info = keyword_info[keyword]
    company_news = crawl_naver_news(keyword)
    for item in company_news:
        item["기업명"] = info["기업명"]
        item["가산 기업명"] = info["가산 기업명"]
        item["기업별 키워드"] = info["기업별 키워드"]
    all_news.extend(company_news)
    print(f"{label}{keyword}: {len(company_news)}건 수집 완료")
    return len(company_news)


# First pass over every keyword; keywords that return nothing are retried.
for keyword in keyword_info:
    if _collect_for_keyword(keyword) == 0:
        retry_keywords.append(keyword)
    time.sleep(random.uniform(3.5, 6.5))

# Second attempt for keywords that returned nothing, with a longer pause.
second_retry_keywords = []
for keyword in retry_keywords:
    if _collect_for_keyword(keyword, "[재시도 1회차] ") == 0:
        second_retry_keywords.append(keyword)
    time.sleep(random.uniform(5.5, 9.5))

# Third and final attempt for keywords that are still empty.
for keyword in second_retry_keywords:
    _collect_for_keyword(keyword, "[재시도 2회차] ")
    time.sleep(random.uniform(6.5, 10.5))

# Upload the collected rows below the existing sheet content.
# df is built unconditionally because the scoring cell reads it; when nothing
# was collected it is an empty frame with the expected columns.
df = pd.DataFrame(all_news, columns=["검색어", "기업명", "가산 기업명", "기업별 키워드", "제목", "내용", "날짜"])
if not df.empty:
    worksheet = spreadsheet.worksheet("{스프레드시트 셀 이름}")
    existing_data = worksheet.get_all_values()
    next_row = len(existing_data) + 1
    set_with_dataframe(worksheet, df, row=next_row, col=1, include_column_header=False)
else:
    # This branch means no article was collected, not that the upload failed.
    print("수집된 기사가 없어 스프레드시트 업로드를 건너뜁니다.")

In [None]:
# Select the representative article for each company
import pandas as pd
import numpy as np
import re

from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN

import gspread
from gspread_dataframe import set_with_dataframe
from gspread.utils import rowcol_to_a1

from datetime import datetime
from dateutil.relativedelta import relativedelta

# Label of the previous month in "YYYY.MM." form; it tags every scored row
# and is looked up in the sheet header further below.
today = datetime.today()
last_month = today + relativedelta(months=-1)
formatted = f"{last_month:%Y.%m.}"

# 공통 함수
def clean_text(text):
    """Return *text* reduced to Hangul syllables, ASCII letters and digits.

    The value is converted with ``str()``. Bracketed segments such as
    ``[속보]`` are replaced by a space, as is every other character that is
    not a Hangul syllable, ASCII letter, digit or whitespace. Runs of
    whitespace are then collapsed to one space and the ends are stripped.
    """
    cleaned = str(text)
    for pattern in (r"\[[^\]]*\]", r"[^가-힣a-zA-Z0-9\s]", r"\s+"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()

def normalize_text(text):
    """Return the lower-cased text with only Hangul syllables, a-z and 0-9 kept.

    The value is converted with ``str()`` first; every other character,
    including whitespace, is removed.
    """
    lowered = str(text).lower()
    return "".join(re.findall(r"[가-힣a-z0-9]", lowered))

def make_corp_pattern(corp_name):
    """Return a regex source string matching *corp_name* as a standalone word.

    The name must not be preceded by a Hangul syllable, ASCII letter or
    digit. It may be followed by one of the listed Korean particles (captured
    in group 1), and whatever comes after that must not be a Hangul syllable,
    ASCII letter or digit. "삼성이 발표" therefore matches for "삼성", while
    "삼성전자" does not.
    """
    corp_escaped = re.escape(str(corp_name))
    josa_pattern = (
        "이|가|은|는|을|를|의|에|에서|으로|로|에게|께|께서|도|만|까지|부터|와|과|랑|하고|조차|마저|뿐"
        "|이라도|이라서|으로서|으로써|에게서|부터의|까지의|밖에"
        "|이라면|이라도|이라든가|이라며|이라서|이라니|이라면|이나|이나마|이거나|이며|이었다|이었던|이기에|이지만|이니까|이만큼|이만큼은|이라고"
    )
    # No lookahead may sit between the name and the particle group: rejecting
    # Hangul right after the name would stop every particle from matching.
    # The trailing lookahead alone rejects longer words, because the engine
    # backtracks through the alternatives (including "no particle") until the
    # boundary holds.
    pattern = rf'(?<![가-힣A-Za-z0-9]){corp_escaped}({josa_pattern})?(?![가-힣A-Za-z0-9])'
    return pattern

def is_related(title, content, corp_name):
    """Return True when the title or body mentions one of the given companies.

    *corp_name* may hold several names separated by commas. Photo-credit
    phrases ("사진 | X", "X 제공", "X 출처", "사진 = X") are removed for every
    name before searching, so a credit alone does not count as a mention.
    Returns False when *corp_name* is empty, None or NaN.
    """
    # `x != x` is only true for NaN, which would otherwise become the
    # literal name "nan".
    if not corp_name or corp_name != corp_name:
        return False

    corp_list = [c.strip() for c in str(corp_name).split(",") if c.strip()]

    temp_text = f"{title} {content}"
    for corp in corp_list:
        # Escape the name so characters such as "(", "+" or "." are literal.
        name = re.escape(corp)
        for credit in (rf"사진\s*\|\s*{name}",
                       rf"{name}\s*제공",
                       rf"{name}\s*출처",
                       rf"사진\s*=\s*{name}"):
            temp_text = re.sub(credit, "", temp_text)

    return any(re.search(make_corp_pattern(corp), temp_text) for corp in corp_list)

def count_valid_corp_mentions(text, corp_name):
    """Count standalone mentions of *corp_name* in *text*.

    Photo-credit phrases ("사진 | X", "X 제공", "X 출처", "사진 = X") are
    removed before counting. Returns 0 for an empty or None name.
    """
    if not corp_name:
        # An empty name would build a pattern that matches empty strings.
        return 0

    cleaned_text = str(text)
    # Escape the name so characters such as "(", "+" or "." are literal.
    name = re.escape(str(corp_name))
    for credit in (rf"사진\s*\|\s*{name}",
                   rf"{name}\s*제공",
                   rf"{name}\s*출처",
                   rf"사진\s*=\s*{name}"):
        cleaned_text = re.sub(credit, "", cleaned_text)

    corp_pattern = make_corp_pattern(corp_name)
    return sum(1 for _ in re.finditer(corp_pattern, cleaned_text))

# Score calculation
# Words whose presence anywhere in the title or body earns the info bonus.
_INFO_KEYWORDS = (
    "CEO", "MOU", "공개", "공시", "기업", "경쟁", "경쟁력", "계약", "계획",
    "검토", "규모", "기록", "기술", "네트워크", "달성", "라운드", "매출",
    "명예", "모금", "목표", "민간", "법인", "배당", "벤처", "발행", "발표",
    "비전", "비율", "사업", "산업", "상장", "서비스", "성과", "성장",
    "성장률", "스타트업", "수상", "수여", "수익", "시장", "시리즈", "신규",
    "실적", "압류", "업계", "에너지", "영업", "연구", "유치", "이익", "인수",
    "인증", "잠재력", "자금", "자산", "자본", "점유율", "정부", "제휴", "전망",
    "제품", "정책", "조달", "주관사", "지분", "창업", "출범",
    "출자", "출시", "캠페인", "콘퍼런스", "특허", "투자", "패션", "파트너",
    "파트너십", "펀드", "펀딩", "프로모션", "평가", "확대", "혁신", "확장",
    "회원", "협업", "흑자", "비상장", "적자", "추진", "전략", "브랜드",
    "글로벌", "구독자", "생태계", "손익", "손실", "수출", "진출",
)


def _is_missing(value):
    """Return True for None, NaN, or a value whose string form is blank."""
    # `value != value` is only true for NaN.
    return value is None or value != value or not str(value).strip()


def calc_score(cluster_size, corp_name, bonus_name, title, content, keywords):
    """Score one article and return ``(score, detail)``.

    Components:
      * base: 1.0
      * cluster: 0.1 per article in the same cluster, capped at 3.0
      * company in title: 3.0 if any name from *bonus_name* matches the title
      * repeated company: 1.0 if those names are mentioned 3+ times overall
      * info keyword: 2.0 if any of ``_INFO_KEYWORDS`` occurs in the text
      * length: 0.3 / 0.2 / 0.1 for bodies longer than 3000 / 1500 / 800 chars
      * entertainment: -4.0 if "entertain" occurs in the title
        (case-insensitive)
      * company keyword: 2.0 if any entry of *keywords* occurs in the text
        (case-insensitive)

    *bonus_name* and *keywords* are comma-separated strings; square brackets
    in *keywords* are ignored. None/NaN/blank values are treated as absent.
    *corp_name* is accepted for interface compatibility but is not used.

    Returns:
        The total rounded to 3 decimals, and a dict with one entry per
        component.
    """
    title = "" if title is None else str(title)
    content = "" if content is None else str(content)

    base = 1.0
    text = f"{title} {content}"
    bonus_list = (
        []
        if _is_missing(bonus_name)
        else [b.strip() for b in str(bonus_name).split(",") if b.strip()]
    )
    corp_mention = sum(count_valid_corp_mentions(text, b) for b in bonus_list)

    cluster_bonus = min(cluster_size * 0.1, 3.0)

    bonus_corp_bonus = 0
    if any(re.search(make_corp_pattern(b), title) for b in bonus_list):
        bonus_corp_bonus = 3.0

    repeat_bonus = 1.0 if corp_mention >= 3 else 0
    info_bonus = 2.0 if any(k in text for k in _INFO_KEYWORDS) else 0

    body_length = len(content)
    if body_length > 3000:
        length_bonus = 0.3
    elif body_length > 1500:
        length_bonus = 0.2
    elif body_length > 800:
        length_bonus = 0.1
    else:
        length_bonus = 0

    entertain_penalty = -4.0 if "entertain" in title.lower() else 0

    keyword_bonus = 0
    if not _is_missing(keywords):
        wanted = [
            k.strip().lower()
            for k in str(keywords).replace("[", "").replace("]", "").split(",")
            if k.strip()
        ]
        text_lower = text.lower()
        if any(word in text_lower for word in wanted):
            keyword_bonus = 2.0

    score = base + cluster_bonus + bonus_corp_bonus + repeat_bonus + info_bonus + length_bonus + entertain_penalty + keyword_bonus

    detail = {
        "기본 점수": base,
        "반복 주제 점수": cluster_bonus,
        "기업명 포함 점수": bonus_corp_bonus,
        "반복 기업명 점수": repeat_bonus,
        "투자 및 유치 점수": info_bonus,
        "기사 길이 점수": length_bonus,
        "엔터 기사 점수": entertain_penalty,
        "기업별 키워드 점수": keyword_bonus
    }

    return round(score, 3), detail

# Preprocessing: make sure title and body are strings without missing values.
df["제목"] = df["제목"].fillna("").astype(str)
df["내용"] = df["내용"].fillna("").astype(str)

# Load the sentence-embedding model.
model = SentenceTransformer("jhgan/ko-sroberta-multitask")

# One entry per scored article.
results = []

# Score the articles of each search keyword.
for keyword, group in df.groupby("검색어"):
    corp = group["기업명"].iloc[0]
    bonus = group["가산 기업명"].iloc[0] if "가산 기업명" in group.columns else None
    # Keep only the articles that mention one of the bonus company names.
    related = group[group.apply(lambda r: is_related(r["제목"], r["내용"], bonus), axis=1)]
    if related.empty:
        continue

    texts = [clean_text(f"{title} {content}")
             for title, content in zip(related["제목"], related["내용"])]
    # Request NumPy output: scikit-learn expects an array, and a torch tensor
    # that lives on a GPU cannot be converted implicitly.
    emb = model.encode(texts, convert_to_numpy=True)
    # min_samples=1 puts every article in a cluster; the cluster size is the
    # number of articles carrying the same label.
    labels = DBSCAN(eps=0.35, min_samples=1, metric="cosine").fit(emb).labels_

    for i, (idx, row) in enumerate(related.iterrows()):
        cluster_size = np.sum(labels == labels[i])
        score, detail = calc_score(cluster_size, corp, bonus, row["제목"], row["내용"], row["기업별 키워드"])
        results.append({
            "기업명": corp,
            "제목": row["제목"],
            "내용": row["내용"],
            "날짜": row["날짜"],
            "점수": score,
            "세부 점수": detail,
            "구분": formatted
        })

# Collect the results and upload the full score table.
rep_df = pd.DataFrame(results)

if not rep_df.empty:
    # 1 when the article has body text, 0 otherwise; used as a tie-breaker.
    # Computed only here because an empty result frame has no "내용" column.
    rep_df["내용_"] = rep_df["내용"].apply(lambda x: 0 if pd.isna(x) or str(x).strip() == "" else 1)

    top_articles = rep_df.sort_values(by=["기업명", "점수", "내용_"], ascending=[True, False, False])
    worksheet = spreadsheet.worksheet("{스프레드시트 셀 이름}")
    existing = worksheet.get_all_values()
    next_row = len(existing) + 1
    set_with_dataframe(worksheet, top_articles, row=next_row, col=1, include_column_header=False)
    print("스프레드시트 업로드 완료")
else:
    print("관련 기사 없음")

# Upload the best article per company (collection result sheet).
if not rep_df.empty:
    filtered_df = rep_df[rep_df["점수"] >= 2].copy()

    if not filtered_df.empty:
        # Keep the whole top-ranked row of each company. groupby().first()
        # would take the first non-null value of every column separately and
        # could combine fields from different articles.
        top_articles = (filtered_df.sort_values(by=["기업명", "점수", "내용_"], ascending=[True, False, False])
                                   .drop_duplicates(subset="기업명", keep="first")
                                   .reset_index(drop=True))
        worksheet = spreadsheet.worksheet("{스프레드시트 셀 이름}")
        existing = worksheet.get_all_values()
        next_row = len(existing) + 1
        set_with_dataframe(worksheet, top_articles, row=next_row, col=1, include_column_header=False)
        print("스프레드시트 업로드 완료")
    else:
        print("2점 이상인 기사가 없습니다.")
else:
    print("관련 기사 없음")

# Write each company's best article title into the list sheet.
spreadsheet = gc.open_by_key("{스프레드시트 주소}")
list_ws = spreadsheet.worksheet("{스프레드시트 셀 이름}")

# Find the column whose row-4 header equals the previous-month label.
header_row = list_ws.row_values(4)
try:
    formatted_col = [h.lower().strip() for h in header_row].index(formatted) + 1
except ValueError:
    raise Exception("4행에 formatted 값이 없습니다.")

# The titles go two columns to the right of that header.
target_col = formatted_col + 2
# rowcol_to_a1(1, col) is the column letters followed by the row number "1";
# dropping the last character leaves the letters for any column width.
target_col_letter = rowcol_to_a1(1, target_col)[:-1]

print(f"기준 열: {header_row[formatted_col-1]} ({formatted_col}열): 업로드 위치: +2 = {target_col_letter}열")

company_cells = list_ws.range("H6:H")

if not rep_df.empty:
    filtered_df = rep_df[rep_df["점수"] >= 2].copy()

    if not filtered_df.empty:
        # Keep the whole top-ranked row of each company. groupby().first()
        # would take the first non-null value of every column separately and
        # could combine fields from different articles.
        top_articles = (
            filtered_df.sort_values(by=["기업명", "점수", "내용_"], ascending=[True, False, False])
                       .drop_duplicates(subset="기업명", keep="first")
                       .reset_index(drop=True)
        )
        title_by_company = dict(zip(top_articles["기업명"], top_articles["제목"]))

        # Collect all writes and send them in one request instead of one API
        # call per company.
        updates = []
        written = []
        for cell in company_cells:
            company_name = cell.value.strip() if cell.value else None
            if company_name and company_name in title_by_company:
                upload_cell = rowcol_to_a1(cell.row, target_col)
                updates.append({"range": upload_cell, "values": [[title_by_company[company_name]]]})
                written.append((company_name, upload_cell))

        if updates:
            # USER_ENTERED makes Sheets evaluate the HYPERLINK formulas.
            list_ws.batch_update(updates, value_input_option="USER_ENTERED")
            for company_name, upload_cell in written:
                print(f"{company_name}: {upload_cell} 입력 완료")

        print("스프레드시트 업데이트 완료")
    else:
        print("2점 이상인 기사가 없습니다.")
else:
    print("관련 기사 없음")