In [2]:
!pip install transformers torch fugashi[unidic-lite]

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting torch
  Downloading torch-2.7.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting fugashi[unidic-lite]
  Downloading fugashi-1.5.1-cp312-cp312-win_amd64.whl.metadata (7.5 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting unidic-lite (from fugashi[unidic-lite])
  Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)
     ---------------------------------------- 0.0/47.4 MB ? eta -:--:--
      --------------------------------------- 0.8/47.4 MB 4.2 

In [5]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

### 0Ô∏è‚É£ Í∞êÏÑ± Î∂ÑÏÑùÍ∏∞ ÏÑ∏ÌåÖ (KcBERT Í∏∞Î∞ò)
model_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


### 1Ô∏è‚É£ ÎÇ†ÏßúÎ≥Ñ Ï¢ÖÎ™©ÌÜ†Î°†Î∞© ÌÅ¨Î°§ÎßÅ Ìï®Ïàò
def crawl_board_by_date(stock_code, target_date_str, max_try=100):
    base_url = f"https://finance.naver.com/item/board.naver?code={stock_code}"
    target_date = datetime.strptime(target_date_str, "%Y-%m-%d").date()

    all_posts = []
    page = 1
    attempts = 0

    while attempts < max_try:
        url = f"{base_url}&page={page}"
        res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(res.text, "html.parser")
        rows = soup.select("table.type2 tr")

        found_today_post = False
        found_old_post = False

        for row in rows:
            cols = row.find_all("td")
            if len(cols) < 5:
                continue

            try:
                href = cols[0].find("a")["href"]
                detail_url = "https://finance.naver.com" + href
                date_text = cols[3].text.strip()

                # ÎÇ†Ïßú Ìè¨Îß∑ ÌåêÎ≥Ñ
                if ":" in date_text:
                    # Ïò§Îäò ÎÇ†Ïßú + ÏãúÍ∞ÑÎßå ÏûàÏùå (ex: "14:12")
                    post_date = datetime.now().date()
                else:
                    # ÎÇ†Ïßú + ÏãúÍ∞Ñ ÏûàÏùå (ex: "2024.06.19 11:45")
                    post_date = datetime.strptime(date_text, "%Y.%m.%d %H:%M").date()

                if post_date == target_date:
                    found_today_post = True
                    post_res = requests.get(detail_url, headers={"User-Agent": "Mozilla/5.0"})
                    post_soup = BeautifulSoup(post_res.text, "html.parser")
                    content = post_soup.select_one("div.view_se").get_text(strip=True)
                    all_posts.append({"ÎÇ†Ïßú": post_date, "Î≥∏Î¨∏": content})
                    time.sleep(0.1)
                elif post_date < target_date:
                    found_old_post = True
                    break

            except Exception:
                continue

        if not found_today_post or found_old_post:
            break

        page += 1
        attempts += 1
        time.sleep(0.2)

    return pd.DataFrame(all_posts)


### 2Ô∏è‚É£ Í∞êÏÑ± Î∂ÑÏÑù Ìï®Ïàò
def analyze_sentiment(text_list):
    results = sentiment_pipeline(text_list)
    sentiments = []
    for r in results:
        if r["label"] == "LABEL_0":
            sentiments.append(-r["score"])
        else:
            sentiments.append(r["score"])
    return sentiments


### 3Ô∏è‚É£ Ï†ÑÏ≤¥ ÌååÏù¥ÌîÑÎùºÏù∏: ÎÇ†Ïßú Î¶¨Ïä§Ìä∏Î°ú Í∞êÏÑ± Ï†êÏàò Í≥ÑÏÇ∞
def analyze_multiple_dates_sentiment(stock_code, date_list):
    all_data = []

    for date_str in date_list:
        print(f"üìÜ {date_str} ÏàòÏßë Ï§ë...")
        df = crawl_board_by_date(stock_code, date_str)
        if df.empty:
            print(f"‚ùó {date_str} Îç∞Ïù¥ÌÑ∞ ÏóÜÏùå")
            continue

        print(f"üß† Í∞êÏÑ± Î∂ÑÏÑù Ï§ë... ({len(df)}Í±¥)")
        df["Í∞êÏÑ±Ï†êÏàò"] = analyze_sentiment(df["Î≥∏Î¨∏"].tolist())

        daily_score = df["Í∞êÏÑ±Ï†êÏàò"].mean()
        all_data.append({"ÎÇ†Ïßú": date_str, "ÌèâÍ∑†Í∞êÏÑ±Ï†êÏàò": daily_score})
        time.sleep(0.5)

    return pd.DataFrame(all_data)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


In [6]:
analyze_multiple_dates_sentiment("005930", ["2024-06-19"])

üìÜ 2024-06-19 ÏàòÏßë Ï§ë...
‚ùó 2024-06-19 Îç∞Ïù¥ÌÑ∞ ÏóÜÏùå


In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# 1Ô∏è‚É£ Í∞êÏÑ± Î∂ÑÏÑùÍ∏∞ ÏÑ∏ÌåÖ (KcELECTRA)
print("üì¶ Í∞êÏÑ±Î∂ÑÏÑù Î™®Îç∏ Î°úÎî© Ï§ë...")
model_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print("‚úÖ Í∞êÏÑ±Î∂ÑÏÑùÍ∏∞ Î°úÎî© ÏôÑÎ£å\n")

# 2Ô∏è‚É£ Ï¢ÖÎ™©ÌÜ†Î°†Î∞©ÏóêÏÑú ÌäπÏ†ï ÎÇ†ÏßúÏùò Ï†úÎ™© ÏàòÏßë + Í∞êÏÑ±Î∂ÑÏÑù
def get_sentiment_scores_on_date(code="005930", target_date="2025.06.21", max_page=10):
    headers = {'User-Agent': 'Mozilla/5.0'}
    filtered_data = []

    for page in range(1, max_page + 1):
        url = f"https://finance.naver.com/item/board.naver?code={code}&page={page}"
        res = requests.get(url, headers=headers)
        res.encoding = 'euc-kr'

        soup = BeautifulSoup(res.text, 'html.parser')
        rows = soup.select("table.type2 tr")
        spans = soup.select('span.tah.p10.gray03')
        raw_texts = [span.get_text(strip=True) for span in spans]
        dates_only = [raw_texts[i].split()[0] for i in range(0, len(raw_texts), 2)]

        date_index = 0
        for row in rows:
            tds = row.find_all("td")
            if len(tds) < 5:
                continue

            title_tag = tds[1].find("a")
            if not title_tag:
                continue

            title = title_tag.get_text(strip=True)

            if date_index < len(dates_only):
                post_date = dates_only[date_index]
                date_index += 1

                if post_date == target_date:
                    filtered_data.append({
                        "Ï†úÎ™©": title,
                        "ÎÇ†Ïßú": post_date
                    })
                elif post_date < target_date:
                    return pd.DataFrame(filtered_data)

    df = pd.DataFrame(filtered_data)

    # 3Ô∏è‚É£ Í∞êÏÑ± Î∂ÑÏÑù ÏàòÌñâ
    if not df.empty:
        print(f"üß† Í∞êÏÑ± Î∂ÑÏÑù Ï§ë... ({len(df)}Í∞ú Ï†úÎ™©)")
        try:
            results = sentiment_pipeline(df["Ï†úÎ™©"].tolist())
            df["Í∞êÏÑ±Ï†êÏàò"] = [
                -r["score"] if r["label"] == "LABEL_0" else r["score"]
                for r in results
            ]
        except Exception as e:
            print(f"‚ùó Í∞êÏÑ± Î∂ÑÏÑù Ïò§Î•ò: {e}")
            df["Í∞êÏÑ±Ï†êÏàò"] = None
    else:
        print("‚ùó Ìï¥Îãπ ÎÇ†ÏßúÏóê Í≤åÏãúÍ∏ÄÏù¥ ÏóÜÏäµÎãàÎã§.")

    return df


# ‚úÖ ÏÇ¨Ïö© ÏòàÏãú
if __name__ == "__main__":
    stock_code = "005930"  # Ïòà: ÏÇºÏÑ±Ï†ÑÏûê
    target_date = "2025.06.21"
    result_df = get_sentiment_scores_on_date(stock_code, target_date, max_page=15)

    print("\nüìä ÏµúÏ¢Ö Í≤∞Í≥º:")
    print(result_df)

üì¶ Í∞êÏÑ±Î∂ÑÏÑù Î™®Îç∏ Î°úÎî© Ï§ë...


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


‚úÖ Í∞êÏÑ±Î∂ÑÏÑùÍ∏∞ Î°úÎî© ÏôÑÎ£å

‚ùó Ìï¥Îãπ ÎÇ†ÏßúÏóê Í≤åÏãúÍ∏ÄÏù¥ ÏóÜÏäµÎãàÎã§.

üìä ÏµúÏ¢Ö Í≤∞Í≥º:
Empty DataFrame
Columns: []
Index: []


In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_posts_with_content(code="196170", page=1):
    base_url = "https://finance.naver.com"
    headers = {'User-Agent': 'Mozilla/5.0'}
    all_data = []

    list_url = f"{base_url}/item/board.naver?code={code}&page={page}"
    res = requests.get(list_url, headers=headers)
    res.encoding = 'euc-kr'
    soup = BeautifulSoup(res.text, 'html.parser')

    rows = soup.select("table.type2 tr")
    for row in rows:
        tds = row.find_all("td")
        if len(tds) < 5:
            continue

        a_tag = tds[1].find("a")
        if not a_tag or not a_tag.has_attr('href'):
            continue

        title = a_tag['title']
        href = a_tag['href']
        detail_url = base_url + href

        # Î≥∏Î¨∏ ÌéòÏù¥ÏßÄ ÏöîÏ≤≠
        post_res = requests.get(detail_url, headers=headers)
        post_res.encoding = 'euc-kr'
        post_soup = BeautifulSoup(post_res.text, 'html.parser')

        content_div = post_soup.select_one("div.view_se")
        content = content_div.get_text(strip=True) if content_div else "Î≥∏Î¨∏ ÏóÜÏùå"

        all_data.append({
            "Ï†úÎ™©": title,
            "Î≥∏Î¨∏": content,
            "ÎßÅÌÅ¨": detail_url
        })

        time.sleep(0.2)  # ÏÑúÎ≤Ñ Í≥ºÎ∂ÄÌïò Î∞©ÏßÄ

    return pd.DataFrame(all_data)

# ‚úÖ Ïã§Ìñâ ÏòàÏãú
if __name__ == "__main__":
    df = get_posts_with_content("196170", page=1)
    print(df.head())

                              Ï†úÎ™©  \
0                   40ÎßåÏù¥ÌïòÎäîÏì∏Ïñ¥Îã¥Ïñ¥Îùº    
1                    Î°úÎ¥á Î≥¥Îã§ÎèÑ Î™ªÍ∞ÄÎ©¥    
2  Îã§Î•∏ Ï¢ÖÎ™© ...ÏÜåÍ≥†Í∏∞Ïóê ÏôÄÏù∏ ÏûîÏπò Î≤åÎ¶¨Í≥† ÏûàÎäîÎç∞..   
3                    Ìè≠Îì±Ìï¥ÏÑú „Ö°Ïö∏ÏßÄÎßêÍ≥†    
4       Ïû•Í¥Ä Ïù∏ÏÑ† ÎßàÎ¨¥Î¶¨Î°ú ÌóàÎãàÎ¨∏ÎèÑ ÎßàÎ¨¥Î¶¨ Îã®Í≥Ñ..   

                                                  Î≥∏Î¨∏  \
0                                        ÏàúÏãùÍ∞ÑÏóê40ÎßåÏù¥ÏÉÅÍ∞ÑÎã§   
1  ÎÇ®Îì§ Îèà Ïò§ÏßÄÍ≤åÎì§ Îã§Î≤åÍ≥† Î∞îÏù¥Ïò§ Îã®ÌÉÄÎ°ú Ïö©ÎèàÎ≤åÍ≥† ÎùºÏä§Ìä∏ ÏΩîÏä§Ìîº Ïàè Ï≥êÏÑú ÎèàÎ≤åÎ©¥ Ïò¨Ìï¥...   
2  ÏôïÎî∞Î∞îÏù¥Ïò§ ÎåÄÏû•Ìò∏Íµ¨ Í∞úÎØ∏ÎãàÎì§ÏùÄÎã¥Îã¨ 15ÎßåÏõê Î∞õÏïÑÏÑúÏÜåÏ£º Î®πÏùÑ ÏÉùÍ∞ÅÏóê Îì§Îñ† ÏûàÎã§Î©∞„Öã„Öã„Öã...   
3                                         Ìè≠Îì±Ï†ÑÏóê „Ö°ÌçºÎã¥ÏïÑÎùº   
4  ÏàòÏùµÏ±ôÍ∏∞Í≥†..ÏÜåÏô∏Ï£ºÎ∞îÏù¥Ïò§ Í¥ÄÏã¨ Í∞ÄÏ†∏Ïïº...ÏïåÌÖ°Ïù¥ÎßåÎπºÍ≥†„Öã„Öã„Öã„Öã„Öã„Öã„Öã„Öã„Öã„Öã„Öã„Öã„Öã„Öã„Öã„Öã„Öã...   

                                                  ÎßÅÌÅ¨  
0  

In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# 1Ô∏è‚É£ Í∞êÏÑ± Î∂ÑÏÑùÍ∏∞ Î°úÎî© (KcELECTRA Í∏∞Î∞ò)
print("üì¶ Í∞êÏÑ±Î∂ÑÏÑù Î™®Îç∏ Î°úÎî© Ï§ë...")
model_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print("‚úÖ Í∞êÏÑ±Î∂ÑÏÑùÍ∏∞ Î°úÎî© ÏôÑÎ£å\n")

# 2Ô∏è‚É£ Ï¢ÖÎ™©ÌÜ†Î°†Î∞© Í≤åÏãúÍ∏Ä Î≥∏Î¨∏ ÌÅ¨Î°§ÎßÅ Ìï®Ïàò
def get_posts_with_content(code="196170", page=1):
    base_url = "https://finance.naver.com"
    headers = {'User-Agent': 'Mozilla/5.0'}
    all_data = []

    list_url = f"{base_url}/item/board.naver?code={code}&page={page}"
    res = requests.get(list_url, headers=headers)
    res.encoding = 'euc-kr'
    soup = BeautifulSoup(res.text, 'html.parser')

    rows = soup.select("table.type2 tr")
    for row in rows:
        tds = row.find_all("td")
        if len(tds) < 5:
            continue

        a_tag = tds[1].find("a")
        if not a_tag or not a_tag.has_attr('href'):
            continue

        title = a_tag['title']
        href = a_tag['href']
        detail_url = base_url + href

        # Î≥∏Î¨∏ ÏöîÏ≤≠
        post_res = requests.get(detail_url, headers=headers)
        post_res.encoding = 'euc-kr'
        post_soup = BeautifulSoup(post_res.text, 'html.parser')

        content_div = post_soup.select_one("div.view_se")
        content = content_div.get_text(strip=True) if content_div else "Î≥∏Î¨∏ ÏóÜÏùå"

        all_data.append({
            "Ï†úÎ™©": title,
            "Î≥∏Î¨∏": content,
            "ÎßÅÌÅ¨": detail_url
        })

        time.sleep(0.2)  # ÏÑúÎ≤Ñ Î∂ÄÌïò Î∞©ÏßÄ

    return pd.DataFrame(all_data)

# 3Ô∏è‚É£ Í∞êÏÑ± Î∂ÑÏÑù Ìï®Ïàò
def analyze_sentiment(text_list):
    try:
        results = sentiment_pipeline(text_list)
        return [
            -r["score"] if r["label"] == "LABEL_0" else r["score"]
            for r in results
        ]
    except Exception as e:
        print("‚ùó Í∞êÏÑ± Î∂ÑÏÑù Ïò§Î•ò:", e)
        return [None] * len(text_list)

# ‚úÖ Ïã§Ìñâ ÏòàÏãú
if __name__ == "__main__":
    df = get_posts_with_content("196170", page=1)

    if not df.empty:
        print("üß† Î≥∏Î¨∏ Í∞êÏÑ± Î∂ÑÏÑù Ï§ë...")
        df["Í∞êÏÑ±Ï†êÏàò"] = analyze_sentiment(df["Î≥∏Î¨∏"].tolist())
        print(df[["Ï†úÎ™©", "Í∞êÏÑ±Ï†êÏàò"]].head())
        print(f"\nüìà ÌèâÍ∑† Í∞êÏÑ± Ï†êÏàò: {df['Í∞êÏÑ±Ï†êÏàò'].mean():.4f}")
    else:
        print("‚ùó Í≤åÏãúÍ∏ÄÏù¥ ÏóÜÏäµÎãàÎã§.")

üì¶ Í∞êÏÑ±Î∂ÑÏÑù Î™®Îç∏ Î°úÎî© Ï§ë...


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


‚úÖ Í∞êÏÑ±Î∂ÑÏÑùÍ∏∞ Î°úÎî© ÏôÑÎ£å

üß† Î≥∏Î¨∏ Í∞êÏÑ± Î∂ÑÏÑù Ï§ë...
                              Ï†úÎ™©      Í∞êÏÑ±Ï†êÏàò
0            „Ö°.38ÎßåÏõê ÏßÄÌÇ§Î†§Í≥† ÏïàÍ∞ÑÌûòÏùÑ Ïì¥Îã§ -0.518336
1                   40ÎßåÏù¥ÌïòÎäîÏì∏Ïñ¥Îã¥Ïñ¥Îùº  -0.532847
2                    Î°úÎ¥á Î≥¥Îã§ÎèÑ Î™ªÍ∞ÄÎ©¥  -0.516636
3  Îã§Î•∏ Ï¢ÖÎ™© ...ÏÜåÍ≥†Í∏∞Ïóê ÏôÄÏù∏ ÏûîÏπò Î≤åÎ¶¨Í≥† ÏûàÎäîÎç∞.. -0.527990
4                    Ìè≠Îì±Ìï¥ÏÑú „Ö°Ïö∏ÏßÄÎßêÍ≥†   0.503164

üìà ÌèâÍ∑† Í∞êÏÑ± Ï†êÏàò: -0.1604


In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# 1Ô∏è‚É£ Í∞êÏÑ± Î∂ÑÏÑùÍ∏∞ Î°úÎî© (KcELECTRA Í∏∞Î∞ò)
print("üì¶ Í∞êÏÑ±Î∂ÑÏÑù Î™®Îç∏ Î°úÎî© Ï§ë...")
model_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print("‚úÖ Í∞êÏÑ±Î∂ÑÏÑùÍ∏∞ Î°úÎî© ÏôÑÎ£å\n")

# 2Ô∏è‚É£ Ï¢ÖÎ™©ÌÜ†Î°†Î∞© Í≤åÏãúÍ∏Ä Î≥∏Î¨∏ ÌÅ¨Î°§ÎßÅ Ìï®Ïàò
def get_posts_with_content(code="196170", page=1):
    base_url = "https://finance.naver.com"
    headers = {'User-Agent': 'Mozilla/5.0'}
    all_data = []

    list_url = f"{base_url}/item/board.naver?code={code}&page={page}"
    res = requests.get(list_url, headers=headers)
    res.encoding = 'euc-kr'
    soup = BeautifulSoup(res.text, 'html.parser')

    rows = soup.select("table.type2 tr")
    for row in rows:
        tds = row.find_all("td")
        if len(tds) < 5:
            continue

        a_tag = tds[1].find("a")
        if not a_tag or not a_tag.has_attr('href'):
            continue

        title = a_tag['title']
        href = a_tag['href']
        detail_url = base_url + href

        # Î≥∏Î¨∏ ÏöîÏ≤≠
        post_res = requests.get(detail_url, headers=headers)
        post_res.encoding = 'euc-kr'
        post_soup = BeautifulSoup(post_res.text, 'html.parser')

        content_div = post_soup.select_one("div.view_se")
        content = content_div.get_text(strip=True) if content_div else "Î≥∏Î¨∏ ÏóÜÏùå"

        all_data.append({
            "Ï†úÎ™©": title,
            "Î≥∏Î¨∏": content,
            "ÎßÅÌÅ¨": detail_url
        })

        time.sleep(0.2)  # ÏÑúÎ≤Ñ Î∂ÄÌïò Î∞©ÏßÄ

    return pd.DataFrame(all_data)

# 3Ô∏è‚É£ Í∞êÏÑ± Î∂ÑÏÑù Ìï®Ïàò (Î∞∞Ïπò Ï≤òÎ¶¨)
def analyze_sentiment_batched(text_list, batch_size=16):
    sentiments = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i+batch_size]
        try:
            results = sentiment_pipeline(batch)
            for r in results:
                score = -r["score"] if r["label"] == "LABEL_0" else r["score"]
                sentiments.append(score)
        except Exception as e:
            print("‚ùó Í∞êÏÑ± Î∂ÑÏÑù Ïò§Î•ò:", e)
            sentiments.extend([None] * len(batch))
    return sentiments

# ‚úÖ Ïã§Ìñâ ÏòàÏãú
if __name__ == "__main__":
    df = get_posts_with_content("196170", page=1)

    if not df.empty:
        print(f"üß† Ï†ÑÏ≤¥ Î≥∏Î¨∏ {len(df)}Í±¥ Í∞êÏÑ± Î∂ÑÏÑù Ï§ë...")
        df["Í∞êÏÑ±Ï†êÏàò"] = analyze_sentiment_batched(df["Î≥∏Î¨∏"].tolist())
        print(df[["Ï†úÎ™©", "Í∞êÏÑ±Ï†êÏàò"]])
        print(f"\nüìà ÌèâÍ∑† Í∞êÏÑ± Ï†êÏàò: {df['Í∞êÏÑ±Ï†êÏàò'].mean():.4f}")
    else:
        print("‚ùó Í≤åÏãúÍ∏ÄÏù¥ ÏóÜÏäµÎãàÎã§.")

üì¶ Í∞êÏÑ±Î∂ÑÏÑù Î™®Îç∏ Î°úÎî© Ï§ë...


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


‚úÖ Í∞êÏÑ±Î∂ÑÏÑùÍ∏∞ Î°úÎî© ÏôÑÎ£å

üß† Ï†ÑÏ≤¥ Î≥∏Î¨∏ 20Í±¥ Í∞êÏÑ± Î∂ÑÏÑù Ï§ë...
                               Ï†úÎ™©      Í∞êÏÑ±Ï†êÏàò
0             „Ö°.38ÎßåÏõê ÏßÄÌÇ§Î†§Í≥† ÏïàÍ∞ÑÌûòÏùÑ Ïì¥Îã§ -0.510188
1                    40ÎßåÏù¥ÌïòÎäîÏì∏Ïñ¥Îã¥Ïñ¥Îùº   0.529826
2                     Î°úÎ¥á Î≥¥Îã§ÎèÑ Î™ªÍ∞ÄÎ©¥   0.512113
3   Îã§Î•∏ Ï¢ÖÎ™© ...ÏÜåÍ≥†Í∏∞Ïóê ÏôÄÏù∏ ÏûîÏπò Î≤åÎ¶¨Í≥† ÏûàÎäîÎç∞..  0.512002
4                     Ìè≠Îì±Ìï¥ÏÑú „Ö°Ïö∏ÏßÄÎßêÍ≥†  -0.502430
5        Ïû•Í¥Ä Ïù∏ÏÑ† ÎßàÎ¨¥Î¶¨Î°ú ÌóàÎãàÎ¨∏ÎèÑ ÎßàÎ¨¥Î¶¨ Îã®Í≥Ñ..  0.516859
6            Î°úÎ¥áÏ£º 2Ï∞®Îû†Î¶¨ ÏãúÏûë ÎêêÎã§ÎÑ§Ïöî..^^  0.526363
7                     Íµ≠Í∞Ä Í∏∞Í∞Ñ ÏÇ∞ÏóÖÏúºÎ°ú  -0.521550
8              Îπ®Í∞ÑÎ∂à Ïû†ÍπêÎßå Î≥¥Ïó¨Ï£ºÎ©¥ ÏïàÎêòÍ≤†Îãà~  0.507657
9        Ìô©Ï≤úÍ∏∏ Îã§Ïù¥Î†âÌä∏ ÏûêÏú®Ï£ºÌñâ ÌóàÎ≤åÎùºÏù¥ÌîÑ  Ï∞¨Ìã∞Îì§  0.509163
10                      ÏïåÌÖåÏò§Ï†† Ïò§Î•¥ÎäîÎ≤ï  0.519891
11                      ÏïåÌÖåÏò§Ï†† Ïò§Î•¥ÎäîÎ≤ï  0.516878
12                 ÏïåÌÖå Í∞úÎØ∏ Ïù¥Îèô Î∂ÑÏÑù Î≥

In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# 1Ô∏è‚É£ Í∞êÏÑ± Î∂ÑÏÑùÍ∏∞ Î°úÎî© (KcELECTRA Í∏∞Î∞ò)
print("üì¶ Í∞êÏÑ±Î∂ÑÏÑù Î™®Îç∏ Î°úÎî© Ï§ë...")
model_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print("‚úÖ Í∞êÏÑ±Î∂ÑÏÑùÍ∏∞ Î°úÎî© ÏôÑÎ£å\n")

# 2Ô∏è‚É£ Ï¢ÖÎ™©ÌÜ†Î°†Î∞© Í≤åÏãúÍ∏Ä Î≥∏Î¨∏ ÌÅ¨Î°§ÎßÅ Ìï®Ïàò
def get_posts_with_content(code="196170", page=1):
    base_url = "https://finance.naver.com"
    headers = {'User-Agent': 'Mozilla/5.0'}
    all_data = []

    list_url = f"{base_url}/item/board.naver?code={code}&page={page}"
    res = requests.get(list_url, headers=headers)
    res.encoding = 'euc-kr'
    soup = BeautifulSoup(res.text, 'html.parser')

    rows = soup.select("table.type2 tr")
    for row in rows:
        tds = row.find_all("td")
        if len(tds) < 5:
            continue

        a_tag = tds[1].find("a")
        if not a_tag or not a_tag.has_attr('href'):
            continue

        title = a_tag['title']
        href = a_tag['href']
        detail_url = base_url + href

        # Î≥∏Î¨∏ ÏöîÏ≤≠
        post_res = requests.get(detail_url, headers=headers)
        post_res.encoding = 'euc-kr'
        post_soup = BeautifulSoup(post_res.text, 'html.parser')

        content_div = post_soup.select_one("div.view_se")
        content = content_div.get_text(strip=True) if content_div else "Î≥∏Î¨∏ ÏóÜÏùå"

        all_data.append({
            "Ï†úÎ™©": title,
            "Î≥∏Î¨∏": content,
            "ÎßÅÌÅ¨": detail_url
        })

        time.sleep(0.2)  # ÏÑúÎ≤Ñ Î∂ÄÌïò Î∞©ÏßÄ

    return pd.DataFrame(all_data)

# 3Ô∏è‚É£ Í∞êÏÑ± Î∂ÑÏÑù Ìï®Ïàò (Î∞∞Ïπò Ï≤òÎ¶¨)
def analyze_sentiment_batched(text_list, batch_size=16):
    sentiments = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i+batch_size]
        try:
            results = sentiment_pipeline(batch)
            for r in results:
                score = -r["score"] if r["label"] == "LABEL_0" else r["score"]
                sentiments.append(score)
        except Exception as e:
            print("‚ùó Í∞êÏÑ± Î∂ÑÏÑù Ïò§Î•ò:", e)
            sentiments.extend([None] * len(batch))
    return sentiments

# ‚úÖ Ïã§Ìñâ ÏòàÏãú
if __name__ == "__main__":
    df = get_posts_with_content("196170", page=1)

    if not df.empty:
        print(f"üß† Ï†ÑÏ≤¥ 'Î≥∏Î¨∏' {len(df)}Í±¥ Í∞êÏÑ± Î∂ÑÏÑù Ï§ë...")
        df["Í∞êÏÑ±Ï†êÏàò"] = analyze_sentiment_batched(df["Î≥∏Î¨∏"].tolist())
        print(df[["Î≥∏Î¨∏", "Í∞êÏÑ±Ï†êÏàò"]].head())
        print(f"\nüìà ÌèâÍ∑† Í∞êÏÑ± Ï†êÏàò: {df['Í∞êÏÑ±Ï†êÏàò'].mean():.4f}")
    else:
        print("‚ùó Í≤åÏãúÍ∏ÄÏù¥ ÏóÜÏäµÎãàÎã§.")

üì¶ Í∞êÏÑ±Î∂ÑÏÑù Î™®Îç∏ Î°úÎî© Ï§ë...


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


‚úÖ Í∞êÏÑ±Î∂ÑÏÑùÍ∏∞ Î°úÎî© ÏôÑÎ£å

üß† Ï†ÑÏ≤¥ 'Î≥∏Î¨∏' 20Í±¥ Í∞êÏÑ± Î∂ÑÏÑù Ï§ë...
                                                  Î≥∏Î¨∏      Í∞êÏÑ±Ï†êÏàò
0                                          Î†àÏù∏Î≥¥Ïö∞ Îî∞ÎùºÍ∞ÄÏûê  0.547531
1                                                „Ö°.Ìóê  0.536404
2                                        ÏàúÏãùÍ∞ÑÏóê40ÎßåÏù¥ÏÉÅÍ∞ÑÎã§  0.524589
3  ÎÇ®Îì§ Îèà Ïò§ÏßÄÍ≤åÎì§ Îã§Î≤åÍ≥† Î∞îÏù¥Ïò§ Îã®ÌÉÄÎ°ú Ïö©ÎèàÎ≤åÍ≥† ÎùºÏä§Ìä∏ ÏΩîÏä§Ìîº Ïàè Ï≥êÏÑú ÎèàÎ≤åÎ©¥ Ïò¨Ìï¥...  0.522002
4  ÏôïÎî∞Î∞îÏù¥Ïò§ ÎåÄÏû•Ìò∏Íµ¨ Í∞úÎØ∏ÎãàÎì§ÏùÄÎã¥Îã¨ 15ÎßåÏõê Î∞õÏïÑÏÑúÏÜåÏ£º Î®πÏùÑ ÏÉùÍ∞ÅÏóê Îì§Îñ† ÏûàÎã§Î©∞„Öã„Öã„Öã...  0.532115

üìà ÌèâÍ∑† Í∞êÏÑ± Ï†êÏàò: 0.4772


In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time

def generate_date_range(start_date: str, end_date: str) -> list:
    """ÎÇ†Ïßú Î≤îÏúÑ ÏÉùÏÑ± Ìï®Ïàò"""
    start = datetime.strptime(start_date, "%Y.%m.%d")
    end = datetime.strptime(end_date, "%Y.%m.%d")
    return [(start + timedelta(days=i)).strftime("%Y.%m.%d") for i in range((end - start).days + 1)]

def get_all_posts_by_date_range(code, start_date, end_date, page_step=10, max_limit=100):
    """ÎÑ§Ïù¥Î≤Ñ Í∏àÏúµ Í≤åÏãúÍ∏Ä ÌÅ¨Î°§ÎßÅ Ìï®Ïàò"""
    target_dates = set(generate_date_range(start_date, end_date))
    base_url = "https://finance.naver.com"
    headers = {'User-Agent': 'Mozilla/5.0'}
    filtered_data = []
    collected_dates = set()
    current_page = 1
    max_page = page_step

    while not target_dates.issubset(collected_dates):
        new_data_found = False  # ÏÉà Í∏Ä ÌÉêÏßÄ Ïó¨Î∂Ä
        for page in range(current_page, max_page + 1):
            list_url = f"{base_url}/item/board.naver?code={code}&page={page}"
            res = requests.get(list_url, headers=headers)
            res.encoding = 'euc-kr'
            soup = BeautifulSoup(res.text, 'html.parser')

            rows = soup.select("table.type2 tr")
            date_tags = soup.select("span.tah.p10.gray03")
            raw_texts = [span.get_text(strip=True) for span in date_tags]
            dates_only = [raw_texts[i].split()[0] for i in range(0, len(raw_texts), 2)]

            if not dates_only:
                print(f":warning: ÌéòÏù¥ÏßÄ {page}ÏóêÏÑú Îçî Ïù¥ÏÉÅ Í≤åÏãúÍ∏Ä ÏóÜÏùå. Ï¢ÖÎ£åÌï©ÎãàÎã§.")
                return pd.DataFrame(filtered_data)

            date_index = 0
            for row in rows:
                tds = row.find_all("td")
                if len(tds) < 5:
                    continue

                a_tag = tds[1].find("a")
                if not a_tag or not a_tag.has_attr('href'):
                    continue

                title = a_tag['title']
                href = a_tag['href']
                detail_url = base_url + href

                if date_index >= len(dates_only):
                    continue

                post_date = dates_only[date_index]
                date_index += 1

                if post_date in target_dates:
                    new_data_found = True
                    post_res = requests.get(detail_url, headers=headers)
                    post_res.encoding = 'euc-kr'
                    post_soup = BeautifulSoup(post_res.text, 'html.parser')
                    content_div = post_soup.select_one("div.view_se")
                    content = content_div.get_text(strip=True) if content_div else "Î≥∏Î¨∏ ÏóÜÏùå"

                    filtered_data.append({
                        "ÎÇ†Ïßú": post_date,
                        "Ï†úÎ™©": title,
                        "Î≥∏Î¨∏": content,
                        "ÎßÅÌÅ¨": detail_url
                    })

                    collected_dates.add(post_date)
                    time.sleep(0.1)

        if not new_data_found:
            print(":white_check_mark: Îçî Ïù¥ÏÉÅ ÏàòÏßëÌï† ÏÉàÎ°úÏö¥ Îç∞Ïù¥ÌÑ∞ ÏóÜÏùå. Ï¢ÖÎ£åÌï©ÎãàÎã§.")
            break

        current_page = max_page + 1
        max_page += page_step

        if max_page > max_limit:
            print(f":no_entry_sign: ÏµúÎåÄ ÌéòÏù¥ÏßÄ Ï†úÌïú({max_limit}) ÎèÑÎã¨. Ï§ëÎã®Ìï©ÎãàÎã§.")
            break

    return pd.DataFrame(filtered_data)

def save_crawled_data(df, code, start_date, end_date):
    """ÌÅ¨Î°§ÎßÅÎêú Îç∞Ïù¥ÌÑ∞Î•º ÌååÏùºÎ°ú Ï†ÄÏû•"""
    filename = f"crawled_data_{code}_{start_date.replace('.', '')}_{end_date.replace('.', '')}.csv"
    df.to_csv(filename, index=False, encoding='utf-8-sig')
    print(f"ÌÅ¨Î°§ÎßÅ Îç∞Ïù¥ÌÑ∞Í∞Ä {filename}Ïóê Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§.")
    return filename

if __name__ == "__main__":
    print("=== ÎÑ§Ïù¥Î≤Ñ Í∏àÏúµ Í≤åÏãúÍ∏Ä ÌÅ¨Î°§Îü¨ ===")
    
    code = input("Ï¢ÖÎ™© ÏΩîÎìúÎ•º ÏûÖÎ†•ÌïòÏÑ∏Ïöî (Ïòà: 005930): ").strip()
    start_date = input("ÏãúÏûë ÎÇ†ÏßúÎ•º ÏûÖÎ†•ÌïòÏÑ∏Ïöî (Ïòà: 2025.05.23): ").strip()
    end_date = input("ÎÅù ÎÇ†ÏßúÎ•º ÏûÖÎ†•ÌïòÏÑ∏Ïöî (Ïòà: 2025.06.23): ").strip()

    print(f"\n{code} Ï¢ÖÎ™©Ïùò {start_date} ~ {end_date} Í∏∞Í∞Ñ Í≤åÏãúÍ∏ÄÏùÑ ÌÅ¨Î°§ÎßÅÌï©ÎãàÎã§...")
    
    df = get_all_posts_by_date_range(code, start_date, end_date)
    
    if not df.empty:
        filename = save_crawled_data(df, code, start_date, end_date)
        print(f"\nÌÅ¨Î°§ÎßÅ ÏôÑÎ£å! Ï¥ù {len(df)}Í∞úÏùò Í≤åÏãúÍ∏ÄÏùÑ ÏàòÏßëÌñàÏäµÎãàÎã§.")
        print(f"ÌååÏùºÎ™Ö: {filename}")
    else:
        print("ÌÅ¨Î°§ÎßÅÎêú Îç∞Ïù¥ÌÑ∞Í∞Ä ÏóÜÏäµÎãàÎã§.") 

=== ÎÑ§Ïù¥Î≤Ñ Í∏àÏúµ Í≤åÏãúÍ∏Ä ÌÅ¨Î°§Îü¨ ===

090430 Ï¢ÖÎ™©Ïùò 2025.05.23 ~ 2025.06.23 Í∏∞Í∞Ñ Í≤åÏãúÍ∏ÄÏùÑ ÌÅ¨Î°§ÎßÅÌï©ÎãàÎã§...


ConnectionError: ('Connection aborted.', ConnectionAbortedError(10053, 'ÌòÑÏû¨ Ïó∞Í≤∞ÏùÄ ÏÇ¨Ïö©ÏûêÏùò Ìò∏Ïä§Ìä∏ ÏãúÏä§ÌÖúÏùò ÏÜåÌîÑÌä∏Ïõ®Ïñ¥Ïùò ÏùòÌï¥ Ï§ëÎã®ÎêòÏóàÏäµÎãàÎã§', None, 10053, None))