In [2]:
!pip install transformers torch fugashi[unidic-lite]

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting torch
  Downloading torch-2.7.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting fugashi[unidic-lite]
  Downloading fugashi-1.5.1-cp312-cp312-win_amd64.whl.metadata (7.5 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting unidic-lite (from fugashi[unidic-lite])
  Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)
     ---------------------------------------- 0.0/47.4 MB ? eta -:--:--
      --------------------------------------- 0.8/47.4 MB 4.2 

In [5]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

### 0️⃣ Sentiment analyzer setup (the checkpoint below is KcELECTRA, not KcBERT)
# NOTE(review): the warning recorded in this cell's output reports that the
# `classifier.*` weights are "newly initialized" for this checkpoint, i.e. the
# classification head has not been trained. LABEL_0 / LABEL_1 scores from this
# pipeline should be confirmed against a fine-tuned checkpoint before use.
model_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Module-level pipeline object that the analysis functions in this cell call.
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


### 1️⃣ 날짜별 종목토론방 크롤링 함수
def crawl_board_by_date(stock_code, target_date_str, max_try=100, timeout=10):
    """Collect the bodies of discussion-board posts written on one date.

    Walks the Naver Finance board list for ``stock_code`` page by page. For
    every list row dated ``target_date_str`` the post page is downloaded and
    the text of its ``div.view_se`` element is stored.

    Args:
        stock_code: Value placed in the ``code`` query parameter.
        target_date_str: Target date formatted as ``"%Y-%m-%d"``.
        max_try: Maximum number of list pages to request.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        pandas.DataFrame with one row per collected post and the columns
        "날짜" (``datetime.date``) and "본문" (``str``). The frame is empty
        (no columns) when nothing was collected.

    Raises:
        ValueError: If ``target_date_str`` does not match ``"%Y-%m-%d"``.
        requests.RequestException: If a list-page request fails.
    """
    base_url = f"https://finance.naver.com/item/board.naver?code={stock_code}"
    target_date = datetime.strptime(target_date_str, "%Y-%m-%d").date()
    headers = {"User-Agent": "Mozilla/5.0"}

    all_posts = []

    for page in range(1, max_try + 1):
        res = requests.get(f"{base_url}&page={page}", headers=headers, timeout=timeout)
        # Force EUC-KR decoding, as the other crawlers in this notebook do.
        res.encoding = "euc-kr"
        soup = BeautifulSoup(res.text, "html.parser")

        saw_post_row = False
        found_old_post = False

        for row in soup.select("table.type2 tr"):
            cols = row.find_all("td")
            if len(cols) < 5:
                continue

            # The post link sits in the second cell (the working crawlers in
            # this notebook read it from there). The date is looked up with
            # the same span selector the title-based crawler uses.
            # Assumes the first such span in a row is the post timestamp --
            # TODO confirm against the live markup.
            link = cols[1].find("a")
            date_span = row.select_one("span.tah.p10.gray03")
            if link is None or not link.has_attr("href") or date_span is None:
                continue

            date_text = date_span.get_text(strip=True)
            try:
                # Full timestamp, e.g. "2024.06.19 11:45". Tried first because
                # it also contains ":" and must not be mistaken for time-only.
                post_date = datetime.strptime(date_text, "%Y.%m.%d %H:%M").date()
            except ValueError:
                try:
                    # Time-only text, e.g. "14:12", is treated as "today".
                    datetime.strptime(date_text, "%H:%M")
                except ValueError:
                    continue  # Not a recognisable post row.
                post_date = datetime.now().date()

            saw_post_row = True

            if post_date < target_date:
                # The list is scanned newest-first, so nothing further can match.
                found_old_post = True
                break
            if post_date > target_date:
                continue

            detail_url = "https://finance.naver.com" + link["href"]
            try:
                post_res = requests.get(detail_url, headers=headers, timeout=timeout)
            except requests.RequestException as exc:
                # Best effort: one unreachable post must not abort the crawl.
                print(f"❗ 본문 요청 실패: {detail_url} ({exc})")
                continue
            post_res.encoding = "euc-kr"
            body = BeautifulSoup(post_res.text, "html.parser").select_one("div.view_se")
            if body is None:
                continue
            all_posts.append({"날짜": post_date, "본문": body.get_text(strip=True)})
            time.sleep(0.1)

        # Keep paging while every post seen so far is newer than the target.
        # Stop once an older post appears or the page has no post rows at all.
        if found_old_post or not saw_post_row:
            break

        time.sleep(0.2)

    return pd.DataFrame(all_posts)


### 2️⃣ 감성 분석 함수
def analyze_sentiment(text_list, max_length=512):
    """Convert pipeline predictions into signed sentiment scores.

    Args:
        text_list: List of strings to classify.
        max_length: Token limit passed to the tokenizer together with
            ``truncation=True`` so that over-long texts are cut instead of
            raising inside the model.

    Returns:
        A list of floats aligned with ``text_list``. Each value is the
        predicted score, negated when the predicted label is ``"LABEL_0"``.
        An empty input yields an empty list without calling the pipeline.
    """
    if len(text_list) == 0:
        return []

    results = sentiment_pipeline(text_list, truncation=True, max_length=max_length)
    # LABEL_0 is mapped to the negative side; every other label stays positive.
    return [-r["score"] if r["label"] == "LABEL_0" else r["score"] for r in results]


### 3️⃣ 전체 파이프라인: 날짜 리스트로 감성 점수 계산
def analyze_multiple_dates_sentiment(stock_code, date_list):
    """Compute the mean sentiment score of board posts for each given date.

    For every date string the board is crawled with ``crawl_board_by_date``,
    the post bodies are scored with ``analyze_sentiment`` and the mean score
    is recorded. Dates for which the crawl returns an empty frame are reported
    and skipped.

    Args:
        stock_code: Stock code forwarded to ``crawl_board_by_date``.
        date_list: Iterable of date strings forwarded one by one to the crawler.

    Returns:
        pandas.DataFrame with one row per date that had posts, holding the
        columns "날짜" and "평균감성점수".
    """
    daily_rows = []

    for day in date_list:
        print(f"📆 {day} 수집 중...")
        posts = crawl_board_by_date(stock_code, day)

        if not posts.empty:
            print(f"🧠 감성 분석 중... ({len(posts)}건)")
            posts["감성점수"] = analyze_sentiment(posts["본문"].tolist())
            daily_rows.append({"날짜": day, "평균감성점수": posts["감성점수"].mean()})
            # Pause between dates that triggered a full crawl.
            time.sleep(0.5)
        else:
            print(f"❗ {day} 데이터 없음")

    return pd.DataFrame(daily_rows)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


In [6]:
# Run the full crawl-and-score pipeline for stock code 005930 on a single date.
# NOTE(review): the output recorded for this cell reports that no posts were
# collected for this date.
analyze_multiple_dates_sentiment("005930", ["2024-06-19"])

📆 2024-06-19 수집 중...
❗ 2024-06-19 데이터 없음


In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# 1️⃣ Sentiment analyzer setup (KcELECTRA)
# NOTE(review): the warning recorded in this cell's output reports that the
# `classifier.*` weights are "newly initialized" for this checkpoint, i.e. the
# classification head has not been trained. Confirm the scores against a
# fine-tuned checkpoint before relying on them.
print("📦 감성분석 모델 로딩 중...")
model_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Module-level pipeline object used by get_sentiment_scores_on_date below.
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print("✅ 감성분석기 로딩 완료\n")

# 2️⃣ 종목토론방에서 특정 날짜의 제목 수집 + 감성분석
def get_sentiment_scores_on_date(code="005930", target_date="2025.06.21", max_page=10, timeout=10):
    """Collect post titles written on ``target_date`` and score their sentiment.

    Board list pages 1..``max_page`` are scanned. Scanning stops as soon as a
    post older than ``target_date`` is seen. The titles collected up to that
    point are still scored (previously the function returned early without
    the score column).

    Args:
        code: Stock code placed in the ``code`` query parameter.
        target_date: Date string in ``"YYYY.MM.DD"`` form. It is compared as
            text with the date part of each row's timestamp, so it must be
            zero-padded for the ordering comparison to be meaningful.
        max_page: Last list page to request.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        pandas.DataFrame with the columns "제목", "날짜" and "감성점수". The
        frame is empty (no columns) when no title matched. "감성점수" is
        ``None`` for all rows when the pipeline raised.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    filtered_data = []
    reached_older_post = False

    for page in range(1, max_page + 1):
        url = f"https://finance.naver.com/item/board.naver?code={code}&page={page}"
        res = requests.get(url, headers=headers, timeout=timeout)
        res.encoding = 'euc-kr'

        soup = BeautifulSoup(res.text, 'html.parser')

        for row in soup.select("table.type2 tr"):
            tds = row.find_all("td")
            if len(tds) < 5:
                continue

            title_tag = tds[1].find("a")
            if not title_tag:
                continue

            # Read the date from this row's own span rather than from a
            # page-wide span list matched by position, so a stray span cannot
            # shift dates onto the wrong titles.
            # Assumes the first matching span in a row is the timestamp --
            # TODO confirm against the live markup.
            date_span = row.select_one('span.tah.p10.gray03')
            date_parts = date_span.get_text(strip=True).split() if date_span else []
            if not date_parts:
                continue
            post_date = date_parts[0]

            if post_date == target_date:
                filtered_data.append({
                    "제목": title_tag.get_text(strip=True),
                    "날짜": post_date
                })
            elif post_date < target_date:
                # Older than the target: stop scanning, then score what was found.
                reached_older_post = True
                break

        if reached_older_post:
            break

    df = pd.DataFrame(filtered_data)

    # 3️⃣ Run sentiment analysis on the collected titles.
    if not df.empty:
        print(f"🧠 감성 분석 중... ({len(df)}개 제목)")
        try:
            results = sentiment_pipeline(df["제목"].tolist(), truncation=True)
            # LABEL_0 is mapped to the negative side; other labels stay positive.
            df["감성점수"] = [
                -r["score"] if r["label"] == "LABEL_0" else r["score"]
                for r in results
            ]
        except Exception as e:
            print(f"❗ 감성 분석 오류: {e}")
            df["감성점수"] = None
    else:
        print("❗ 해당 날짜에 게시글이 없습니다.")

    return df


# ✅ 사용 예시
if __name__ == "__main__":
    # Example run: score the board titles of one stock for one date.
    stock_code = "005930"  # e.g. Samsung Electronics
    target_date = "2025.06.21"
    # Scan up to 15 list pages instead of the function's default of 10.
    result_df = get_sentiment_scores_on_date(stock_code, target_date, max_page=15)

    print("\n📊 최종 결과:")
    print(result_df)

📦 감성분석 모델 로딩 중...


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


✅ 감성분석기 로딩 완료

❗ 해당 날짜에 게시글이 없습니다.

📊 최종 결과:
Empty DataFrame
Columns: []
Index: []


In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_posts_with_content(code="196170", page=1, timeout=10):
    """Fetch one board list page and the body text of every post on it.

    Args:
        code: Stock code placed in the ``code`` query parameter.
        page: List page number to request.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        pandas.DataFrame with one row per post and the columns "제목"
        (title), "본문" (text of ``div.view_se``, or the placeholder
        "본문 없음" when that element is missing) and "링크" (post URL).
        The frame is empty (no columns) when no post row was found.

    Raises:
        requests.RequestException: If any HTTP request fails or times out.
    """
    base_url = "https://finance.naver.com"
    headers = {'User-Agent': 'Mozilla/5.0'}
    all_data = []

    list_url = f"{base_url}/item/board.naver?code={code}&page={page}"
    res = requests.get(list_url, headers=headers, timeout=timeout)
    res.encoding = 'euc-kr'  # Force EUC-KR decoding of the response body.
    soup = BeautifulSoup(res.text, 'html.parser')

    for row in soup.select("table.type2 tr"):
        tds = row.find_all("td")
        if len(tds) < 5:
            continue

        a_tag = tds[1].find("a")
        if not a_tag or not a_tag.has_attr('href'):
            continue

        # Prefer the `title` attribute, but fall back to the visible link text
        # so an anchor without that attribute does not raise KeyError.
        title = a_tag.get('title') or a_tag.get_text(strip=True)
        detail_url = base_url + a_tag['href']

        # Request the post page to read its body.
        post_res = requests.get(detail_url, headers=headers, timeout=timeout)
        post_res.encoding = 'euc-kr'
        post_soup = BeautifulSoup(post_res.text, 'html.parser')

        content_div = post_soup.select_one("div.view_se")
        content = content_div.get_text(strip=True) if content_div else "본문 없음"

        all_data.append({
            "제목": title,
            "본문": content,
            "링크": detail_url
        })

        time.sleep(0.2)  # Throttle requests to avoid overloading the server.

    return pd.DataFrame(all_data)

# ✅ 실행 예시
if __name__ == "__main__":
    # Example run: crawl the first board page for stock code 196170 and
    # preview the first rows of the result.
    df = get_posts_with_content("196170", page=1)
    print(df.head())

                              제목  \
0                   40만이하는쓸어담어라    
1                    로봇 보다도 못가면    
2  다른 종목 ...소고기에 와인 잔치 벌리고 있는데..   
3                    폭등해서 ㅡ울지말고    
4       장관 인선 마무리로 허니문도 마무리 단계..   

                                                  본문  \
0                                        순식간에40만이상간다   
1  남들 돈 오지게들 다벌고 바이오 단타로 용돈벌고 라스트 코스피 숏 쳐서 돈벌면 올해...   
2  왕따바이오 대장호구 개미니들은담달 15만원 받아서소주 먹을 생각에 들떠 있다며ㅋㅋㅋ...   
3                                         폭등전에 ㅡ퍼담아라   
4  수익챙기고..소외주바이오 관심 가져야...알텡이만빼고ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ...   

                                                  링크  
0  https://finance.naver.com/item/board_read.nave...  
1  https://finance.naver.com/item/board_read.nave...  
2  https://finance.naver.com/item/board_read.nave...  
3  https://finance.naver.com/item/board_read.nave...  
4  https://finance.naver.com/item/board_read.nave...  


In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# 1️⃣ Load the sentiment analyzer (KcELECTRA checkpoint)
# NOTE(review): the warning recorded in this cell's output reports that the
# `classifier.*` weights are "newly initialized" for this checkpoint, i.e. the
# classification head has not been trained. Confirm the scores against a
# fine-tuned checkpoint before relying on them.
print("📦 감성분석 모델 로딩 중...")
model_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Module-level pipeline object used by analyze_sentiment below.
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print("✅ 감성분석기 로딩 완료\n")

# 2️⃣ 종목토론방 게시글 본문 크롤링 함수
def get_posts_with_content(code="196170", page=1, timeout=10):
    """Fetch one board list page and the body text of every post on it.

    Args:
        code: Stock code placed in the ``code`` query parameter.
        page: List page number to request.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        pandas.DataFrame with one row per post and the columns "제목"
        (title), "본문" (text of ``div.view_se``, or the placeholder
        "본문 없음" when that element is missing) and "링크" (post URL).
        The frame is empty (no columns) when no post row was found.

    Raises:
        requests.RequestException: If any HTTP request fails or times out.
    """
    base_url = "https://finance.naver.com"
    headers = {'User-Agent': 'Mozilla/5.0'}
    all_data = []

    list_url = f"{base_url}/item/board.naver?code={code}&page={page}"
    res = requests.get(list_url, headers=headers, timeout=timeout)
    res.encoding = 'euc-kr'  # Force EUC-KR decoding of the response body.
    soup = BeautifulSoup(res.text, 'html.parser')

    for row in soup.select("table.type2 tr"):
        tds = row.find_all("td")
        if len(tds) < 5:
            continue

        a_tag = tds[1].find("a")
        if not a_tag or not a_tag.has_attr('href'):
            continue

        # Prefer the `title` attribute, but fall back to the visible link text
        # so an anchor without that attribute does not raise KeyError.
        title = a_tag.get('title') or a_tag.get_text(strip=True)
        detail_url = base_url + a_tag['href']

        # Request the post page to read its body.
        post_res = requests.get(detail_url, headers=headers, timeout=timeout)
        post_res.encoding = 'euc-kr'
        post_soup = BeautifulSoup(post_res.text, 'html.parser')

        content_div = post_soup.select_one("div.view_se")
        content = content_div.get_text(strip=True) if content_div else "본문 없음"

        all_data.append({
            "제목": title,
            "본문": content,
            "링크": detail_url
        })

        time.sleep(0.2)  # Throttle requests to reduce server load.

    return pd.DataFrame(all_data)

# 3️⃣ 감성 분석 함수
def analyze_sentiment(text_list, max_length=512):
    """Score every text with the sentiment pipeline and return signed scores.

    Args:
        text_list: List of strings to classify.
        max_length: Token limit passed to the tokenizer together with
            ``truncation=True``, so that one over-long post body is cut
            instead of making the whole call fail.

    Returns:
        A list aligned with ``text_list``. Each entry is the predicted score,
        negated when the predicted label is ``"LABEL_0"``. If the pipeline
        raises, the error is printed and a list of ``None`` of the same
        length is returned. An empty input yields an empty list.
    """
    if len(text_list) == 0:
        return []

    try:
        results = sentiment_pipeline(text_list, truncation=True, max_length=max_length)
        return [
            -r["score"] if r["label"] == "LABEL_0" else r["score"]
            for r in results
        ]
    except Exception as e:
        # Best effort by design: keep the DataFrame column aligned on failure.
        print("❗ 감성 분석 오류:", e)
        return [None] * len(text_list)

# ✅ 실행 예시
if __name__ == "__main__":
    # Example run: crawl the first board page for stock code 196170, score
    # every post body and show the first few titles with their scores.
    df = get_posts_with_content("196170", page=1)

    if df.empty:
        print("❗ 게시글이 없습니다.")
    else:
        print("🧠 본문 감성 분석 중...")
        df["감성점수"] = analyze_sentiment(df["본문"].tolist())
        print(df[["제목", "감성점수"]].head())
        print(f"\n📈 평균 감성 점수: {df['감성점수'].mean():.4f}")

📦 감성분석 모델 로딩 중...


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


✅ 감성분석기 로딩 완료

🧠 본문 감성 분석 중...
                              제목      감성점수
0            ㅡ.38만원 지키려고 안간힘을 쓴다 -0.518336
1                   40만이하는쓸어담어라  -0.532847
2                    로봇 보다도 못가면  -0.516636
3  다른 종목 ...소고기에 와인 잔치 벌리고 있는데.. -0.527990
4                    폭등해서 ㅡ울지말고   0.503164

📈 평균 감성 점수: -0.1604


In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# 1️⃣ Load the sentiment analyzer (KcELECTRA checkpoint)
# NOTE(review): the warning recorded in this cell's output reports that the
# `classifier.*` weights are "newly initialized" for this checkpoint, i.e. the
# classification head has not been trained. Confirm the scores against a
# fine-tuned checkpoint before relying on them.
print("📦 감성분석 모델 로딩 중...")
model_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Module-level pipeline object used by analyze_sentiment_batched below.
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print("✅ 감성분석기 로딩 완료\n")

# 2️⃣ 종목토론방 게시글 본문 크롤링 함수
def get_posts_with_content(code="196170", page=1, timeout=10):
    """Fetch one board list page and the body text of every post on it.

    Args:
        code: Stock code placed in the ``code`` query parameter.
        page: List page number to request.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        pandas.DataFrame with one row per post and the columns "제목"
        (title), "본문" (text of ``div.view_se``, or the placeholder
        "본문 없음" when that element is missing) and "링크" (post URL).
        The frame is empty (no columns) when no post row was found.

    Raises:
        requests.RequestException: If any HTTP request fails or times out.
    """
    base_url = "https://finance.naver.com"
    headers = {'User-Agent': 'Mozilla/5.0'}
    all_data = []

    list_url = f"{base_url}/item/board.naver?code={code}&page={page}"
    res = requests.get(list_url, headers=headers, timeout=timeout)
    res.encoding = 'euc-kr'  # Force EUC-KR decoding of the response body.
    soup = BeautifulSoup(res.text, 'html.parser')

    for row in soup.select("table.type2 tr"):
        tds = row.find_all("td")
        if len(tds) < 5:
            continue

        a_tag = tds[1].find("a")
        if not a_tag or not a_tag.has_attr('href'):
            continue

        # Prefer the `title` attribute, but fall back to the visible link text
        # so an anchor without that attribute does not raise KeyError.
        title = a_tag.get('title') or a_tag.get_text(strip=True)
        detail_url = base_url + a_tag['href']

        # Request the post page to read its body.
        post_res = requests.get(detail_url, headers=headers, timeout=timeout)
        post_res.encoding = 'euc-kr'
        post_soup = BeautifulSoup(post_res.text, 'html.parser')

        content_div = post_soup.select_one("div.view_se")
        content = content_div.get_text(strip=True) if content_div else "본문 없음"

        all_data.append({
            "제목": title,
            "본문": content,
            "링크": detail_url
        })

        time.sleep(0.2)  # Throttle requests to reduce server load.

    return pd.DataFrame(all_data)

# 3️⃣ 감성 분석 함수 (배치 처리)
def analyze_sentiment_batched(text_list, batch_size=16, max_length=512):
    """Score texts in fixed-size batches and return signed sentiment scores.

    Batches are processed independently, so a failure in one batch only
    affects that batch.

    Args:
        text_list: List of strings to classify.
        batch_size: Number of texts passed to the pipeline per call.
        max_length: Token limit passed to the tokenizer together with
            ``truncation=True``, so that an over-long post body is cut
            instead of failing its whole batch.

    Returns:
        A list aligned with ``text_list``. Each entry is the predicted score,
        negated when the predicted label is ``"LABEL_0"``, or ``None`` for
        every text in a batch whose pipeline call raised.
    """
    sentiments = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i+batch_size]
        try:
            results = sentiment_pipeline(batch, truncation=True, max_length=max_length)
            sentiments.extend(
                -r["score"] if r["label"] == "LABEL_0" else r["score"]
                for r in results
            )
        except Exception as e:
            # Best effort by design: keep the output aligned with the input.
            print("❗ 감성 분석 오류:", e)
            sentiments.extend([None] * len(batch))
    return sentiments

# ✅ 실행 예시
if __name__ == "__main__":
    # Example run: crawl the first board page for stock code 196170, score
    # every post body in batches and list each title with its score.
    df = get_posts_with_content("196170", page=1)

    if df.empty:
        print("❗ 게시글이 없습니다.")
    else:
        print(f"🧠 전체 본문 {len(df)}건 감성 분석 중...")
        df["감성점수"] = analyze_sentiment_batched(df["본문"].tolist())
        print(df[["제목", "감성점수"]])
        print(f"\n📈 평균 감성 점수: {df['감성점수'].mean():.4f}")

📦 감성분석 모델 로딩 중...


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


✅ 감성분석기 로딩 완료

🧠 전체 본문 20건 감성 분석 중...
                               제목      감성점수
0             ㅡ.38만원 지키려고 안간힘을 쓴다 -0.510188
1                    40만이하는쓸어담어라   0.529826
2                     로봇 보다도 못가면   0.512113
3   다른 종목 ...소고기에 와인 잔치 벌리고 있는데..  0.512002
4                     폭등해서 ㅡ울지말고  -0.502430
5        장관 인선 마무리로 허니문도 마무리 단계..  0.516859
6            로봇주 2차랠리 시작 됐다네요..^^  0.526363
7                     국가 기간 산업으로  -0.521550
8              빨간불 잠깐만 보여주면 안되겠니~  0.507657
9        황천길 다이렉트 자율주행 허벌라이프  찬티들  0.509163
10                      알테오젠 오르는법  0.519891
11                      알테오젠 오르는법  0.516878
12                 알테 개미 이동 분석 보면  0.536483
13             이득중이지만 내일 더크게 먹기위해  0.521180
14               그리고 이건 소소한 바람이지만 -0.502297
15                 ㅡ.오늘은 로봇주가 잔치다 -0.505962
16                           것 보단  0.512241
17                     반년동안 멈춘 주식  0.501281
18           알텡이만 빼고 사래 두...ㅋㅋㅋㅋㅋ -0.500570
19                             제발  0.535733

📈 평균 감성 점수: 0.2107


In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# 1️⃣ Load the sentiment analyzer (KcELECTRA checkpoint)
# NOTE(review): the warning recorded in this cell's output reports that the
# `classifier.*` weights are "newly initialized" for this checkpoint, i.e. the
# classification head has not been trained. Confirm the scores against a
# fine-tuned checkpoint before relying on them.
print("📦 감성분석 모델 로딩 중...")
model_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Module-level pipeline object used by analyze_sentiment_batched below.
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print("✅ 감성분석기 로딩 완료\n")

# 2️⃣ 종목토론방 게시글 본문 크롤링 함수
def get_posts_with_content(code="196170", page=1, timeout=10):
    """Fetch one board list page and the body text of every post on it.

    Args:
        code: Stock code placed in the ``code`` query parameter.
        page: List page number to request.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        pandas.DataFrame with one row per post and the columns "제목"
        (title), "본문" (text of ``div.view_se``, or the placeholder
        "본문 없음" when that element is missing) and "링크" (post URL).
        The frame is empty (no columns) when no post row was found.

    Raises:
        requests.RequestException: If any HTTP request fails or times out.
    """
    base_url = "https://finance.naver.com"
    headers = {'User-Agent': 'Mozilla/5.0'}
    all_data = []

    list_url = f"{base_url}/item/board.naver?code={code}&page={page}"
    res = requests.get(list_url, headers=headers, timeout=timeout)
    res.encoding = 'euc-kr'  # Force EUC-KR decoding of the response body.
    soup = BeautifulSoup(res.text, 'html.parser')

    for row in soup.select("table.type2 tr"):
        tds = row.find_all("td")
        if len(tds) < 5:
            continue

        a_tag = tds[1].find("a")
        if not a_tag or not a_tag.has_attr('href'):
            continue

        # Prefer the `title` attribute, but fall back to the visible link text
        # so an anchor without that attribute does not raise KeyError.
        title = a_tag.get('title') or a_tag.get_text(strip=True)
        detail_url = base_url + a_tag['href']

        # Request the post page to read its body.
        post_res = requests.get(detail_url, headers=headers, timeout=timeout)
        post_res.encoding = 'euc-kr'
        post_soup = BeautifulSoup(post_res.text, 'html.parser')

        content_div = post_soup.select_one("div.view_se")
        content = content_div.get_text(strip=True) if content_div else "본문 없음"

        all_data.append({
            "제목": title,
            "본문": content,
            "링크": detail_url
        })

        time.sleep(0.2)  # Throttle requests to reduce server load.

    return pd.DataFrame(all_data)

# 3️⃣ 감성 분석 함수 (배치 처리)
def analyze_sentiment_batched(text_list, batch_size=16, max_length=512):
    """Score texts in fixed-size batches and return signed sentiment scores.

    Batches are processed independently, so a failure in one batch only
    affects that batch.

    Args:
        text_list: List of strings to classify.
        batch_size: Number of texts passed to the pipeline per call.
        max_length: Token limit passed to the tokenizer together with
            ``truncation=True``, so that an over-long post body is cut
            instead of failing its whole batch.

    Returns:
        A list aligned with ``text_list``. Each entry is the predicted score,
        negated when the predicted label is ``"LABEL_0"``, or ``None`` for
        every text in a batch whose pipeline call raised.
    """
    sentiments = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i+batch_size]
        try:
            results = sentiment_pipeline(batch, truncation=True, max_length=max_length)
            sentiments.extend(
                -r["score"] if r["label"] == "LABEL_0" else r["score"]
                for r in results
            )
        except Exception as e:
            # Best effort by design: keep the output aligned with the input.
            print("❗ 감성 분석 오류:", e)
            sentiments.extend([None] * len(batch))
    return sentiments

# ✅ 실행 예시
if __name__ == "__main__":
    # Example run: crawl the first board page for stock code 196170, score
    # every post body in batches and preview the bodies with their scores.
    df = get_posts_with_content("196170", page=1)

    if df.empty:
        print("❗ 게시글이 없습니다.")
    else:
        print(f"🧠 전체 '본문' {len(df)}건 감성 분석 중...")
        df["감성점수"] = analyze_sentiment_batched(df["본문"].tolist())
        print(df[["본문", "감성점수"]].head())
        print(f"\n📈 평균 감성 점수: {df['감성점수'].mean():.4f}")

📦 감성분석 모델 로딩 중...


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


✅ 감성분석기 로딩 완료

🧠 전체 '본문' 20건 감성 분석 중...
                                                  본문      감성점수
0                                          레인보우 따라가자  0.547531
1                                                ㅡ.헐  0.536404
2                                        순식간에40만이상간다  0.524589
3  남들 돈 오지게들 다벌고 바이오 단타로 용돈벌고 라스트 코스피 숏 쳐서 돈벌면 올해...  0.522002
4  왕따바이오 대장호구 개미니들은담달 15만원 받아서소주 먹을 생각에 들떠 있다며ㅋㅋㅋ...  0.532115

📈 평균 감성 점수: 0.4772
