1단계: 종목토론방 크롤링 (제목만 수집)

In [4]:
import requests
from bs4 import BeautifulSoup
import time

def crawl_naver_talkroom(stock_code, pages=5):
    """Collect post titles from the Naver Finance discussion board of a stock.

    Args:
        stock_code: Stock code placed in the board URL query (e.g. "005930").
        pages: Number of list pages to fetch, starting at page 1.

    Returns:
        list[str]: Non-empty title texts in page order, excluding entries whose
        text contains "삭제된" (placeholder text of deleted posts).

    Raises:
        requests.RequestException: If a page request fails or times out.
    """
    # The header never changes, so build it once instead of on every page.
    headers = {'User-Agent': 'Mozilla/5.0'}
    comments = []

    for page in range(1, pages + 1):
        url = f"https://finance.naver.com/item/board.naver?code={stock_code}&page={page}"
        # Without a timeout, requests waits indefinitely on a stalled connection.
        res = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')

        for anchor in soup.select('td.title > a'):
            text = anchor.get_text(strip=True)
            if text and "삭제된" not in text:
                comments.append(text)

        time.sleep(0.5)  # pause between list pages to avoid hammering the server
    return comments

# Example: Samsung Electronics (stock code 005930), first 3 board pages.
comments = crawl_naver_talkroom("005930", pages=3)
print(comments[:5])  # show the first five collected titles

['하이닉스는 1년최고가 찍었다', '재드레곤??', '이재명 밑에 수하 협상실력', '■유죄취지파기환송이면  유죄지?', '군은 싸울 필요없다. 무장해제 하나???...']


2단계: 감성 분석 (KNU 감성사전 기반 예시)

In [6]:
%pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.2-cp313-cp313-win_amd64.whl.metadata (5.0 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
   ---------------------------------------- 0.0/19.4 MB ? eta -:--:--
   - -------------------------------------- 0.8/19.4 MB 5.3 MB/s eta 0:00:04
   --- ------------------------------------ 1.8/19.4 MB 4.8 MB/s eta 0:00:04
   ----- ---------------------------------- 2.9/19.4 MB 5.2 MB/s eta 0:00:04
   -------- ------------------------------- 4.2/19.4 MB 5.2 MB/s eta 0:00:03
   ----------- ---------------------------- 5.5/19.4 MB 5.4 MB/s eta 0:00:03
   ------------- -------------------------- 6.6/19.4 MB 5.4 MB/s eta 0:00:03
   -------------- ------------------------- 7.1/19.4 MB 4.9 MB/s eta 0:00:03
   ---------------- ----------------------- 8.1/19.4 MB 4.9 MB/s eta 0:00:03
   ------------------- -------------------- 9.4/19.4 MB 5.0 MB/s

In [9]:
import pandas as pd
from konlpy.tag import Okt

# Load the sentiment lexicon (tab-separated text file).
# NOTE(review): analyze_sentiment below reads the 'word' and 'polarity' columns, so
# this file is assumed to have a header row with those names — confirm.
senti_dict = pd.read_csv("SentiWord_Dict.txt", sep='\t')
# Morphological analyzer from konlpy. The recorded output of this cell is a
# JVMNotFoundException (no jvm.dll / JAVA_HOME), so a Java runtime must be set up first.
okt = Okt()

def analyze_sentiment(text, senti_df):
    """Classify a text as positive, negative or neutral using a sentiment lexicon.

    The text is split into morphemes with the module-level `okt` analyzer. Every
    morpheme found in the lexicon casts one vote according to the sign of its
    polarity; a polarity of 0 casts no vote.

    Args:
        text: Text to classify.
        senti_df: DataFrame with a 'word' column and a 'polarity' column whose
            values can be converted with int().

    Returns:
        int: 1 when positive votes outnumber negative votes, -1 for the
        opposite, 0 on a tie (including when no morpheme is in the lexicon).
    """
    words = okt.morphs(text)

    # Scan the lexicon once for all morphemes instead of once per morpheme.
    matched = senti_df[senti_df['word'].isin(words)]
    polarity_of = {}
    for word, polarity in zip(matched['word'], matched['polarity']):
        # If a word occurs more than once in the lexicon, its first entry wins.
        polarity_of.setdefault(word, polarity)

    pos, neg = 0, 0
    for w in words:
        if w not in polarity_of:
            continue
        pol = int(polarity_of[w])
        if pol > 0:
            pos += 1
        elif pol < 0:
            neg += 1

    if pos > neg:
        return 1  # positive
    elif pos < neg:
        return -1  # negative
    else:
        return 0  # neutral

# Score every crawled title: 1 (positive), 0 (neutral) or -1 (negative).
results = [analyze_sentiment(text, senti_dict) for text in comments]

JVMNotFoundException: No JVM shared library file (jvm.dll) found. Try setting up the JAVA_HOME environment variable properly.

3단계: 감성 분석 결과 시각화

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many texts fell into each sentiment class (-1 / 0 / 1).
labels = ['Negative', 'Neutral', 'Positive']
values = [results.count(code) for code in (-1, 0, 1)]

# NOTE(review): the title and y-label are Korean; confirm the active matplotlib
# font provides Hangul glyphs, otherwise they will not render correctly.
fig, ax = plt.subplots()
ax.bar(labels, values, color=['red', 'gray', 'green'])
ax.set_title("감성 분석 결과 (종목토론방)")
ax.set_ylabel("댓글 수")
plt.show()

In [2]:
%pip install transformers

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp313-cp313-win_amd64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.30.0->transformers)
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.52.4-py3-n

In [12]:
%pip install torch

Collecting torch
  Downloading torch-2.7.1-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.7.1-cp313-cp313-win_amd64.whl (216.1 MB)
   ---------------------------------------- 0.0/216.1 MB ? eta -:--:--
   ---------------------------------------- 0.8/216.1 MB 4.6 MB/s eta 0:00:47
   ---------------------------------------- 1.6/216.1 MB 4.2 MB/s eta 0:00:52
    --------------------------------------- 2.9/216.1 MB 4.8 MB/s eta 0:00:45
    --------------------------------------- 3.9/216.1 MB 4.9 MB/s eta 0:00:44
    --------------------------------------- 4.7/216.1 MB 4.8 MB/s eta 0:00:45
   - -------------------------------------- 5.5/216.1 MB 4.4 MB/s eta 0:00:48

In [12]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
### 0. Sentiment analyzer setup (KcELECTRA checkpoint wrapped in a transformers pipeline)
model_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the load log recorded for this cell reports that the classifier.*
# weights were "newly initialized" and that the model should be trained on a
# down-stream task, so the labels/scores it produces come from an untrained
# classification head. Use a checkpoint fine-tuned for sentiment before trusting them.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
### 1. Crawl the discussion board for posts written on a given date
def _parse_post_date(cells):
    """Return the date from the first cell whose text is 'YYYY.MM.DD HH:MM', else None."""
    for cell in cells:
        try:
            return datetime.strptime(cell.text.strip(), "%Y.%m.%d %H:%M").date()
        except ValueError:
            continue
    return None


def crawl_board_by_date(stock_code, target_date_str, max_try=100):
    """Collect the body text of board posts written on one calendar date.

    List pages are read from page 1 onwards. Paging stops at the first post
    older than the target date, at a page without any dated post row, or after
    `max_try` pages.
    NOTE(review): stopping at the first older post assumes the board lists
    posts newest-first — confirm.

    Args:
        stock_code: Stock code placed in the board URL query.
        target_date_str: Target date formatted as "YYYY-MM-DD".
        max_try: Maximum number of list pages to request.

    Returns:
        pd.DataFrame: One row per collected post with columns "날짜"
        (datetime.date) and "본문" (post body text); empty if nothing matched.

    Raises:
        ValueError: If `target_date_str` is not in "YYYY-MM-DD" format.
        requests.RequestException: If a list-page request fails or times out.
    """
    # Same list endpoint as the other crawlers in this notebook.
    base_url = f"https://finance.naver.com/item/board.naver?code={stock_code}"
    headers = {"User-Agent": "Mozilla/5.0"}
    target_date = datetime.strptime(target_date_str, "%Y-%m-%d").date()
    all_posts = []

    for page in range(1, max_try + 1):
        res = requests.get(f"{base_url}&page={page}", headers=headers, timeout=10)
        res.encoding = "euc-kr"
        soup = BeautifulSoup(res.text, "html.parser")

        dated_rows = 0
        reached_older_post = False
        for row in soup.select("table.type2 tr"):
            cols = row.find_all("td")
            if len(cols) < 5:
                continue

            # The post link is in the second cell; the date is located by its
            # text format rather than by a fixed column index.
            link = cols[1].find("a")
            post_date = _parse_post_date(cols)
            if post_date is None or link is None or not link.has_attr("href"):
                continue
            dated_rows += 1

            if post_date < target_date:
                reached_older_post = True
                break
            if post_date > target_date:
                # Newer than the target: keep scanning towards older posts.
                continue

            detail_url = "https://finance.naver.com" + link["href"]
            try:
                post_res = requests.get(detail_url, headers=headers, timeout=10)
            except requests.RequestException:
                # Best effort: one unreachable post should not abort the crawl.
                continue
            post_res.encoding = "euc-kr"
            body = BeautifulSoup(post_res.text, "html.parser").select_one("div.view_se")
            if body is not None:
                all_posts.append({"날짜": post_date, "본문": body.get_text(strip=True)})
            time.sleep(0.1)

        # A page holding only newer posts must not end the crawl; keep paging
        # until an older post appears or the page has no dated rows at all.
        if reached_older_post or dated_rows == 0:
            break
        time.sleep(0.2)

    return pd.DataFrame(all_posts, columns=["날짜", "본문"])
### 2. Sentiment scoring
def analyze_sentiment(text_list):
    """Return one signed confidence score per input text.

    Each text is classified by the module-level `sentiment_pipeline`. The
    pipeline's score is negated when the predicted label is "LABEL_0" and kept
    as-is for any other label.
    NOTE(review): treating "LABEL_0" as the negative class is an assumption
    about the model's label order — confirm against the checkpoint in use.

    Args:
        text_list: List of texts to score.

    Returns:
        list[float]: Signed scores in the same order as `text_list`.
    """
    # truncation=True lets the tokenizer clip texts that exceed the model's
    # maximum input length; full post bodies can be arbitrarily long.
    results = sentiment_pipeline(text_list, truncation=True)
    return [-r["score"] if r["label"] == "LABEL_0" else r["score"] for r in results]
### 3. Full pipeline: average sentiment score for each date in a list
def analyze_multiple_dates_sentiment(stock_code, date_list):
    """Crawl and score board posts for several dates.

    For every date string the posts are fetched with crawl_board_by_date and
    scored with analyze_sentiment; the mean score of the day is recorded.
    Dates without posts are reported and skipped.

    Args:
        stock_code: Stock code passed on to crawl_board_by_date.
        date_list: Iterable of date strings passed on to crawl_board_by_date.

    Returns:
        pd.DataFrame: One row per date that had posts, with columns "날짜"
        and "평균감성점수".
    """
    daily_rows = []
    for day in date_list:
        print(f":달력: {day} 수집 중...")
        posts = crawl_board_by_date(stock_code, day)
        if not posts.empty:
            print(f":뇌: 감성 분석 중... ({len(posts)}건)")
            posts["감성점수"] = analyze_sentiment(posts["본문"].tolist())
            daily_rows.append({"날짜": day, "평균감성점수": posts["감성점수"].mean()})
            time.sleep(0.5)  # pause before crawling the next date
        else:
            print(f":느낌표: {day} 데이터 없음")
    return pd.DataFrame(daily_rows)



Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


In [13]:
# Example run: three dates for stock code 005930.
# NOTE(review): the recorded output of this cell is "데이터 없음" for every date and
# an empty DataFrame, i.e. no posts were collected for these dates.
dates = ["2024-06-19", "2024-06-20", "2024-06-21"]
result = analyze_multiple_dates_sentiment("005930", dates)
print(result)

:달력: 2024-06-19 수집 중...
:느낌표: 2024-06-19 데이터 없음
:달력: 2024-06-20 수집 중...
:느낌표: 2024-06-20 데이터 없음
:달력: 2024-06-21 수집 중...
:느낌표: 2024-06-21 데이터 없음
Empty DataFrame
Columns: []
Index: []


In [11]:
import requests
from bs4 import BeautifulSoup
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd

### Sentiment analyzer setup (KcELECTRA checkpoint wrapped in a transformers pipeline)
model_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the load log recorded for this cell reports that the classifier.*
# weights were "newly initialized" and that the model should be trained on a
# down-stream task, so the scores printed below come from an untrained
# classification head. Use a checkpoint fine-tuned for sentiment before trusting them.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

### 1. Crawl Naver discussion-board post titles (for a given number of pages)
def crawl_naver_talkroom(stock_code, pages=5):
    """Collect post titles from the Naver Finance discussion board of a stock.

    Args:
        stock_code: Stock code placed in the board URL query (e.g. "005930").
        pages: Number of list pages to fetch, starting at page 1.

    Returns:
        list[str]: Non-empty title texts in page order, excluding entries whose
        text contains "삭제된" (placeholder text of deleted posts).

    Raises:
        requests.RequestException: If a page request fails or times out.
    """
    # The header never changes, so build it once instead of on every page.
    headers = {'User-Agent': 'Mozilla/5.0'}
    comments = []
    for page in range(1, pages + 1):
        url = f"https://finance.naver.com/item/board.naver?code={stock_code}&page={page}"
        # Without a timeout, requests waits indefinitely on a stalled connection.
        res = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')

        for anchor in soup.select('td.title > a'):
            text = anchor.get_text(strip=True)
            if text and "삭제된" not in text:
                comments.append(text)
        time.sleep(0.5)  # pause between list pages to avoid hammering the server
    return comments

### 2. Sentiment scoring
def analyze_sentiment(text_list):
    """Return one signed confidence score per input text.

    Each text is classified by the module-level `sentiment_pipeline`. The
    pipeline's score is negated when the predicted label is "LABEL_0" and kept
    as-is for any other label.
    NOTE(review): treating "LABEL_0" as the negative class is an assumption
    about the model's label order — confirm against the checkpoint in use.

    Args:
        text_list: List of texts to score.

    Returns:
        list[float]: Signed scores in the same order as `text_list`.
    """
    # truncation=True lets the tokenizer clip texts that exceed the model's
    # maximum input length instead of passing over-long sequences to the model.
    results = sentiment_pipeline(text_list, truncation=True)
    return [-r["score"] if r["label"] == "LABEL_0" else r["score"] for r in results]

### 3. Combined pipeline: crawl titles, then score them
def crawl_and_analyze(stock_code, pages=5):
    """Crawl board titles for a stock and attach a sentiment score to each.

    Progress messages and the head of the resulting frame are printed.

    Args:
        stock_code: Stock code passed on to crawl_naver_talkroom.
        pages: Number of board list pages to crawl.

    Returns:
        pd.DataFrame: Columns "본문" (title text) and "감성점수" (signed score);
        an empty DataFrame without columns when nothing was crawled.
    """
    print(f"🕵️‍♂️ {stock_code} 토론방 글 크롤링 중... 페이지: {pages}")
    titles = crawl_naver_talkroom(stock_code, pages)
    if titles:
        print(f"🧠 감성 분석 중... 총 {len(titles)}건")
        scored = pd.DataFrame({"본문": titles, "감성점수": analyze_sentiment(titles)})
        print(scored.head())
        return scored
    # Nothing was crawled: report it and hand back an empty frame.
    print("⚠️ 크롤링된 글이 없습니다.")
    return pd.DataFrame()

### Example run: stock code 005930, first 7 board pages
df_result = crawl_and_analyze("005930", pages=7)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


🕵️‍♂️ 005930 토론방 글 크롤링 중... 페이지: 7
🧠 감성 분석 중... 총 130건
                     본문      감성점수
0       이재명 대통령 정말 대단함.  0.505966
1  윤, 차기 재집권 가능성 90%이상. -0.518183
2       삼전 코스피 시가 총액 1위 -0.517113
3                   모건이 -0.540485
4                   이주식  0.501423


In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_posts_with_content(code="196170", page=1):
    """Fetch one board list page and the body text of every post on it.

    Args:
        code: Stock code placed in the board URL query.
        page: List page number to fetch.

    Returns:
        pd.DataFrame: One row per post with columns "제목" (title), "본문"
        (body text, or "본문 없음" when the body element is missing) and "링크"
        (absolute URL of the post); empty if the page has no post rows.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    base_url = "https://finance.naver.com"
    headers = {'User-Agent': 'Mozilla/5.0'}
    all_data = []

    list_url = f"{base_url}/item/board.naver?code={code}&page={page}"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    res = requests.get(list_url, headers=headers, timeout=10)
    res.encoding = 'euc-kr'
    soup = BeautifulSoup(res.text, 'html.parser')

    for row in soup.select("table.type2 tr"):
        tds = row.find_all("td")
        if len(tds) < 5:
            # Rows with fewer than five cells are not treated as post rows.
            continue

        a_tag = tds[1].find("a")
        if not a_tag or not a_tag.has_attr('href'):
            continue

        # Only href is guaranteed by the guard above; fall back to the anchor
        # text when the title attribute is absent instead of raising KeyError.
        if a_tag.has_attr('title'):
            title = a_tag['title']
        else:
            title = a_tag.get_text(strip=True)
        detail_url = base_url + a_tag['href']

        # Request the post detail page to read its body.
        post_res = requests.get(detail_url, headers=headers, timeout=10)
        post_res.encoding = 'euc-kr'
        post_soup = BeautifulSoup(post_res.text, 'html.parser')

        content_div = post_soup.select_one("div.view_se")
        content = content_div.get_text(strip=True) if content_div else "본문 없음"

        all_data.append({
            "제목": title,
            "본문": content,
            "링크": detail_url
        })

        time.sleep(0.2)  # avoid overloading the server

    return pd.DataFrame(all_data, columns=["제목", "본문", "링크"])

# Example run: first board page for stock code 196170
# (the recorded output below shows this guarded block executed in the notebook).
if __name__ == "__main__":
    df = get_posts_with_content("196170", page=1)
    print(df.head())

                              제목  \
0            ㅡ.38만원 지키려고 안간힘을 쓴다   
1                   40만이하는쓸어담어라    
2                    로봇 보다도 못가면    
3  다른 종목 ...소고기에 와인 잔치 벌리고 있는데..   
4                    폭등해서 ㅡ울지말고    

                                                  본문  \
0                                                ㅡ.헐   
1                                        순식간에40만이상간다   
2  남들 돈 오지게들 다벌고 바이오 단타로 용돈벌고 라스트 코스피 숏 쳐서 돈벌면 올해...   
3  왕따바이오 대장호구 개미니들은담달 15만원 받아서소주 먹을 생각에 들떠 있다며ㅋㅋㅋ...   
4                                         폭등전에 ㅡ퍼담아라   

                                                  링크  
0  https://finance.naver.com/item/board_read.nave...  
1  https://finance.naver.com/item/board_read.nave...  
2  https://finance.naver.com/item/board_read.nave...  
3  https://finance.naver.com/item/board_read.nave...  
4  https://finance.naver.com/item/board_read.nave...  


In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Sentiment analyzer setup (KcELECTRA checkpoint wrapped in a transformers pipeline)
model_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the load log recorded for this cell reports that the classifier.*
# weights were "newly initialized" and that the model should be trained on a
# down-stream task, so the daily averages printed below come from an untrained
# classification head. Use a checkpoint fine-tuned for sentiment before trusting them.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Build the list of dates between two endpoints
def generate_date_range(start_date: str, end_date: str) -> list:
    """List every calendar day from start_date to end_date, both inclusive.

    Args:
        start_date: First day, formatted as "YYYY.MM.DD".
        end_date: Last day, formatted as "YYYY.MM.DD".

    Returns:
        Date strings in "YYYY.MM.DD" format in ascending order; an empty list
        when end_date is earlier than start_date.
    """
    fmt = "%Y.%m.%d"
    cursor = datetime.strptime(start_date, fmt)
    last_day = datetime.strptime(end_date, fmt)

    days = []
    while cursor <= last_day:
        days.append(cursor.strftime(fmt))
        cursor += timedelta(days=1)
    return days

# Sentiment scoring
def analyze_sentiment(text_list):
    """Return one signed confidence score per input text.

    Texts are classified by the module-level `sentiment_pipeline` with
    truncation enabled. A prediction labelled "LABEL_0" contributes its score
    negated; any other label contributes the score unchanged.

    Args:
        text_list: Texts to score.

    Returns:
        list[float]: Signed scores in the same order as the input.
    """
    predictions = sentiment_pipeline(text_list, truncation=True)
    return [
        -pred["score"] if pred["label"] == "LABEL_0" else pred["score"]
        for pred in predictions
    ]

# Crawl posts within a date range and score their sentiment
def get_sentiment_by_date(code, start_date, end_date, max_page):
    """Crawl board posts dated within a range and average their sentiment per day.

    Board list pages 1..max_page are scanned. For every post whose date falls
    within [start_date, end_date] the body is downloaded and scored with
    analyze_sentiment.

    Args:
        code: Stock code placed in the board URL query.
        start_date: First day of the range, formatted as "YYYY.MM.DD".
        end_date: Last day of the range, formatted as "YYYY.MM.DD".
        max_page: Number of list pages to scan.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: `(posts, summary)`. `posts` has the
        columns "날짜", "제목", "본문", "링크", "감성점수"; `summary` has "날짜" and
        "평균감성점수". Both frames are empty when no post matched, so the result
        can always be unpacked into two values.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    # A set gives constant-time membership tests for every scanned row.
    target_dates = set(generate_date_range(start_date, end_date))
    base_url = "https://finance.naver.com"
    headers = {'User-Agent': 'Mozilla/5.0'}
    filtered_data = []

    for page in range(1, max_page + 1):
        print(f"📄 Page {page} 크롤링 중...")
        list_url = f"{base_url}/item/board.naver?code={code}&page={page}"
        res = requests.get(list_url, headers=headers, timeout=10)
        res.encoding = 'euc-kr'
        soup = BeautifulSoup(res.text, 'html.parser')

        for row in soup.select("table.type2 tr"):
            tds = row.find_all("td")
            if len(tds) < 5:
                continue

            a_tag = tds[1].find("a")
            if not a_tag or not a_tag.has_attr('href'):
                continue

            # Read the date from this row's own first date-styled span, so a
            # skipped row can never shift dates onto the wrong post.
            date_span = row.select_one("span.tah.p10.gray03")
            date_parts = date_span.get_text(strip=True).split() if date_span else []
            if not date_parts:
                continue
            post_date = date_parts[0]  # drop the time-of-day part
            if post_date not in target_dates:
                continue

            # Only href is guaranteed by the guard above; fall back to the
            # anchor text when the title attribute is absent.
            if a_tag.has_attr('title'):
                title = a_tag['title']
            else:
                title = a_tag.get_text(strip=True)
            detail_url = base_url + a_tag['href']

            # Download the post body.
            post_res = requests.get(detail_url, headers=headers, timeout=10)
            post_res.encoding = 'euc-kr'
            post_soup = BeautifulSoup(post_res.text, 'html.parser')
            content_div = post_soup.select_one("div.view_se")
            # NOTE(review): the "본문 없음" placeholder is scored like real text
            # further down — consider excluding such rows from the average.
            content = content_div.get_text(strip=True) if content_div else "본문 없음"

            filtered_data.append({
                "날짜": post_date,
                "제목": title,
                "본문": content,
                "링크": detail_url
            })

            time.sleep(0.2)  # pause between detail-page requests

    if not filtered_data:
        print("⚠️ 조건에 맞는 글이 없습니다.")
        # Keep the two-value return shape so callers can always unpack it.
        empty_posts = pd.DataFrame(columns=["날짜", "제목", "본문", "링크", "감성점수"])
        empty_summary = pd.DataFrame(columns=["날짜", "평균감성점수"])
        return empty_posts, empty_summary

    df = pd.DataFrame(filtered_data)

    # Score every collected post body.
    print(f"🧠 감성 분석 중... 총 {len(df)}건")
    df["감성점수"] = analyze_sentiment(df["본문"].tolist())

    # Average the scores per day.
    summary = df.groupby("날짜")["감성점수"].mean().reset_index().rename(columns={"감성점수": "평균감성점수"})
    print("✅ 날짜별 평균 감성점수")
    print(summary)

    return df, summary

# Example run: stock code 196170, posts dated 2025.06.20 to 2025.06.23, first 10 list pages
if __name__ == "__main__":
    df_raw, df_summary = get_sentiment_by_date("196170", start_date="2025.06.20", end_date="2025.06.23", max_page=10)


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


📄 Page 1 크롤링 중...
📄 Page 2 크롤링 중...
📄 Page 3 크롤링 중...
📄 Page 4 크롤링 중...
📄 Page 5 크롤링 중...
📄 Page 6 크롤링 중...
📄 Page 7 크롤링 중...
📄 Page 8 크롤링 중...
📄 Page 9 크롤링 중...
📄 Page 10 크롤링 중...
🧠 감성 분석 중... 총 190건
✅ 날짜별 평균 감성점수
           날짜    평균감성점수
0  2025.06.20  0.084335
1  2025.06.21  0.028582
2  2025.06.22 -0.001844
3  2025.06.23  0.103702
