In [22]:
import yfinance as yf
import pandas as pd
import json
from pymongo import MongoClient
from dotenv import load_dotenv
import os
from datetime import datetime

# Load environment variables (python-dotenv) and read the MongoDB password.
load_dotenv()
db_password = os.getenv("MONGO_PASSWORD")
if not db_password:
    # Without this guard a missing variable would be formatted into the URI as
    # the literal text "None" and only fail later as an authentication error.
    raise RuntimeError(
        "MONGO_PASSWORD is not set; define it in the environment or in a .env file."
    )
# NOTE(review): the password is interpolated verbatim. PyMongo expects URI
# credentials to be percent-escaped, so confirm the stored value is URL-safe.
mongo_url = f"mongodb+srv://skkucapstone:{db_password}@stock.iz5b97b.mongodb.net/?retryWrites=true&w=majority&appName=XXX"

In [9]:
# Connect to MongoDB and pick the collection to read from.
client = MongoClient(mongo_url)
db = client["stock"]
collection = db["nasdaq_top50_2022_to_2024"]

# Group the documents by company name.
data_by_company = {}
cursor = collection.find({})

for doc in cursor:
    name = doc.get("Name")
    # Keep only documents that carry both a non-empty "Name" and a "Date".
    if name and doc.get("Date"):
        data_by_company.setdefault(name, []).append(doc)

In [10]:
# Top 50 NASDAQ tickers by market capitalization (per the original author's note).
nasdaq_tickers = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "NVDA",
    "META", "AVGO", "TSLA", "PEP", "COST",
    "ADBE", "CSCO", "TMUS", "TXN", "INTC",
    "QCOM", "AMGN", "AMD", "INTU", "HON",
    "ISRG", "SBUX", "BKNG", "VRTX", "ADI",
    "MU", "REGN", "KLAC", "LRCX", "MAR",
    "CTAS", "GILD", "ADP", "PDD", "MNST",
    "IDXX", "FTNT", "WBD", "EXC", "EA",
    "ROST", "MELI", "CDNS", "PAYX", "BIIB",
    "ODFL", "MRNA", "KDP", "PCAR", "DLTR",
]

# 일간 데이터

In [23]:
# ✨ 1. Daily data (10-day windows, no overlap)
# For every company, docs[i:i + 10] is the context and docs[i + 10] is the
# prediction target; the window start then advances by 10 documents.
required_fields = ("Open", "High", "Low", "Close", "Volume", "Change")
daily_records = []  # one serialized JSON line per sample, joined once at the end

for name, docs in data_by_company.items():
    # Sort in place so that windows follow the "Date" order.
    docs.sort(key=lambda x: x["Date"])

    for i in range(0, len(docs) - 10, 10):
        try:
            context_lines = []
            for d in docs[i:i + 10]:
                # A single missing field invalidates the whole window.
                if any(d.get(field) is None for field in required_fields):
                    raise ValueError("데이터 누락")
                context_lines.append(
                    f"{d['Date']}, {d['Open']}, {d['High']}, {d['Low']}, "
                    f"{d['Close']}, {d['Volume']}, {round(d['Change'], 6)}"
                )

            target_docs = docs[i + 10]
            if target_docs.get("Change") is None:
                # No label available for this window: skip it without logging.
                continue

            output_label = round(target_docs["Change"], 6)
            target_date = datetime.strptime(target_docs['Date'], "%Y-%m-%d").date()

            # Built from explicit pieces so that no source-code indentation ends
            # up inside the prompt; the layout matches the weekly/monthly
            # generator's prompt.
            instruction = (
                f"Using the context below, estimate the rate of change in the closing price of {name} on {target_date}.\n"
                "Return the expected value of change as a decimal.\n"
                "\n"
                "Context: date, open, high, low, close, volume, change.\n"
                f"{chr(10).join(context_lines)}\n"
                "Answer:"
            )

            json_obj = {
                "instruction": instruction,
                "output": output_label
            }

            daily_records.append(json.dumps(json_obj, ensure_ascii=False) + "\n")

        except Exception as e:
            # Best effort: report the failing window and move on to the next one.
            print(f"❗ {name} 일간 처리 중 오류: {e}")

daily_jsonl_str = "".join(daily_records)

with open("nasdaq_10_days_reg_output.jsonl", "w", encoding="utf-8") as f:
    f.write(daily_jsonl_str)
print("✅ NASDAQ 일간 JSONL 생성 완료!")



✅ NASDAQ 일간 JSONL 생성 완료!


# 주간 & 월간 데이터

In [30]:
# 런타임 초기화로 인한 코드 재실행
from datetime import datetime
import json
import pandas as pd

# Return-rate helper (used for regression targets, not class labels).
def compute_change(prev_close, next_close):
    """Return the fractional change from ``prev_close`` to ``next_close``.

    Computed as ``(next_close - prev_close) / prev_close``. A zero
    ``prev_close`` is not guarded against.
    """
    return (next_close - prev_close) / prev_close


def _month_end_rule():
    """Return the month-end resample alias accepted by the installed pandas."""
    from pandas.tseries.frequencies import to_offset

    # 'M' is deprecated in favour of 'ME' in newer pandas releases; older
    # releases reject 'ME', in which case the legacy alias is used.
    try:
        to_offset('ME')
    except ValueError:
        return 'M'
    return 'ME'


# Build weekly / monthly resampled JSONL samples.
def generate_jsonl_from_resampled(data_by_company, mode='weekly'):
    """Build regression samples from weekly or monthly resampled price data.

    For each company the documents are resampled (first Open, max High,
    min Low, last Close, summed Volume). Every run of 10 consecutive periods
    becomes the context of one sample whose label is the rate of change of
    the following period's close relative to the last context close.

    Each context row's ``change`` value is that period's own change versus the
    previous period, so the label never appears inside the context. The first
    resampled period has no previous close and is dropped.

    Args:
        data_by_company: Mapping of company name to a list of documents with
            ``Date``, ``Open``, ``High``, ``Low``, ``Close`` and ``Volume``
            entries. Each list is sorted in place by ``Date``.
        mode: ``'weekly'`` resamples to weeks ending on Friday; any other
            value resamples to month end.

    Returns:
        A string of newline-terminated JSON objects, each with the keys
        ``instruction`` and ``output``. Companies whose processing raises are
        reported with ``print`` and skipped.
    """
    resample_rule = 'W-FRI' if mode == 'weekly' else _month_end_rule()
    records = []

    for name, docs in data_by_company.items():
        try:
            docs.sort(key=lambda x: x["Date"])
            df = pd.DataFrame(docs)
            df['Date'] = pd.to_datetime(df['Date'])
            df = df.set_index('Date')

            # Periods without any source rows produce NaN and are dropped.
            df_resampled = df.resample(resample_rule).agg({
                'Open': 'first',
                'High': 'max',
                'Low': 'min',
                'Close': 'last',
                'Volume': 'sum'
            }).dropna()

            # Change of each period relative to the previous one. It is NOT
            # shifted forward: a forward shift would put the label itself into
            # the last context row.
            df_resampled['Change'] = df_resampled['Close'].pct_change()
            df_resampled = df_resampled.dropna(subset=['Change']).reset_index()

            # Format every row once; windows below only slice this list.
            rows = [df_resampled.iloc[k] for k in range(len(df_resampled))]
            row_lines = [
                f"{row['Date'].strftime('%Y-%m-%d')}, {row['Open']}, {row['High']}, {row['Low']}, {row['Close']}, {row['Volume']}, {round(row['Change'], 6)}"
                for row in rows
            ]

            for i in range(len(rows) - 10):
                context_lines = row_lines[i:i + 10]
                target_row = rows[i + 10]

                # Label: close of period i+10 relative to the close of period i+9.
                prev_close = rows[i + 9]['Close']
                next_close = target_row['Close']
                output_label = round(compute_change(prev_close, next_close), 6)

                instruction = f"""Using the context below, estimate the rate of change in the closing price of {name} on {target_row['Date'].date()}.
Return the expected value of change as a decimal.

Context: date, open, high, low, close, volume, change.
{chr(10).join(context_lines)}
Answer:"""

                json_obj = {
                    "instruction": instruction,
                    "output": output_label
                }

                records.append(json.dumps(json_obj, ensure_ascii=False) + "\n")

        except Exception as e:
            # Best effort per company: report the failure and continue.
            print(f"❗ {name} {mode} 전체 처리 중 오류: {e}")
            continue

    return "".join(records)

In [31]:


# Warning (original author's note): this step produces empty results because
# there is no actual data.

# Output locations.
weekly_path = "nasdaq_10_weeks_reg_output.jsonl"
monthly_path = "nasdaq_10_months_reg_output.jsonl"

# Generate both datasets before anything is written to disk.
weekly_jsonl_str = generate_jsonl_from_resampled(data_by_company, mode='weekly')
monthly_jsonl_str = generate_jsonl_from_resampled(data_by_company, mode='monthly')

# Save: weekly first, then monthly, announcing each file once it is written.
for out_path, payload, done_message in (
    (weekly_path, weekly_jsonl_str, "✅ NASDAQ 주간 JSONL 생성 완료!"),
    (monthly_path, monthly_jsonl_str, "✅ NASDAQ 월간 JSONL 생성 완료!"),
):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.write(payload)
        print(done_message)

