In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# 1. 파일 불러오기
tech_df = pd.read_csv("1. 조선_기술지표.csv")
news_df = pd.read_csv("6. 뉴스_통합피처.csv")

# 2. 날짜 변환
tech_df['날짜'] = pd.to_datetime(tech_df['날짜'])
news_df['날짜'] = pd.to_datetime(news_df['날짜'])

# 3. 병합
df = pd.merge(tech_df, news_df, on=["날짜", "기업명"], how="left").fillna(0)

# 4. 타깃 생성
df = df[(df["1일후등락률"] > 4) | (df["1일후등락률"] < -1)].copy()
df["target"] = (df["1일후등락률"] > 4).astype(int)

# 5. 필요한 피처만 선택
selected_features = [
    "MA5", "MA10", "종가", "BB_upper", "부정", "tfidf_수주", "중립", "긍정",
    "거래량", "MACD", "RSI", "tfidf_미국", "tfidf_실적", "감성_부정비율", "감성_긍정비율"
]
X = df[selected_features]
y = df["target"]

# 6. 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 7. 훈련/테스트 분할
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, shuffle=False)

# 8. SMOTE로 클래스 불균형 보정
smote = SMOTE(random_state=42, k_neighbors=3)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# 9. 선형 SVM 모델 학습
model = SVC(kernel="linear", random_state=42)
model.fit(X_train_sm, y_train_sm)

# 10. 예측 및 평가
y_pred = model.predict(X_test)

print("\n✅ 혼동 행렬:")
print(confusion_matrix(y_test, y_pred))

print("\n✅ 분류 리포트:")
print(classification_report(y_test, y_pred))



✅ 혼동 행렬:
[[188  46]
 [ 59  16]]

✅ 분류 리포트:
              precision    recall  f1-score   support

           0       0.76      0.80      0.78       234
           1       0.26      0.21      0.23        75

    accuracy                           0.66       309
   macro avg       0.51      0.51      0.51       309
weighted avg       0.64      0.66      0.65       309

