In [66]:
import pandas as pd
import numpy as np
import datetime

import warnings
warnings.filterwarnings("ignore")

## 1. 處理股價
導入微股力交易數據

In [67]:
stock = pd.read_csv('./bda2024_mid_dataset/bda2024_微股力_個股交易數據-2年.csv')

選0050為此次預測標的，用to_datetime調整時間顯示方式

In [68]:
yuanta = stock[stock['stock_symbol'] == '0050']
yuanta['date'] = pd.to_datetime(yuanta['date']).dt.date
yuanta = yuanta.sort_values(by = 'date').reset_index(drop = True)

先整理表格內容，增加兩個欄位分別為：
1.  five_day_change_pct：表示今日與前五日收盤價的價格變化（調整window來改變時間差）
2.  label：表示今日與五日前收盤價的漲跌標籤，1代表漲，-1代表跌，0代表持平

In [69]:
yuanta = yuanta[['stock_symbol', 'date', 'close']]
yuanta['five_day_change_pct'] = yuanta['close'].rolling(window=5).apply(lambda x: (x.iloc[1]-x.iloc[0])/x.iloc[0]*100)
yuanta['label'] = yuanta['five_day_change_pct'].apply(lambda x: '1' if x > 2 else ('-1' if x < -2 else '0'))

In [70]:
yuanta

Unnamed: 0,stock_symbol,date,close,five_day_change_pct,label
0,0050,2022-03-01,140.45,,0
1,0050,2022-03-02,139.65,,0
2,0050,2022-03-03,139.95,,0
3,0050,2022-03-04,138.45,,0
4,0050,2022-03-07,134.00,-0.569598,0
...,...,...,...,...,...
480,0050,2024-02-20,141.65,4.707613,1
481,0050,2024-02-21,141.20,-0.737619,0
482,0050,2024-02-22,142.80,-0.141543,0
483,0050,2024-02-23,143.75,0.389794,0


在 2022-2024 年 的 485 個交易日中，漲跌變化的個數

In [71]:
yuanta['label'].value_counts()

0     442
1      22
-1     21
Name: label, dtype: int64

## 2. 處理文章資料

先讀取新聞及論壇資料

In [72]:
def load_df(filepath, preview=True):
    print(f"\n----- Loading {filepath}... -----")
    df = pd.read_csv(filepath, encoding='utf-8')
    print(f"Size of dataframe: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    if preview:
        print(df.head())
    return df

In [73]:
news1_df = load_df("./bda2024_mid_dataset/bda2024_202203-202402_內容數據_新聞1.csv", preview=False)
news2_df = load_df("./bda2024_mid_dataset/bda2024_202203-202402_內容數據_新聞2.csv", preview=False)
news3_df = load_df("./bda2024_mid_dataset/bda2024_202203-202402_內容數據_新聞3.csv", preview=False)
news_df = pd.concat([news1_df, news2_df, news3_df], ignore_index=True)

disc_dcard_df = load_df("./bda2024_mid_dataset/bda2024_202203-202402_討論數據_dcard.csv", preview=False)
disc_dcard_df.rename(columns={'forum': 'p_type'}, inplace=True)    # Repair column name typo in data
disc_m1_df = load_df("./bda2024_mid_dataset/bda2024_202203-202402_討論數據_mobile01-1.csv", preview=False)
disc_m2_df = load_df("./bda2024_mid_dataset/bda2024_202203-202402_討論數據_mobile01-2.csv", preview=False)
disc_ptt_df = load_df("./bda2024_mid_dataset/bda2024_202203-202402_討論數據_ptt.csv", preview=False)
disc_df = pd.concat([disc_dcard_df, disc_m1_df, disc_m2_df, disc_ptt_df], ignore_index=True)


----- Loading ./bda2024_mid_dataset/bda2024_202203-202402_內容數據_新聞1.csv... -----
Size of dataframe: (179449, 9)
Columns: ['id', 'p_type', 's_name', 's_area_name', 'post_time', 'title', 'author', 'content', 'page_url']

----- Loading ./bda2024_mid_dataset/bda2024_202203-202402_內容數據_新聞2.csv... -----
Size of dataframe: (15114, 9)
Columns: ['id', 'p_type', 's_name', 's_area_name', 'post_time', 'title', 'author', 'content', 'page_url']

----- Loading ./bda2024_mid_dataset/bda2024_202203-202402_內容數據_新聞3.csv... -----
Size of dataframe: (290929, 9)
Columns: ['id', 'p_type', 's_name', 's_area_name', 'post_time', 'title', 'author', 'content', 'page_url']

----- Loading ./bda2024_mid_dataset/bda2024_202203-202402_討論數據_dcard.csv... -----
Size of dataframe: (231320, 10)
Columns: ['id', 'forum', 's_name', 's_area_name', 'post_time', 'title', 'author', 'content', 'page_url', 'content_type']

----- Loading ./bda2024_mid_dataset/bda2024_202203-202402_討論數據_mobile01-1.csv... -----
Size of dataframe: (487

* 篩選出標題及內文提及「元大台灣50」或「0050」的文章
* drop掉論壇文章集中關於content type的欄位（content type記錄對應的內容是評論or文章等等），把評論集和新聞集concat一起

In [74]:
news_df_yuanta = news_df[(news_df['title'].str.contains('0050') | news_df['title'].str.contains('元大台灣50')) |
                      (news_df['content'].str.contains('0050') | news_df['content'].str.contains('元大台灣50'))].reset_index(drop=True)
# news_df_yuanta

disc_df_yuanta = disc_df[(disc_df['title'].str.contains('0050') | disc_df['title'].str.contains('元大台灣50')) |
                      (disc_df['content'].str.contains('0050') | disc_df['content'].str.contains('元大台灣50'))].reset_index(drop = True)
# disc_df_yuanta

disc_df_yuanta = disc_df_yuanta.drop(columns=['content_type'])
df_yuanta = pd.concat([news_df_yuanta, disc_df_yuanta], ignore_index=True)
# df_yuanta

* 將第 n 天的文章與第 n+5 的股市漲跌標籤合併，並且只保留我們需要的欄位資訊（post_time, title, content, label）
* 用datetime.timedelta(days=5)調整影響天數

In [75]:
df_yuanta['post_time'] = pd.to_datetime(df_yuanta['post_time']).dt.date
yuanta['date-5'] = yuanta['date'] - datetime.timedelta(days=5)
df_yuanta = pd.merge(df_yuanta, yuanta, left_on='post_time', right_on='date-5')[['post_time', 'title', 'content', 'label']]
# df_yuanta

漲、跌、持平的文章總數量

In [76]:
df_yuanta['label'].value_counts()

0     7544
-1     620
1      428
Name: label, dtype: int64

分出「漲」(label=1) 和「跌」(label=-1) 的文章

In [107]:
df_yuanta_up = df_yuanta[df_yuanta['label'] == '1']
df_yuanta_up.to_csv('df_yuanta_up.csv',index=False) # 428 rows × 4 columns
df_yuanta_down = df_yuanta[df_yuanta['label'] == '-1']
# df_yuanta_down # 620 rows × 4 columns
df_yuanta_stay = df_yuanta[df_yuanta['label'] == '0']

# 3. 訓練集文章向量化處理

* 首先先進行斷詞處理，以下我們會先實作幾個步驟：
    1. 正則表示法清除多餘字元：先移除文章中符號、英數字，只保留中文字元
    2. 斷句：由於 monpa 在處理 200 字以上字串的斷詞時可能會出現錯誤結果，因此我們統一對長文章先進行斷句拆成較短的句子組成的 list
    3. 斷詞：透過 monpa 對斷句結果中的所有句子進行斷詞

In [78]:
import monpa
from monpa import utils
import re

In [79]:
# 這個function用來將字串以正則化處理去除中文字元以外的字元
def clearSentence(sentence):
    return re.sub(r'[^\u4e00-\u9fa5]+', '', sentence)

# 我們從stopwords-zh.txt這個檔案中匯入繁體中文的停用詞
with open('stopwords-zh.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines() 
file.close()

這邊先嘗試用 2023 年 1 月至 5 月的文章資料來訓練

In [80]:
# 設定訓練資料集的開始日期與結束日期
train_startDate = datetime.date(2023, 1, 1)
train_endDate = datetime.date(2023, 5, 31)

訓練集資料數量（透過上面時間篩選後，被選為訓練集的資料）

In [81]:
print("「漲」的訓練文章數：",len(list(df_yuanta_up[df_yuanta_up['post_time'].between(train_startDate, train_endDate)].index)))
print("「跌」的訓練文章數：",len(list(df_yuanta_down[df_yuanta_down['post_time'].between(train_startDate, train_endDate)].index)))

「漲」的訓練文章數： 56
「跌」的訓練文章數： 46


In [82]:
# 透過monpa對文章進行斷詞處理，並將個別斷詞結果寫在tokenStr這個字串變數中，最後再將訓練集中所有字串存在tokenStr_list中
def generate_tokenized_text(dataset, start_date, end_date):
    train_tokenStr_list = []
    for i in list(dataset[dataset['post_time'].between(start_date, end_date)].index):
        try:
            sentence_list = utils.short_sentence(dataset['content'][i])
            tokenStr = str()
            for sentence in sentence_list:
                sentence = clearSentence(sentence)
                tokens = monpa.cut(sentence)
                tokenStr += ' '.join(tokens)
            train_tokenStr_list.append(tokenStr)
        except:
            train_tokenStr_list.append('')
    return train_tokenStr_list

個別針對「漲」和「跌」的文章斷字

In [83]:
train_tokenStr_list_up = generate_tokenized_text(df_yuanta_up, train_startDate, train_endDate)
train_tokenStr_list_down = generate_tokenized_text(df_yuanta_down, train_startDate, train_endDate)

# train_tokenStr_list_up 代表漲的文章的斷字
# train_tokenStr_list_down 代表跌的文章的斷字

文章向量化處理：我們透過 sklearn 套件中 TfidfVectorizer 將斷詞結果去除stop word後轉為空間向量

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [85]:
vectorizer_before_chi = TfidfVectorizer(stop_words=stopwords)
X_train_up = vectorizer_before_chi.fit_transform(train_tokenStr_list_up)
X_train_up = pd.DataFrame(X_train_up.toarray(),columns=vectorizer_before_chi.get_feature_names_out())
# X_train_up # 56 rows × 1324 columns

In [86]:
X_train_down = vectorizer_before_chi.fit_transform(train_tokenStr_list_down)
X_train_down = pd.DataFrame(X_train_down.toarray(),columns=vectorizer_before_chi.get_feature_names_out())
# X_train_down # 56 rows × 1324 columns

* 可以看到使用全部的斷詞結果去組成空間向量時稀疏性會非常大，在後續預測時效率會很低，因此我們需要選擇對分類結果有較顯著影響的詞彙作為向量空間的維度，以下我們透過 Chi-square 計算各詞彙與漲跌標籤的獨立性作為選擇向量空間維度的依據。
* 我們先假設使用chi-square挑出前1000大的feature

In [87]:
y_train_up = df_yuanta_up[df_yuanta_up['post_time'].between(train_startDate, train_endDate)]['label']

chi2_selector = SelectKBest(chi2, k = 1000)
chi2_selector.fit(X_train_up, y_train_up)
kbest_vocabs_up = X_train_up.columns[chi2_selector.get_support()]
X_train_up = X_train_up[kbest_vocabs_up]
# X_train_up

In [88]:
y_train_down = df_yuanta_down[df_yuanta_down['post_time'].between(train_startDate, train_endDate)]['label']

chi2_selector = SelectKBest(chi2, k = 1000)
chi2_selector.fit(X_train_down, y_train_down)
kbest_vocabs_down = X_train_down.columns[chi2_selector.get_support()]
X_train_down = X_train_down[kbest_vocabs_down]
# X_train_down

接著刪掉在漲和跌的兩個向量空間中同樣的feature

In [89]:
set_up = set(kbest_vocabs_up)
set_down = set(kbest_vocabs_down)

common_words = set_up.intersection(set_down)

unique_up = set_up - common_words
unique_down = set_down - common_words

unique_up_list = list(unique_up)
unique_down_list = list(unique_down)

# unique_up_list 代表漲的feature
# unique_down_list 代表跌的feature

print('刪掉同樣的後，漲和跌的個別feature數量：', len(unique_up_list))

total_feature = unique_up_list + unique_down_list
print('總feature數量：', len(total_feature))

刪掉同樣的後，漲和跌的個別feature數量： 645
總feature數量： 1290


把所有跟元大相關的漲跌文章各自的title和content合併起來

In [90]:
yuanta_combine_up = df_yuanta_up.title + df_yuanta_up.content
yuanta_combine_down = df_yuanta_down.title + df_yuanta_down.content
yuanta_combine_stay = df_yuanta_stay.title + df_yuanta_stay.content

yuanta_combine_up = yuanta_combine_up.dropna()
yuanta_combine_down = yuanta_combine_down.dropna()
yuanta_combine_stay = yuanta_combine_stay.dropna()

使用CountVectorizer，計算漲跌文章中是否出現前面提取出的unique feature

In [108]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(vocabulary = total_feature, binary=True)

yuanta_up_vector = vectorizer.fit_transform(yuanta_combine_up)
yuanta_down_vector = vectorizer.fit_transform(yuanta_combine_down)
yuanta_stay_vector = vectorizer.fit_transform(yuanta_combine_stay)

In [109]:
df_yuanta_up = df_yuanta_up.reset_index(drop=True)
df_yuanta_down = df_yuanta_down.reset_index(drop=True)
df_yuanta_stay = df_yuanta_stay.reset_index(drop=True)

yuanta_up_vector = pd.DataFrame(yuanta_up_vector.toarray(),columns=vectorizer.get_feature_names_out())
yuanta_up_vector['label'] = 1
yuanta_up_vector['post_time'] = df_yuanta_up['post_time']

yuanta_down_vector = pd.DataFrame(yuanta_down_vector.toarray(),columns=vectorizer.get_feature_names_out())
yuanta_down_vector['label'] = -1
yuanta_down_vector['post_time'] = df_yuanta_down['post_time']

yuanta_stay_vector = pd.DataFrame(yuanta_stay_vector.toarray(),columns=vectorizer.get_feature_names_out())
yuanta_stay_vector['label'] = 0
yuanta_stay_vector['post_time'] = df_yuanta_stay['post_time']

yuanta_vector = pd.concat([yuanta_up_vector, yuanta_down_vector], axis=0)
yuanta_vector = pd.concat([yuanta_vector, yuanta_stay_vector], axis=0)
print(yuanta_vector.shape)

yuanta_vector.to_csv('yuanta_vector.csv',index=False)

(8587, 1292)


### 我們就用到這裡以上！！
下面是學姊給的去年的code，還沒有全部修改完，看你們想要怎麼調整！

# 4. 測試集文章向量化處理

* 接著我們可以透過前面訓練的向量維度將 2022 年 11 月到 2023 年 6 月的文章也轉成 tf-idf 的向量空間

In [105]:
# 首先先計算testing文章個別的tfidf
test_startDate = datetime.date(2023, 6, 1)
test_endDate = datetime.date(2022,10,31)

test_tokenStr_list = []
for i in list(df_yuanta_up[df_yuanta_up['post_time'].between(test_startDate, test_endDate)].index):
    try:
        txt = clearSentence(df_yuanta_up['content'][i])
        sentence_list = utils.short_sentence(txt)
        tokenStr = str()
        for sentence in sentence_list:
            tokens = monpa.cut(sentence)
            tokenStr += ' '.join(tokens)
        test_tokenStr_list.append(tokenStr)
    except:
        test_tokenStr_list.append('')

In [None]:
# 接著將結果透過df.reindex這個方法映射到訓練集的向量空間中
y_test = df_yuanta_up[df_yuanta_up['post_time'].between(test_startDate, test_endDate)]['label']

vectorizer = TfidfVectorizer(stop_words=stopwords)
X_test = vectorizer.fit_transform(test_tokenStr_list)
X_test = pd.DataFrame(X_test.toarray(),columns=vectorizer.get_feature_names())
X_test = X_test.reindex(kbest_vocabs_up, axis=1, fill_value=0)
X_test

# 5. 建立預測模型

In [18]:
from sklearn.ensemble import GradientBoostingClassifier

In [19]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=7, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

1.0

# 6. 檢視預測結果

In [21]:
test_label = yuanta[yuanta['年月日-1'].between(test_startDate, test_endDate)]['label']

test_data = bbs23_yuanta[bbs23_yuanta['post_time'].between(test_startDate, test_endDate)]
test_data['predict_label'] = clf.predict(X_test)
predict_label = pd.merge(
    yuanta[yuanta['年月日-1'].between(test_startDate, test_endDate)], 
    test_data.groupby(['post_time', 'predict_label']).count().sort_values('label', ascending = False).sort_index(level=[0], sort_remaining=False).groupby(level=0).head(1).reset_index(), 
    left_on='年月日-1', right_on='post_time', how='left').fillna(method='ffill').fillna(method='bfill')['predict_label']

from sklearn.metrics import accuracy_score
print('{}月份預測準確率:'.format(test_startDate.month), accuracy_score(test_label, predict_label))

10月份預測準確率: 0.7142857142857143


In [22]:
pd.DataFrame([test_label.reset_index(drop=True), predict_label]).T

Unnamed: 0,label,predict_label
0,跌,跌
1,漲,跌
2,漲,跌
3,漲,跌
4,跌,跌
5,跌,跌
6,跌,跌
7,跌,跌
8,漲,漲
9,漲,漲


# 4. 同學們可以嘗試調整

1. 漲跌標籤的判斷%數（重要！！）
2. 文章與股價時間區間的移動天數（小時數）
3. 使用不同斷詞工具（推薦中研院CKIPTransformer）
4. 特徵選擇的其他方法（lift、、MI、、LLR...）
5. 特徵選擇的數量（太少會有很高的 false positive，太高則效率差）
6. 嘗試用看看不同分類模型
7. 改變投票方法，漲跌平三者的權重應該一樣嗎？

      GOOD LUCK!!!