In [322]:
import os
import pandas as pd
import numpy as np
import datetime

import warnings
warnings.filterwarnings("ignore")

In [331]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import CountVectorizer

In [332]:
with open('stopwords_zh.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines() 
file.close()

In [333]:
df_up = pd.read_csv("Up_down_stay/train_tokenStr_list_up.csv")
df_yuanta_up = pd.read_csv("Up_down_stay/df_yuanta_up.csv")
df_yuanta_up["post_time"] = pd.to_datetime(df_yuanta_up["post_time"]).dt.date
up = pd.concat([df_up, df_yuanta_up[["post_time"]]], axis=1)


df_stay = pd.read_csv("Up_down_stay/train_tokenStr_list_stay.csv")
df_yuanta_stay = pd.read_csv("Up_down_stay/df_yuanta_stay.csv")
df_yuanta_stay["post_time"] = pd.to_datetime(df_yuanta_stay["post_time"]).dt.date
stay = pd.concat([df_stay, df_yuanta_stay[["post_time"]]], axis=1)

df_down = pd.read_csv("Up_down_stay/train_tokenStr_list_down.csv")
df_yuanta_down = pd.read_csv("Up_down_stay/df_yuanta_down.csv")
df_yuanta_down["post_time"] = pd.to_datetime(df_yuanta_down["post_time"]).dt.date
down = pd.concat([df_down, df_yuanta_down[["post_time"]]], axis=1)

In [335]:
# 所有需要當作test的月份
total_time = np.arange(np.datetime64("2022-06"), np.datetime64("2024-03"), dtype="datetime64[M]")

for month in total_time:
    time_test = np.arange(month, month+np.timedelta64(1, "M"), dtype='datetime64[D]').tolist()
    time_train = np.arange(np.datetime64(time_test[0], "M")-np.timedelta64(3, "M"), np.datetime64(time_test[0], "M"), dtype='datetime64[D]').tolist()

    # train
    train_startDate = time_train[0]
    train_endDate = time_train[-1]
    train_tokenStr_list_up = []
    train_tokenStr_list_down = []
    train_tokenStr_list_stay = []

    for t in time_train:
        train_tokenStr_list_up += up[up["post_time"] == t]["0"].astype(str).tolist()
        train_tokenStr_list_down += down[down["post_time"] == t]["0"].astype(str).tolist()
        train_tokenStr_list_stay += stay[stay["post_time"] == t]["0"].astype(str).tolist()

    # test
    test_startDate = time_test[0]
    test_endDate = time_test[-1]
    yuanta_up_texts = []
    yuanta_down_texts = []
    yuanta_stay_texts = []

    for t in time_test:
        yuanta_up_texts += up[up["post_time"] == t]["0"].astype(str).tolist()
        yuanta_down_texts += down[down["post_time"] == t]["0"].astype(str).tolist()
        yuanta_stay_texts += stay[stay["post_time"] == t]["0"].astype(str).tolist()

    vectorizer_before_chi = TfidfVectorizer(stop_words=stopwords)
    X_train_up = vectorizer_before_chi.fit_transform(train_tokenStr_list_up)
    X_train_up = pd.DataFrame(X_train_up.toarray(),columns=vectorizer_before_chi.get_feature_names_out())
    up_num_rows = X_train_up.shape[1]

    X_train_down = vectorizer_before_chi.fit_transform(train_tokenStr_list_down)
    X_train_down = pd.DataFrame(X_train_down.toarray(),columns=vectorizer_before_chi.get_feature_names_out())
    X_train_down # 56 rows × 1324 columns
    down_num_rows = X_train_up.shape[1]

    
    y_train_up = df_yuanta_up[df_yuanta_up['post_time'].between(train_startDate, train_endDate)]['label']
    chi2_selector = SelectKBest(chi2, k = int(min(up_num_rows, down_num_rows)*0.5))
    chi2_selector.fit(X_train_up, y_train_up)
    kbest_vocabs_up = X_train_up.columns[chi2_selector.get_support()]
    X_train_up = X_train_up[kbest_vocabs_up]

    y_train_down = df_yuanta_down[df_yuanta_down['post_time'].between(train_startDate, train_endDate)]['label']
    chi2_selector = SelectKBest(chi2, k = int(min(up_num_rows, down_num_rows)*0.5))
    chi2_selector.fit(X_train_down, y_train_down)
    kbest_vocabs_down = X_train_down.columns[chi2_selector.get_support()]
    X_train_down = X_train_down[kbest_vocabs_down]

    set_up = set(kbest_vocabs_up)
    set_down = set(kbest_vocabs_down)

    common_words = set_up.intersection(set_down)

    unique_up = set_up - common_words
    unique_down = set_down - common_words

    unique_up_list = list(unique_up) # unique_up_list 代表漲的feature
    unique_down_list = list(unique_down) # unique_down_list 代表跌的feature

    # print('刪掉同樣的後，漲和跌的個別feature數量：', len(unique_up_list))

    total_feature = unique_up_list + unique_down_list
    # print('總feature數量：', len(total_feature))

    vectorizer = CountVectorizer(lowercase=False, vocabulary=total_feature, binary=True)

    yuanta_up_vector = vectorizer.fit_transform(yuanta_up_texts)  # Now pass the list of document strings
    yuanta_down_vector = vectorizer.fit_transform(yuanta_down_texts)
    yuanta_stay_vector = vectorizer.fit_transform(yuanta_stay_texts)

    # create new folder
    folder = str(month)
    parent_dir = "./output"
    os.mkdir(os.path.join(parent_dir, folder))

    # train vector
    yuanta_up_vector = vectorizer.fit_transform(train_tokenStr_list_up) 
    yuanta_down_vector = vectorizer.fit_transform(train_tokenStr_list_down)
    yuanta_stay_vector = vectorizer.fit_transform(train_tokenStr_list_stay)

    df_yuanta_up = df_yuanta_up.reset_index(drop=True)
    df_yuanta_down = df_yuanta_down.reset_index(drop=True)
    df_yuanta_stay = df_yuanta_stay.reset_index(drop=True)

    yuanta_up_vector = pd.DataFrame(yuanta_up_vector.toarray(),columns=vectorizer.get_feature_names_out())
    yuanta_up_vector['label'] = 1
    yuanta_up_vector['post_time'] = df_yuanta_up[df_yuanta_up['post_time'].between(train_startDate, train_endDate)]['post_time'].to_list()

    yuanta_down_vector = pd.DataFrame(yuanta_down_vector.toarray(),columns=vectorizer.get_feature_names_out())
    yuanta_down_vector['label'] = -1
    yuanta_down_vector['post_time'] = df_yuanta_down[df_yuanta_down['post_time'].between(train_startDate, train_endDate)]['post_time'].to_list()

    yuanta_stay_vector = pd.DataFrame(yuanta_stay_vector.toarray(),columns=vectorizer.get_feature_names_out())
    yuanta_stay_vector['label'] = 0
    yuanta_stay_vector['post_time'] = df_yuanta_stay[df_yuanta_stay['post_time'].between(train_startDate, train_endDate)]['post_time'].to_list()

    yuanta_vector = pd.concat([yuanta_up_vector, yuanta_down_vector], axis=0)
    yuanta_vector = pd.concat([yuanta_vector, yuanta_stay_vector], axis=0)
    print(yuanta_vector.shape)

    yuanta_vector.to_csv("./output/" + str(month) + "/train.csv",index=False)

    # test vector
    yuanta_up_vector = vectorizer.fit_transform(yuanta_up_texts) 
    yuanta_down_vector = vectorizer.fit_transform(yuanta_down_texts)
    yuanta_stay_vector = vectorizer.fit_transform(yuanta_stay_texts)

    yuanta_up_vector = pd.DataFrame(yuanta_up_vector.toarray(),columns=vectorizer.get_feature_names_out())
    yuanta_up_vector['label'] = 1
    yuanta_up_vector['post_time'] = df_yuanta_up[df_yuanta_up['post_time'].between(test_startDate, test_endDate)]['post_time'].to_list()

    yuanta_down_vector = pd.DataFrame(yuanta_down_vector.toarray(),columns=vectorizer.get_feature_names_out())
    yuanta_down_vector['label'] = -1
    yuanta_down_vector['post_time'] = df_yuanta_down[df_yuanta_down['post_time'].between(test_startDate, test_endDate)]['post_time'].to_list()

    yuanta_stay_vector = pd.DataFrame(yuanta_stay_vector.toarray(),columns=vectorizer.get_feature_names_out())
    yuanta_stay_vector['label'] = 0
    yuanta_stay_vector['post_time'] = df_yuanta_stay[df_yuanta_stay['post_time'].between(test_startDate, test_endDate)]['post_time'].to_list()

    yuanta_vector = pd.concat([yuanta_up_vector, yuanta_down_vector], axis=0)
    yuanta_vector = pd.concat([yuanta_vector, yuanta_stay_vector], axis=0)
    print(yuanta_vector.shape)

    yuanta_vector.to_csv("./output/" + str(month) + "/test.csv",index=False)

(1225, 7142)
(504, 7142)
(1320, 5400)
(684, 5400)
(1592, 5714)
(351, 5714)
(1539, 5248)
(279, 5248)
(1314, 5402)
(409, 5402)
(1039, 4430)
(425, 4430)
(1113, 4646)
(373, 4646)
(1207, 5010)
(236, 5010)
(1034, 5116)
(402, 5116)
(1011, 4894)
(447, 4894)
(1085, 4686)
(265, 4686)
(1114, 3612)
(371, 3612)
(1083, 3488)
(401, 3488)
(1037, 2832)
(314, 2832)
(1086, 3982)
(387, 3982)
(1102, 3680)
(231, 3680)
(932, 3552)
(297, 3552)
(915, 3680)
(283, 3680)
(811, 3714)
(333, 3714)
(913, 4946)
(243, 4946)
(859, 4456)
(132, 4456)
