## 라이브러리 및 데이터 로드

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Scaled data; the first CSV column becomes the row index.
st_df = pd.read_csv('new_st_df.csv', index_col=0)

# Original (unscaled) data, restricted to rows whose index label also occurs in st_df.
original_raw = pd.read_csv(r'original_df.csv', index_col=0)
shared_rows = original_raw.index.isin(st_df.index)
original_df = original_raw.loc[shared_rows]

In [None]:
def split_train_test(data, test_ratio, seed=2):
    """Shuffle the rows of ``data`` reproducibly and split them into train and test parts.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows are split.
    test_ratio : float
        Fraction of rows placed in the test part; must lie in [0, 1].
        The test size is ``int(len(data) * test_ratio)`` (rounded down).
    seed : int, default 2
        Seed of the private random generator used for shuffling.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Independent copies of the selected rows, in shuffled order.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError('test_ratio must be between 0 and 1, got %r' % (test_ratio,))

    # A private RandomState yields the same permutation as seeding the global
    # generator with the same value, but leaves the global NumPy RNG untouched.
    rng = np.random.RandomState(seed)
    shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]

    # Return explicit copies so that adding columns to a split later does not
    # raise chained-assignment warnings.
    return data.iloc[train_indices].copy(), data.iloc[test_indices].copy()

# Split the scaled data once, then pick the matching raw rows by index label.
# Splitting original_df by position with the same permutation only pairs the
# rows correctly when both frames hold the same rows in the same order;
# label-based selection keeps train_set/test_set and their raw counterparts
# aligned row for row regardless of the order in original_df.
# reindex() fills rows missing from original_df with NaN and raises if
# original_df contains duplicate index labels.
train_set, test_set = split_train_test(st_df, 0.2)
train_set_original = original_df.reindex(train_set.index)
test_set_original = original_df.reindex(test_set.index)

## 클러스터 모델 GMM

In [None]:
from sklearn.mixture import GaussianMixture
# Fit a 400-component Gaussian mixture on the first 31 columns of the training
# split and store each training row's component label in a new 'clst' column.
GMM_model = GaussianMixture(n_components=400, random_state=2021)
train_features = train_set.iloc[:, :31]
result = GMM_model.fit_predict(train_features)
train_set['clst'] = result

# Label the test rows with the fitted mixture, using the same 31 leading columns.
test_features = test_set.iloc[:, :31]
result_GMM = GMM_model.predict(test_features)

## 클러스터 모델 CNN

In [None]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense,Dropout 
from tensorflow.keras import optimizers 
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import keras
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.convolutional import MaxPooling1D, MaxPooling2D

# Model inputs are every column up to and including '09:30'; the target is the
# GMM cluster label. Copy the slice so that adding 'clst' does not write into
# a view of train_set.
train = train_set.loc[:, :'09:30'].copy()
train['clst'] = train_set['clst']

data_x = train.drop(['clst'], axis=1)
data_y = train['clst']

# Hold out 20% of the training rows; they serve as validation data during fitting.
(X_train, X_test, y_train, y_test) = train_test_split(data_x, data_y, train_size=0.8, random_state=1115)

# Reshape each row to (1, n_features, 1) so Conv2D can slide along the feature axis.
tf.random.set_seed(3)
n_features = X_train.shape[1]
X_train_4 = X_train.values.reshape(len(X_train), 1, n_features, 1)
X_test_4 = X_test.values.reshape(len(X_test), 1, n_features, 1)

# Stop once validation accuracy has not improved for 5 consecutive epochs.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0, patience=5, verbose=1)

# sparse_categorical_crossentropy needs every label to be < number of output
# units. Cluster ids are mixture-component indices and may skip values when a
# component receives no training row, so size the output layer from the
# largest id instead of the count of distinct ids.
num_classes = int(train_set['clst'].max()) + 1

CNN_model = Sequential()
CNN_model.add(Conv2D(filters=128, kernel_size=(1,3), activation='relu', input_shape=(1,n_features,1), padding='same'))
CNN_model.add(Conv2D(filters=64, kernel_size=(1,3), activation='relu', padding='same'))
CNN_model.add(MaxPooling2D(pool_size=(1,2)))

CNN_model.add(Conv2D(filters=64, kernel_size=(1,2), activation='relu', padding='valid'))
CNN_model.add(Conv2D(filters=32, kernel_size=(1,2), activation='relu', padding='valid'))
CNN_model.add(MaxPooling2D(pool_size=(1,3)))

CNN_model.add(Flatten())
CNN_model.add(Dense(64, activation='relu'))
CNN_model.add(Dense(num_classes, activation='softmax'))

#CNN_model.summary()

# NOTE: this SGD instance is not passed to compile(); the model is trained with 'adam'.
# `learning_rate` replaces the deprecated `lr` keyword.
sgd = optimizers.SGD(learning_rate=0.01)
CNN_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
history = CNN_model.fit(X_train_4, y_train, batch_size=1, epochs=50, validation_data=(X_test_4, y_test), callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 00022: early stopping


In [None]:
# Practice run: classify the held-out rows with the trained CNN.
test = test_set.loc[:, :'09:30']
test = test.to_numpy().reshape(len(test), 1, 31, 1)
predicted_clst = CNN_model.predict(test)

# Wrap the per-class scores in a DataFrame and take, for every row, the column
# label with the highest score as the predicted cluster.
predicted_clst_df = pd.DataFrame(predicted_clst)
result_CNN = predicted_clst_df.idxmax(axis=1)

## 가중치 모델: 09:35 값 예측

In [None]:
# distance 적용 클러스터별 35분 예측값
import math
def predict_w_distance(data, predict_data, num, n_features=31):
    """Predict the '09:35' value of one sample from the members of one cluster.

    Every row of ``data`` whose 'clst' equals ``num`` contributes its '09:35'
    value, weighted by ``1 / log(1 + d)`` where ``d`` is the Euclidean distance
    between the row's first ``n_features`` columns and ``predict_data``.
    Weights are normalised to sum to one.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the feature columns first, plus '09:35' and 'clst' columns.
    predict_data : array-like of length ``n_features``
        Feature vector of the sample to predict.
    num : int
        Cluster label whose members are used.
    n_features : int, default 31
        Number of leading columns of ``data`` used as features.

    Returns
    -------
    float
        Weighted average of the members' '09:35' values. Returns 0 when the
        cluster has no members. When one or more members coincide with
        ``predict_data`` (zero distance, i.e. infinite weight) the mean of
        those members' '09:35' values is returned instead of NaN.
    """
    members = data[data['clst'] == num]
    if members.empty:
        # No reference rows for this cluster: the weighted sum over nothing is 0.
        return 0

    features = members.iloc[:, :n_features].to_numpy(dtype=float)
    targets = members['09:35'].to_numpy(dtype=float)
    query = np.asarray(predict_data, dtype=float)

    distances = np.linalg.norm(features - query, axis=1)

    # log(1 + 0) == 0, so an exact match yields an infinite weight; silence the
    # division warning and handle that case explicitly below.
    with np.errstate(divide='ignore'):
        weights = 1.0 / np.log1p(distances)

    infinite = np.isinf(weights)
    if infinite.any():
        # inf / inf would turn the whole prediction into NaN; exact matches
        # dominate all other rows, so average just those.
        return float(targets[infinite].mean())

    weights = weights / weights.sum()
    return float(np.dot(weights, targets))

In [None]:
# Feature matrix of the test rows: the first 31 columns as a NumPy array.
columns = test_set.columns
test_set_30 = test_set[columns[:31]].to_numpy()

# For every test row, predict from the training rows that share its GMM cluster.
predict_35 = [
    predict_w_distance(train_set, features, cluster_num)
    for features, cluster_num in zip(test_set_30, result_GMM)
]

## 평가

In [None]:
# One row per test sample: GMM cluster, predicted and real '09:35' values, and their difference.
clst_price = pd.DataFrame({
    'clst_num': result_GMM,
    'predict_35': predict_35,
    'real_value': test_set['09:35'].values.tolist(),
})
clst_price['diff'] = clst_price['real_value'] - clst_price['predict_35']

# Evaluate only the rows on which the GMM and CNN cluster labels agree.
same_cluster = (result_GMM == result_CNN)
clst_price = clst_price[same_cluster]

print('MAE : ', np.mean(clst_price['diff'].abs()))
print('GMM과 CNN에서 겹치는 행의 수 : ', same_cluster.sum())

MAE :  0.4309271481643767
GMM과 CNN에서 겹치는 행의 수 :  354


In [None]:
# Keep only the raw test rows on which the CNN and GMM cluster labels agree.
# NOTE(review): this rebinds test_set_original to the filtered frame, so running the cell
# again would apply the full-length mask to the already shortened frame; run it once per kernel.
agreement_mask = (result_CNN == result_GMM).to_numpy()
test_set_original = test_set_original[agreement_mask]

In [None]:
def get_revenue(predict_35, test_set_original):
    """Compute the percentage return of every recommended row.

    A row is recommended when its predicted value is greater than 1.0. Its
    return is ``('09:35' - '09:30') / '09:30' * 100`` taken from
    ``test_set_original``. Rows are paired with predictions by position, so the
    frame's index labels are irrelevant.

    Parameters
    ----------
    predict_35 : sequence of float
        Predicted values, one per row of ``test_set_original``.
    test_set_original : pandas.DataFrame
        Frame with '09:30' and '09:35' columns, in the same row order as
        ``predict_35``.

    Returns
    -------
    list of float
        Percentage returns of the recommended rows, in row order. The number
        of recommended rows is also printed.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows.
    """
    predicted = np.asarray(predict_35, dtype=float)
    # Plain arrays make the access positional; `series[i]` with an integer is
    # a label lookup on integer indexes and a deprecated fallback otherwise.
    original_30 = test_set_original['09:30'].to_numpy(dtype=float)
    original_35 = test_set_original['09:35'].to_numpy(dtype=float)

    if len(predicted) != len(original_30):
        raise ValueError(
            'predict_35 has %d values but test_set_original has %d rows'
            % (len(predicted), len(original_30))
        )

    recommended = predicted > 1.
    base = original_30[recommended]
    revenues = ((original_35[recommended] - base) / base * 100).tolist()

    print('추천 종목 수 : ', len(revenues))
    return revenues

# Percentage returns of the recommended rows, then their sum rounded to 4 decimals.
rev_percent = get_revenue(clst_price['predict_35'].tolist(), test_set_original)
total_rev_percent = round(sum(rev_percent), 4)
print('예상 수익률 : ', total_rev_percent)

추천 종목 수 :  80
예상 수익률 :  6.2998


In [None]:
# Attach the predictions to the test rows and keep the rows on which CNN and GMM agree.
test_set['pred_35'] = predict_35
test_set_2 = test_set[(result_CNN == result_GMM).values]

# Build the threshold mask from the filtered frame itself. A mask taken from the
# unfiltered test_set has a different length and only works through implicit
# index alignment, which breaks on duplicate index labels.
predicted_stock = test_set_2.loc[test_set_2['pred_35'] > 1.0]
print('추천 종목 : ', predicted_stock.index)

추천 종목 :  Index(['하이트진로_20210610', '메리츠화재_20210616', '현대해상_20210428', '대우조선해양_20210610',
       '에코프로비엠_20210319', '현대미포조선_20210625', '효성_20210614', '코미코_20210408',
       '에어부산_20210430', '대영포장_20210609', '알루코_20210524', '삼양식품_20210203',
       '농심_20210315', '한국조선해양_20210601', 'KBSTAR 고배당_20210528',
       'HSD엔진_20210601', '대영포장_20210604', 'BGF_20210622', 'SK이노베이션_20210302',
       'SK케미칼_20210204', '아시아나항공_20210304', '현대위아_20210304', '빙그레_20210513',
       '엔케이물산_20210615', 'TIGER KRX게임K-뉴딜_20210525', '삼성SDI_20210413',
       '한화투자증권_20210604', '이구산업_20210531', '한국앤컴퍼니_20210615', '현대백화점_20210428',
       '아모레퍼시픽_20210415', 'LG전자_20210601', '삼성생명_20210222', '동방_20210318',
       '에어부산_20210203', '사조대림_20210601', '에이디테크놀로지_20210419', '금호건설_20210608',
       '한국타이어테크놀로지_20210506', '한신기계_20210607', 'LS네트웍스_20210603',
       'STX중공업_20210609', '효성_20210215', '코리안리_20210409', '현대중공업지주_20210601',
       '사조대림_20210527', '부국철강_20210525', '써니전자_20210601', '세화아이엠씨_20210602',
       '대덕전자_2021

## 모델 저장

In [None]:
from keras.models import load_model
import joblib

In [None]:
# Save the fitted GMM to disk.
gmm_path = 'GMM_model.pkl'
joblib.dump(GMM_model, gmm_path)

# Load the GMM back from the same file.
GMM_model2 = joblib.load(gmm_path)

In [None]:
# Save the CNN; create a "CNN" folder in the current directory before running this cell.
CNN_model.save('./CNN')

# Load the CNN back and run it on the reshaped test array.
CNN_model2 = load_model("CNN")
CNN_model2.predict(test)



INFO:tensorflow:Assets written to: ./CNN/assets


INFO:tensorflow:Assets written to: ./CNN/assets


array([[0.0000000e+00, 8.3173766e-25, 6.5172458e-07, ..., 1.1376675e-29,
        5.5513426e-15, 1.0796004e-18],
       [7.9232994e-19, 1.0718057e-22, 6.3110452e-30, ..., 5.3156685e-23,
        1.5193637e-27, 1.6175685e-29],
       [0.0000000e+00, 1.6121230e-18, 9.9367005e-19, ..., 8.0684711e-34,
        1.7811740e-24, 5.9747810e-22],
       ...,
       [1.5142992e-26, 3.8609971e-14, 1.4421503e-20, ..., 4.3667238e-24,
        1.6312591e-16, 4.8795233e-16],
       [0.0000000e+00, 3.8502188e-15, 1.8988723e-20, ..., 0.0000000e+00,
        3.6825130e-26, 2.7714115e-22],
       [5.4589514e-27, 5.2895127e-16, 8.1473325e-32, ..., 1.0810897e-30,
        1.9609491e-30, 1.9301875e-29]], dtype=float32)