In [None]:
import lightgbm as gbt

## 量化交易预测v0.1

#### 当前版本实现时间
2020年8月31日

#### StockPredict类读取数据版本
20200831 | 20200901

#### 主要内容描述
首先通过该方法分析能够使用的特征有哪些，原始数据文件中提供的tick数据中有很多特征本身无作用。在该版本对比当前时刻的卖一价和1分钟后的卖一价，如果价格上升则打标签为1，否则打标签为0。使用GBDT模型做初步预测，挑选有价值的特征，并且得到该模型/策略的baseline效果。

#### TODO List
- 完成单日股票数据可视化接口
- 完成决策树绘制接口
- 对特征进行转换之后再次尝试模型效果（可以在数据加载之后处理，不破坏原始数据）
- 核对当前流程是否有误
- 特征&类别标签相关性分析

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import os
import random
import time
import shutil
import concurrent.futures as cf
import lightgbm as lgb
import logging
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - [line:%(lineno)d] - %(levelname)s: %(message)s',
                    filename='predict.log',
                    filemode='w')

import warnings
warnings.filterwarnings("ignore")

from multiprocessing import Pool
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV

class StockDataBasic(object):
    """Locate per-stock tick data and labels on disk and merge them into training CSVs.

    The three directory attributes are stored with a trailing "/" because the
    rest of the file builds paths by plain string concatenation:

    * ``<root>/data/<stock>/<csv>``          raw tick data
    * ``<root>/label/<stock>/<csv>``         labels keyed by ``MDTime``
    * ``<root>/train/<stock>/train_<csv>``   merged output of ``data_preprocess``
    """

    # Columns kept from the raw tick files when the caller passes no white list.
    DEFAULT_WHITE_LIST = [
        'MDTime', 'PreClosePx', 'NumTrades', 'TotalVolumeTrade', 'TotalValueTrade',
        'LastPx', 'OpenPx', 'ClosePx', 'HighPx', 'LowPx', 'MaxPx', 'MinPx', 'DiffPx1',
        'DiffPx2', 'Buy1Price', 'Buy1OrderQty', 'Sell1Price', 'Sell1OrderQty',
    ]

    def __init__(self, root_path, parall=False):
        """
        :params root_path: root directory of the data set, with or without a trailing "/".
        :params parall: if True, ``data_preprocess`` merges the files of a stock in a process pool.
        :raises OSError: if ``<root>/data/`` cannot be listed.
        """
        self.version = "__version_0828__"
        self.version_desc = "This is the alpha version 0.1 which just try to make every thing work."
        self.train_data = None
        self.test_data = None
        self._root_path = root_path
        self._parall = parall

        # Normalise once so that every sub directory is built the same way; the
        # previous code forgot the separator for "train/" when root_path had no
        # trailing slash.
        base = root_path if root_path.endswith("/") else root_path + "/"
        self._root_label_path = base + "label/"
        self._root_data_path = base + "data/"
        self._root_train_path = base + "train/"

        # One sub directory per stock is expected below data/.
        self._stock_list = os.listdir(self._root_data_path)

    def __load_label(self, stock_file, csv_file):
        """
        load the label of data.
        :params stock_file: stock dir name, such as 002001.SZ
        :params csv_file: csv file name that contains data.
        :return: dict mapping each ``MDTime`` value to its ``tag`` value.
        """
        file_path = self._root_label_path + stock_file + "/" + csv_file
        assert os.path.exists(file_path), 'the given file_path: %s does not esixt.' % file_path

        df = pd.read_csv(file_path)
        # Direct column -> dict conversion; avoids the transpose and the
        # abbreviated ``to_dict('int')`` orient that current pandas rejects.
        return df.set_index("MDTime")["tag"].to_dict()

    def __load_raw_data(self, stock_file, csv_file, white_list=None):
        """
        load the original data that without labels.
        :params stock_file: stock dir name, such as 002001.SZ
        :params csv_file: csv file name that contains data.
        :params white_list: columns to keep; defaults to ``DEFAULT_WHITE_LIST``.
        :return: raw data in pandas.DataFrame formation, restricted to the white list.
        """
        file_path = self._root_data_path + stock_file + "/" + csv_file
        assert os.path.exists(file_path), 'the given file_path: %s does not esixt.' % file_path

        if white_list is None:
            white_list = list(self.DEFAULT_WHITE_LIST)
        df = pd.read_csv(file_path)
        return df[white_list]

    def data_preprocess(self):
        """
        load the original raw data and its label data to generate the final training/testing data.
        Any existing train directory is deleted and rebuilt from scratch.
        warning: this process might be time-consuming!
        return: None
        """
        if os.path.exists(self._root_train_path):
            shutil.rmtree(self._root_train_path)
        os.mkdir(self._root_train_path)

        for stock in self._stock_list:
            stock_train_dir = self._root_train_path + stock
            stock_label_dir = self._root_label_path + stock
            if not os.path.exists(stock_train_dir):
                os.mkdir(stock_train_dir)

            # The label directory drives the file list: only labelled days are merged.
            csv_file_list = os.listdir(stock_label_dir)
            csv_file_lent = len(csv_file_list)

            if self._parall:
                with cf.ProcessPoolExecutor() as pool:
                    futures = []
                    for index, csv_file in enumerate(csv_file_list):
                        logging.info("Processing %d/%d file: %s/%s" % (index + 1, csv_file_lent, stock, csv_file))
                        futures.append(pool.submit(self._merge_worker, stock, csv_file))
                    # Surface worker failures instead of dropping them silently,
                    # matching what the sequential branch does.
                    for future in futures:
                        future.result()
            else:
                for index, csv_file in enumerate(csv_file_list):
                    logging.info("Processing %d/%d file: %s/%s" % (index + 1, csv_file_lent, stock, csv_file))
                    self.__merge_data_and_label(stock, csv_file)

    def _merge_worker(self, stock_name, csv_file):
        """
        Process-pool entry point for ``__merge_data_and_label``.
        A bound method is pickled by its plain name, and the name-mangled private
        method cannot be looked up again under that name in the worker process,
        so the pool is given this non-mangled wrapper instead.
        """
        self.__merge_data_and_label(stock_name, csv_file)

    def __merge_data_and_label(self, stock_name, csv_file):
        """
        Merge the corresponding data and annotated label, the final result will be written into the disk.
        Rows whose ``MDTime`` has no label (or a missing label) are dropped; the label
        is stored in a new last column named ``class``.
        The root path was recorded in self._root_train_path.
        return: None
        """
        label = self.__load_label(stock_name, csv_file)
        data = self.__load_raw_data(stock_name, csv_file)

        # Keep only labelled ticks, then look each label up once per kept row.
        labelled = data[data['MDTime'].isin(list(label))].copy()
        labelled['class'] = [label[tick] for tick in labelled['MDTime']]
        labelled.dropna(subset=['class'], inplace=True)
        labelled.to_csv(self._root_train_path + stock_name + "/train_" + csv_file, index=False)
    
    
class PredictModel(StockDataBasic):
    """Hold a trained model plus the helpers that feed it and store its output.

    Train CSVs are read as plain matrices: column 0 is used as the tick
    timestamp, the last column as the class label and everything in between
    as features (this is the layout ``data_preprocess`` writes).
    """

    def __init__(self, data_root_path, model_name="lightgbm"):
        """
        :params data_root_path: root directory of the data set, with or without a trailing "/".
        :params model_name: free-form model identifier, only stored on the instance.
        """
        super(PredictModel, self).__init__(data_root_path)
        self.model_name = model_name
        self.model = None
        self.predict_ticks = self.__generate_predict_ticks()
        self.parameters = {
                'objective': 'binary',
                "boosting": "gbdt",
                'max_depth': 5,
                'learning_rate': 0.1,
                "feature_fraction": 0.9,
                "bagging_fraction": 0.9,
                "nthread": -1,
                'metric': {'binary_logloss'},
                "random_state": 2020,
            }

        base = data_root_path if data_root_path.endswith("/") else data_root_path + "/"
        self.model_root_path = base + "model/"
        self.result_root_path = base + "result/"

    def __generate_predict_ticks(self):
        """
        Build the list of timestamps (int, formatted H MM SS mmm) at which a prediction is made:
        every 30 seconds from 09:31:00 to 11:29:30 and from 13:00:00 onwards.
        return: list of int timestamps.
        """
        return_list = list()
        for hour in [9, 10, 11, 13, 14]:
            for minute in range(60):
                if hour == 9 and minute < 31:
                    continue
                if hour == 11 and minute >= 30:
                    continue
                for second in [0, 30]:
                    cur_timestamp = str(hour) + str(minute).zfill(2) + str(second).zfill(2) + '000'
                    return_list.append(int(cur_timestamp))
        # Dropping the last 7 entries makes 14:56:00 the final prediction tick.
        return return_list[:-7]

    @staticmethod
    def _file_date(csv_name):
        """
        Extract the date token (text before the first "_") from a train file name.
        Both "<date>_<code>.csv" and the "train_<date>_<code>.csv" names written by
        data_preprocess are accepted.
        return: the date token as a string; callers convert it with int().
        """
        prefix = "train_"
        if csv_name.startswith(prefix):
            csv_name = csv_name[len(prefix):]
        return csv_name.split('_')[0]

    def load_data(self, stock_name, train_test_split="20200610"):
        """
        load the training data, including the train set and test set.
        Files dated before the split go to the train set (all rows, shuffled);
        files dated on/after the split go to the test set, restricted to the
        rows whose timestamp is one of ``self.predict_ticks``.
        Files that cannot be parsed are logged and skipped.
        :params stock_name: name of the target stock.
        :params train_test_split: split date (YYYYMMDD) between training set and test set.
        return: train set and test test, in pandas.DataFrame formation
                (an empty DataFrame when no rows were found).
        """
        stock_train_root_path = self._root_train_path + stock_name + '/'
        assert os.path.exists(stock_train_root_path), "the train data dir: %s does not exist." % stock_train_root_path

        split_date = int(train_test_split)
        tick_set = set(self.predict_ticks)  # O(1) membership instead of scanning the list per row
        train_parts, test_parts = list(), list()
        for csv in os.listdir(stock_train_root_path):
            try:
                csv_file_date = int(self._file_date(csv))
                values = pd.read_csv(stock_train_root_path + csv).values
                if csv_file_date < split_date:
                    train_parts.append(values)
                else:
                    keep = np.array([int(tick) in tick_set for tick in values[:, 0]], dtype=bool)
                    test_parts.append(values[keep])
            except Exception as e:
                logging.error("Error while processing csv file: %s" % (stock_train_root_path + csv))
                logging.error(e)
                continue

        # Stacking whole files avoids the crash np.vstack raised when either the
        # positive or the negative class had no rows at all.
        if train_parts:
            np_train = np.vstack(train_parts)
            np.random.shuffle(np_train)
            df_train = pd.DataFrame(np_train)
        else:
            df_train = pd.DataFrame()
        df_test = pd.DataFrame(np.vstack(test_parts)) if test_parts else pd.DataFrame()
        return df_train, df_test

    def predict_daily(self, stock_name, test_threshold="20200610"):
        """
        Predict the daily result with ``self.model`` and store one result csv per day.
        :params stock_name: name of stock.
        :params test_threshold: only the date after this threshold be seen as test data(the threshold date was included).
        :raises RuntimeError: if no model has been assigned to ``self.model`` yet.
        return: None.
        """
        stock_train_root_path = self._root_train_path + stock_name + '/'
        assert os.path.exists(stock_train_root_path), "the train data dir: %s does not exist." % stock_train_root_path
        if self.model is None:
            raise RuntimeError("predict_daily() needs a trained model in self.model")

        test_threshold = int(test_threshold)
        tick_set = set(self.predict_ticks)
        for csv in os.listdir(stock_train_root_path):
            logging.info("predict proba for %s/%s" % (stock_name, csv))
            try:
                date_token = self._file_date(csv)
                if int(date_token) < test_threshold:
                    continue
            except Exception as e:
                logging.error("Error while processing csv file: %s" % (stock_train_root_path + csv))
                logging.error(e)
                continue

            day_values = pd.read_csv(stock_train_root_path + csv).values
            keep = np.array([tick in tick_set for tick in day_values[:, 0]], dtype=bool)
            filtered_matrix = day_values[keep]
            if filtered_matrix.shape[0] == 0:
                logging.error("no predict tick found in %s/%s, skipped." % (stock_name, csv))
                continue

            X_test = filtered_matrix[:, 1:-1]
            y_pred_prob = self.model.predict(X_test, num_iteration=self.model.best_iteration)

            # orderTime comes from the rows that were actually scored, so the three
            # columns always have the same length even when a tick is missing.
            result_df = pd.DataFrame({
                'date': date_token,
                'orderTime': filtered_matrix[:, 0].astype(int),
                'proba': y_pred_prob,
            })
            self.save_predict_result(stock_name, csv, result_df)

    def save_predict_result(self, stock_name, csv_file, df):
        """
        Store the predicted result into ``<result root>/<stock_name>/predict_<csv_file>``.
        :params stock_name: name of the stock.
        :params csv_file: name of the csv file the prediction was made for.
        :params df: result DataFrame to write (without index).
        return: None.
        """
        stock_dir_path = self.result_root_path + stock_name
        # Creates the result root and the stock directory in one step when missing.
        os.makedirs(stock_dir_path, exist_ok=True)

        csv_path = stock_dir_path + '/predict_' + csv_file
        df.to_csv(csv_path, index=False)

    
def load_data_test(root_path=None, stock_name="002001.SZ", csv_file="20200525_002001.csv"):
    """
    Smoke test: merge the raw data and the labels of one file in memory.
    :params root_path: data root directory; defaults to "./stock_predict/".
    :params stock_name: stock dir name, such as 002001.SZ
    :params csv_file: csv file name that contains data.
    return: the labelled rows as pandas.DataFrame with a trailing ``class`` column.
    """
    if root_path is None:
        root_path = "./stock_predict/"
    stock_predict_test = StockDataBasic(root_path)

    # The loaders are name-mangled private methods, so from outside the class
    # they are only reachable under their mangled names.
    load_label = getattr(stock_predict_test, "_StockDataBasic__load_label")
    load_raw_data = getattr(stock_predict_test, "_StockDataBasic__load_raw_data")

    label = load_label(stock_name, csv_file)
    data = load_raw_data(stock_name, csv_file)

    # Keep only the ticks that have a label, then attach it as the last column.
    data = data[data['MDTime'].isin(list(label))].copy()
    data['class'] = [label[tick] for tick in data['MDTime']]
    data.dropna(subset=['class'], inplace=True)
    return data


def logistic_obj(y_hat, dtrain):
    """
    Custom binary objective: log loss in which positive samples weigh 5x.

    Loss per sample (p = sigmoid(score)): -(5 * y * log(p) + (1 - y) * log(1 - p)).
    LightGBM hands a custom objective the raw scores, not probabilities, so the
    sigmoid is applied here before the derivatives are evaluated.
    :params y_hat: raw model scores, one per sample.
    :params dtrain: object exposing ``get_label()`` (e.g. lightgbm.Dataset).
    return: (grad, hess) of the loss with respect to the raw score.
    """
    y = dtrain.get_label()
    # Numerically stable sigmoid: exp(-log(1 + exp(-score))).
    p = np.exp(-np.logaddexp(0.0, -np.asarray(y_hat, dtype=float)))
    grad = 4 * p * y + p - 5 * y
    hess = (4 * y + 1) * (p * (1.0 - p))
    return grad, hess


def err_rate(y_hat, dtrain):
    """
    Custom evaluation metric: mean log loss in which positive samples weigh 5x.
    :params y_hat: predicted probabilities; values are clipped into [1e-6, 1 - 1e-6].
    :params dtrain: object exposing ``get_label()`` (e.g. lightgbm.Dataset).
    return: (metric name, metric value, is_higher_better) with is_higher_better = False.
    """
    labels = dtrain.get_label()
    eps = 10e-7
    probs = np.clip(y_hat, eps, 1 - eps)

    positive_term = labels * np.log(probs)
    negative_term = (1.0 - labels) * np.log(1.0 - probs)
    weighted_loss = -(5 * positive_term + negative_term)
    return 'error', np.sum(weighted_loss) / len(labels), False

    
if __name__ == "__main__":
    ROOT_PATH = "./stock_predict/"

#     # test
#     test_data = load_data_test(ROOT_PATH, FILE_PATH)
#     print(test_data)

#     # generating the training data here.
#     stock_predict = StockDataBasic(ROOT_PATH, parall = False)
#     stock_predict.data_preprocess()

    predict_model = PredictModel(ROOT_PATH)
    # every sub directory of train/ is one stock code
    stock_list = os.listdir(predict_model._root_train_path)
    t0 = time.time()
    results = []
    # The counter follows the position in stock_list, so the printed progress
    # still reaches 100% when some stocks are skipped.
    for index, stock in enumerate(stock_list, start=1):
        # each stock is trained, validated and tested on its own
        train, test = predict_model.load_data(stock)
        if train.shape[0] < 50 or test.shape[0] <= 0:
            logging.error("train data was not enough for training or test data missed: %s" % stock)
            continue

        # split the train dataset into training set and validation set, ratio 4:1.
        # X skips the first column (timestamp); y is the last column (class).
        X_trval, y_trval = train.values[:, 1:-1], train.values[:, -1]
        X_train, X_valid, y_train, y_valid = train_test_split(X_trval, y_trval, test_size = 0.2, random_state = 2020)
        # testing set.
        X_test, y_test = test.values[:, 1:-1], test.values[:, -1]

        # load training data and evaluation data with lightgbm.Dataset()
        train_data = lgb.Dataset(X_train, y_train)
        eval_data = lgb.Dataset(X_valid, y_valid)

        # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
        # older LightGBM releases; newer ones expect callbacks - confirm the pinned version.
        predict_model.model = lgb.train(predict_model.parameters,
                                        train_data,
                                        num_boost_round=50,
                                        valid_sets = [train_data, eval_data],
                                        early_stopping_rounds = 5,
                                        feval = err_rate,
#                                         fobj = logistic_obj,
                                        verbose_eval=0)

        # evaluate the model on testing data; every metric is computed once and reused.
        y_pred_prob = predict_model.model.predict(X_test, num_iteration = predict_model.model.best_iteration)
        y_pred = [0 if score <= 0.5 else 1 for score in y_pred_prob]
        acc = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_pred_prob)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)

        logging.info("predict result on all testset %s: acc = %f, auc = %f" % (stock, acc, auc))
        print("[%d/%d](%d%s) - %s \tacc:%f  auc:%f  precision:%f  recall:%f" \
               % (index, len(stock_list), 100 * index / float(len(stock_list)), '%', stock,
               acc, auc, precision, recall))
        results.append([stock, acc, auc, precision, recall])

        # predict the probability for every test day and store it on disk
        predict_model.predict_daily(stock)
    t1 = time.time()
    print("all prediction tasks done, time costed: %dm %ds" % ((t1 - t0) / 60, (t1 - t0) % 60))

In [None]:
# Persist the per-stock metrics collected in `results` by the training loop.
columns = ["stock", "acc", "auc", "precision", "recall"]
# Building the frame from the row lists keeps the metric columns numeric
# (np.array on mixed str/float rows turns every value into a string) and
# also works when `results` is empty.
df_result = pd.DataFrame(results, columns = columns)
df_result.to_csv("./predict_log_0911_2.csv", index = False)

In [None]:
def get_score(root_path):
    """
    Train and evaluate one LightGBM model per stock for each data root and print the scores.

    For every stock found below ``<root>/train/`` the function prints the class
    balance, then accuracy, AUC and the confusion matrix on the test set. The
    model of the stock processed last is what remains in 'model_test.txt',
    because the file is rewritten for every stock.
    :params root_path: one data root (str) or an iterable of data roots.
    return: None.
    """
    # A bare string would otherwise be iterated character by character.
    root_paths = [root_path] if isinstance(root_path, str) else root_path

    # Values found by an earlier GridSearchCV run (scoring='accuracy', cv=3) over
    # max_depth [4, 6, 8], learning_rate [0.01, 0.1, 0.5], num_iteration [20..100]:
    # num_iteration 40, learning_rate 0.01, max_depth 4.
    params = {
        'objective': 'binary',
        "boosting": "gbdt",
        'max_depth': 4,
        'learning_rate': 0.01,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.9,
        "nthread": -1,
        'metric': {'binary_logloss'},
        "random_state": 2020,
    }
    # Passed as num_boost_round instead of the 'num_iteration' params alias, which
    # used to compete with a different num_boost_round argument.
    boost_rounds = 40

    for path in root_paths:
        predict_model = PredictModel(path)
        for stock in os.listdir(predict_model._root_train_path):
            train, test = predict_model.load_data(stock)
            # Same guard as the main training loop: skip stocks without usable data.
            if train.shape[0] < 50 or test.shape[0] <= 0:
                logging.error("train data was not enough for training or test data missed: %s" % stock)
                continue

            # split the train dataset into training set and validation set, ratio 4:1.
            X_trval, y_trval = train.values[:, 1:-1], train.values[:, -1]
            X_train, X_valid, y_train, y_valid = train_test_split(X_trval, y_trval, test_size = 0.2, random_state = 2020)
            # testing set.
            X_test, y_test = test.values[:, 1:-1], test.values[:, -1]

            print("train positive: %d | train negative: %d" % (y_trval.sum(), len(train) - y_trval.sum()))
            print("test positive : %d | test negative : %d" % (y_test.sum(), len(test) - y_test.sum()))

            # load training data and evaluation data with lightgbm.Dataset()
            train_data = lgb.Dataset(X_train, label = y_train)
            eval_data = lgb.Dataset(X_valid, y_valid)

            bst = lgb.train(params, train_data, num_boost_round=boost_rounds, valid_sets = [train_data, eval_data], early_stopping_rounds = 5, verbose_eval=0)
            bst.save_model('model_test.txt', num_iteration = bst.best_iteration)

            # evaluate the model on testing data.
            y_pred_prob = bst.predict(X_test, num_iteration = bst.best_iteration)
            y_pred = [0 if score < 0.5 else 1 for score in y_pred_prob]
            print("%s%s acc = %f" % (path, stock, accuracy_score(y_test, y_pred)))
            print("%s%s auc = %f" % (path, stock, roc_auc_score(y_test, y_pred_prob)))
            print(confusion_matrix(y_test, y_pred))

### 尝试使用XGBoost的sklearn风格接口（XGBClassifier）

In [None]:
import xgboost as xgb

from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

ROOT_PATH = "E:/stock_predict/stock_data/20200831/"
predict_model = PredictModel(ROOT_PATH)
train, test = predict_model.load_data("002001.SZ", "20200610")

# shuffle the train and test dataset.
# train_shuff = shuffle(train)
# test_shuff  = shuffle(test)

# split the train dataset into training set and validation set, ratio 4:1.
# NOTE(review): features start at column 2 here while the LightGBM cells start
# at column 1 - confirm that skipping one more column is intended.
X_trval, trval_label = train.values[:, 2:-1], train.values[:, -1]
X_train, X_valid, y_train, y_valid = train_test_split(X_trval, trval_label, test_size = 0.2, random_state = 2020)
# testing set.
X_test, y_test = test.values[:, 2:-1], test.values[:, -1]

print("train positive: %d | train negative: %d" % (trval_label.sum(), len(train) - trval_label.sum()))
print("test positive : %d | test negative : %d" % (y_test.sum(), len(test) - y_test.sum()))

# train the XGBoost classifier through its scikit-learn style interface
bst = xgb.XGBClassifier(max_depth = 10,
                       learning_rate = 0.01,
                       n_estimators = 50,
                       objective = 'binary:logistic')
# features and labels are two separate arguments ("X_train. y_train" was an
# attribute access on the feature matrix and failed).
bst.fit(X_train, y_train)

# evaluate the model on testing data.
# y_pred_prob = bst.predict(X_test, num_iteration = bst.best_iteration)
# y_pred = [0 if score < 0.5 else 1 for score in y_pred_prob]
# print("acc = ", accuracy_score(y_test, y_pred))
# print("auc = ", roc_auc_score(y_test, y_pred_prob))
# print("confusuion matrix = ", confusion_matrix(y_test, y_pred))

In [None]:
# Build a classifier with default arguments; as the only expression of the
# cell its repr is what the notebook displays.
lgb.LGBMClassifier()

In [None]:
# Display the current test feature matrix (whatever the last executed cell assigned to X_test).
X_test

In [None]:
from sklearn.datasets import load_iris
from sklearn import tree
import graphviz

# Fit a decision tree on the iris data set as a playground for tree drawing.
iris = load_iris()
clf = tree.DecisionTreeClassifier().fit(iris.data, iris.target)

# First pass: export without any styling and write the rendering to disk
# under the base name "iris".
graphviz.Source(tree.export_graphviz(clf, out_file=None)).render("iris")

# Second pass: same tree with readable feature/class names and coloured nodes.
styled_options = dict(
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
dot_data = tree.export_graphviz(clf, out_file=None, **styled_options)

# Last expression of the cell, so the notebook shows the graph inline.
graph = graphviz.Source(dot_data)
graph

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px

# Plot the prices of one trading day together with the class label.
file_path = "E:/stock_predict/stock_data/20200901/train/002001.SZ/"
# os.listdir returns entries in arbitrary order; sorting makes "the third file"
# a reproducible choice.
csv_list = sorted(os.listdir(file_path))
df = pd.read_csv(file_path + csv_list[2])
# Keep ticks after 09:30:00; copy so the column assignment below works on an
# independent frame instead of a slice of the original.
df = df[df['MDTime'] > 93000000].copy()
# Offset the label by 25 before plotting it on the same axis as the prices
# (presumably to move it into the price range of this stock - confirm).
df['class'] = df['class'] + 25
px.line(df, x="MDTime", y=["LastPx", "PreClosePx", "HighPx", "LowPx", "Sell1Price", "Buy1Price", "class"])

In [None]:
# Display the data frame prepared for plotting.
df

In [None]:
# Show the rows of df_test whose column 0 is greater than 5.
# NOTE(review): df_test is only assigned inside methods in the visible cells;
# presumably it is left over from an interactive session - confirm.
df_test[df_test[0] > 5]

In [None]:
# coding=utf-8
from __future__ import print_function, absolute_import, unicode_literals
from datetime import datetime
import numpy as np
from gm.api import *
import sys
# Abort with a readable hint when scikit-learn is missing. Only ImportError is
# caught so that unrelated problems (e.g. KeyboardInterrupt) are not swallowed.
try:
    from sklearn import svm
except ImportError:
    print('请安装scikit-learn库和带mkl的numpy')
    sys.exit(-1)
'''
本策略选取了七个特征变量组成了滑动窗口长度为15天的训练集,随后训练了一个二分类(上涨/下跌)的支持向量机模型.
若没有仓位则在每个星期一的时候输入标的股票近15个交易日的特征变量进行预测,并在预测结果为上涨的时候购买标的.
若已经持有仓位则在盈利大于10%的时候止盈,在星期五损失大于2%的时候止损.
特征变量为:1.收盘价/均值2.现量/均量3.最高价/均价4.最低价/均价5.现量6.区间收益率7.区间标准差
训练数据为:SHSE.600000浦发银行,时间从2016-03-01到2017-06-30
回测时间为:2017-07-01 09:00:00到2017-10-01 09:00:00
'''
def init(context):
    """
    Strategy set-up: subscribe to minute bars of the target symbol and train the SVM.

    One training sample is built per trading day from the daily bars between that
    day and the day 15 entries earlier; its label is 1 when the close 5 trading
    days later is higher than the close of that day, else 0. The fitted classifier
    is stored on ``context.clf``.
    """
    # Subscribe to 60s bars of SHSE.600000.
    context.symbol = 'SHSE.600000'
    subscribe(symbols=context.symbol, frequency='60s')

    # Training period of the SVM.
    start_date = '2016-03-01'
    end_date = '2017-06-30'

    # Daily history of the target symbol over the whole training period.
    recent_data = history(context.symbol, frequency='1d', start_time=start_date, end_time=end_date, fill_missing='last',
                          df=True)
    days_value = recent_data['bob'].values
    days_close = recent_data['close'].values

    print('准备数据训练SVM')
    # First 10 characters of each bar timestamp, i.e. the YYYY-MM-DD part.
    days = [str(stamp)[0:10] for stamp in days_value]

    x_all = []
    for index in range(15, (len(days) - 5)):
        # Daily bars of the window that ends on days[index].
        window = history(context.symbol, frequency='1d', start_time=days[index - 15], end_time=days[index],
                         fill_missing='last', df=True)
        close = window['close'].values
        highs = window['high'].values
        lows = window['low'].values
        amount = window['amount'].values
        # Volume is derived as turnover divided by close price.
        volume = [turnover / price for turnover, price in zip(amount, close)]

        # The seven factors fed to the model, in this order:
        x_all.append([
            close[-1] / np.mean(close),        # close / mean close
            volume[-1] / np.mean(volume),      # volume / mean volume
            highs[-1] / np.mean(highs),        # high / mean high
            lows[-1] / np.mean(lows),          # low / mean low
            volume[-1],                        # latest volume
            close[-1] / close[0],              # return over the window
            np.std(np.array(close), axis=0),   # standard deviation of close
        ])

    # Label i belongs to day i + 15: did the close rise over the next 5 trading days?
    y_all = [1 if days_close[i + 20] > days_close[i + 15] else 0
             for i in range(len(days_close) - 20)]

    # The last sample is left out of the training set.
    x_train = x_all[: -1]
    y_train = y_all[: -1]

    # Train the SVM.
    context.clf = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False,
                          tol=0.001, cache_size=200, verbose=False, max_iter=-1,
                          decision_function_shape='ovr', random_state=None)
    context.clf.fit(x_train, y_train)
    print('训练完成!')
def on_bar(context, bars):
    """
    Bar handler: open a position on Mondays when the SVM predicts a rise,
    take profit at +10% and cut the loss on Fridays when the position is down more than 2%.

    :params context: strategy context; reads ``context.symbol`` / ``context.clf`` and
                     stores the entry reference price in ``context.price``.
    :params bars: list of bars; only the first one is used.
    """
    bar = bars[0]
    # Date of the current bar and its ISO weekday (Monday = 1 ... Sunday = 7).
    today = bar.bob.strftime('%Y-%m-%d')
    weekday = datetime.strptime(today, '%Y-%m-%d').isoweekday()
    # Current long position of the target symbol (falsy when there is none).
    position = context.account().position(symbol=context.symbol, side=PositionSide_Long)

    # No position on a Monday: compute the factors and ask the model.
    if not position and weekday == 1:
        # Latest 15 daily bars used as model input.
        data = history_n(symbol=context.symbol, frequency='1d', end_time=today, count=15,
                         fill_missing='last', df=True)
        close = data['close'].values
        train_max_x = data['high'].values
        train_min_n = data['low'].values
        train_amount = data['amount'].values
        # Volume is derived as turnover divided by close price.
        volume = []
        for i in range(len(close)):
            volume_temp = train_amount[i] / close[i]
            volume.append(volume_temp)
        close_mean = close[-1] / np.mean(close)
        volume_mean = volume[-1] / np.mean(volume)
        max_mean = train_max_x[-1] / np.mean(train_max_x)
        min_mean = train_min_n[-1] / np.mean(train_min_n)
        vol = volume[-1]
        return_now = close[-1] / close[0]
        std = np.std(np.array(close), axis=0)
        # Same seven factors, in the same order, as used for training.
        features = [close_mean, volume_mean, max_mean, min_mean, vol, return_now, std]
        features = np.array(features).reshape(1, -1)
        prediction = context.clf.predict(features)[0]
        # Open the position when a rise is predicted.
        if prediction == 1:
            # The latest daily close is the reference price for the exit rules.
            context.price = close[-1]
            # Move the position in the symbol to 95%.
            order_target_percent(symbol=context.symbol, percent=0.95, order_type=OrderType_Market,
                                 position_side=PositionSide_Long)
            print('SHSE.600000以市价单开多仓到仓位0.95')
    # Take profit: close everything once the gain reaches 10%.
    elif position and bar.close / context.price >= 1.10:
        order_close_all()
        print('SHSE.600000以市价单全平多仓止盈')
    # Stop loss: on Fridays close everything when the loss exceeds 2%, i.e. the
    # price ratio is below 0.98 (the former "< 1.02" also closed winning positions).
    elif position and bar.close / context.price < 0.98 and weekday == 5:
        order_close_all()
        print('SHSE.600000以市价单全平多仓止损')
if __name__ == '__main__':
    '''
    strategy_id策略ID,由系统生成
    filename文件名,请与本文件名保持一致
    mode实时模式:MODE_LIVE回测模式:MODE_BACKTEST
    token绑定计算机的ID,可在系统设置-密钥管理中生成
    backtest_start_time回测开始时间
    backtest_end_time回测结束时间
    backtest_adjust股票复权方式不复权:ADJUST_NONE前复权:ADJUST_PREV后复权:ADJUST_POST
    backtest_initial_cash回测初始资金
    backtest_commission_ratio回测佣金比例
    backtest_slippage_ratio回测滑点比例
    '''
    # Back-test configuration handed to the gm runner; 'strategy_id' and
    # 'token_id' are placeholders to be replaced with real values.
    backtest_settings = dict(
        strategy_id='strategy_id',                  # strategy ID generated by the platform
        filename='main.py',                         # must match the name of this file
        mode=MODE_BACKTEST,                         # back-test mode (MODE_LIVE for live trading)
        token='token_id',                           # token bound to this computer
        backtest_start_time='2017-07-01 09:00:00',  # back-test start
        backtest_end_time='2017-10-01 09:00:00',    # back-test end
        backtest_adjust=ADJUST_PREV,                # forward price adjustment
        backtest_initial_cash=10000000,             # initial cash
        backtest_commission_ratio=0.0001,           # commission ratio
        backtest_slippage_ratio=0.0001,             # slippage ratio
    )
    run(**backtest_settings)

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
 
# LightGBM walkthrough on the iris data set: 80/20 split, train, save, predict, score.
iris = load_iris()
data, target = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

# Wrap both splits as LightGBM datasets; the evaluation set references the training set.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# Training parameters.
params = dict(
    task='train',
    boosting_type='gbdt',       # boosting type
    objective='regression',     # objective function
    metric={'l2', 'auc'},       # evaluation metrics
    num_leaves=31,              # number of leaves
    learning_rate=0.05,         # learning rate
    feature_fraction=0.9,       # share of features sampled per tree
    bagging_fraction=0.8,       # share of rows sampled per tree
    bagging_freq=5,             # perform bagging every k iterations
    verbose=1,                  # <0 fatal only, =0 errors/warnings, >0 info
)

# Train with early stopping on the evaluation set.
gbm = lgb.train(params, lgb_train, num_boost_round=20, valid_sets=lgb_eval, early_stopping_rounds=5)

# Save the model to a file.
gbm.save_model('model.txt')

# Predict on the held-out split using the best iteration.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# Report the root mean squared error of the predictions.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('The rmse of prediction is:', rmse)