- [Loading news files](#Loading-news-files)
- [Extract sentimental datas of A-share stocks from these files ](#Extract-sentimental-datas-of-A-share-stocks-from-these-files)
- [Data processing](#Data-processing)
- [Sort data for companies in one day](#Sort-data-for-companies-in-one-day)
- [Get other feature data and labels](#Get-other-feature-data-and-labels)

<font size=6>Warning: This file is not runnable since raw data set is not uploaded for NDA</font>

But if you want to check whether this file is runnable you can change 'E:\\GitHub\\final project\\20160101-20160401.txt' in [Loading news files](#Loading-news-files) into 'https://github.com/SnakeWayne/PHBS_MLF_2019/blob/master/data/sampledata.txt' for they being both in the same format

# Sentimental Factor and Dataset Construction

In [3]:
import os
import time
import json
import zipfile
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

###  Loading news files

In [34]:
sample_file_path = 'E:\\GitHub\\final project\\20160101-20160401.txt'   # replace to your own folder path
sample_f=open(sample_file_path,'r', encoding='UTF-8')
#with open(sample_file_path, 'r') as sample_f:
sample_line = sample_f.readline()
sample_line = json.loads(sample_line)#read the file

In [35]:
sample_line#展示文件

{'newsInfo': {'newsId': 181595,
  'newsTitle': 'Kaili Breakfast Express 0104',
  'newsTitle_cn': '开利早餐快递0104',
  'newsTs': '2016-01-04T09:12:00+0800',
  'newsUrl': 'url',
  'newsSource': '',
  'newsSummary': '从2016年1月1日起，新股发行将按照新的制度执行。2015年逾九成股票温和上涨伴随着上周四收盘，2015年的A股经过了波澜壮阔的杠杆牛市，再到急速下跌后的企稳反攻，终于落下帷幕。分析人士指出，在经历连续数年投资回报的“大年”后，在A股市场可能迎来大扩容的背景下，2016年大概率是个“小年”。A股1.1万亿元限售股1月份待解禁重仓基金怎么办在大幅调整的背景下，证监会于2015年7月8日出台“大股东减持禁令”限制大股东及董监高的减持行为：从公告之日起6个月内，大股东及董事、监事、高级管理人员不得通过二级市场减持本公司股份，这对当时市场稳定起到了积极作用。不过，有分析人士认为，从2015年7月8日至12月31日，上述解禁压力将集中移至2016年1月份。预言五，2016年M2新增规模预计为12万亿元左右，有望降息三次。'},
 'newsTags': [{'itemType': 'Company',
   'itemName_cn': '浦发银行',
   'itemName': 'SPDB',
   'itemId': 'CSF0000001394',
   'ItemExtId': '600000_SH_EQ',
   'ItemRelevance': 0.2591},
  {'itemType': 'Company',
   'itemName_cn': '兴业银行',
   'itemName': 'INDUSRIAL BANK',
   'itemId': 'CSF0000002423',
   'ItemExtId': '601166_SH_EQ',
   'ItemRelevance': 0.2591},
  {'itemType': 'Company',
   'itemName_cn': '唯品会 ADR',
   'itemName': 'V

### Extract sentimental datas of A-share stocks from these files 

Extract the following data from each line of the each news file:

* news id
* news time
* related security code
* relavance score
* sentimental type
* sentimental weight

In [95]:
def extract_sentimentals(file_path):
    """extract sentimental datas from news file"""
    
    def _tag_to_dic(news_tags):
        ret_dic = {}
        for item in news_tags:
            if 'ItemExtId' in item:
                ret_dic[item['ItemExtId']] = item['ItemRelevance']
                #use dictionary to record the relevance between news and the asset
        return ret_dic
    
    rets = []
    error_news = []
    news_total_count = 0
    news_f=open(file_path,'r',encoding='UTF-8')
    file_lines = news_f.readlines()
    for file_line in file_lines:
        try:
            temp_dict= json.loads(file_line)
            rel_dic = _tag_to_dic(temp_dict['newsTags'])
            time_str =  temp_dict['newsInfo']['newsTs'].split('+')[0]
            news_time = parse(time_str)
            news_id = temp_dict['newsInfo']['newsId']
            emotion_infos = temp_dict['emotionInfos']
            if emotion_infos:
                for em_info in emotion_infos:
                    item_type = em_info['emotionEntity']
                    if  item_type == 'Company':
                            # only keep the record related to A-share Market
                        if em_info['entityCode'][-5:] in ['SH_EQ','SZ_EQ'] and em_info['entityCode'][0] in ['0','3','6']:                            
                            item_rel = rel_dic.get(em_info['entityCode'])
                            rets.append((news_id, 
                                         news_time, 
                                         em_info['entityCode'],
                                         em_info['emotionIndicator'],
                                         em_info['emotionWeight'],
                                         item_rel
                                        ))
        except:
            error_news.append(file_line)
    df_senti = pd.DataFrame(data=rets, columns=['news_id','news_time','code','senti_type','senti_weight','relevance'])
    
    return df_senti, error_news

In [96]:
def load_news_files(folder_path):
    df =  pd.DataFrame()
    for file_name in tqdm_notebook(os.listdir(folder_path)):
        if file_name.split('.')[-1] == 'txt':
            st = time.time()
            file_path = folder_path+'/'+file_name
            temp_df,_ = extract_sentimentals(file_path)
            df = df.append(temp_df)
            df = df.sort_values('news_time').reset_index(drop=True)
            et = time.time()
    return df

In [97]:
def load_raw_sentimental_data(load_from_file=False, load_from_csv=False, file_path=None):
    '''load from raw file or presaved sentimental csv file, either 'load_from_file' or 'load_from_csv' must be set
    load_from_file: default False, set to True if generate the sentimental data from original news files and must provide 'dir_path'
    load_from_csv: default False, set to True if you have pre-saved sentimental data
    file_path: default None, must be provided
    '''
    if load_from_file:
        if not file_path:
            raise ValueError('Please specify the file_path of the raw news files!')
        df_senti_raw = load_news_files(file_path)
        
    elif load_from_csv:
        if not file_path:
            raise ValueError('Please specify the file_path of the presaved sentimental data!')
        df_senti_raw = pd.read_csv(file_path, index_col=0, parse_dates=True)
        df_senti_raw['news_time'] = pd.to_datetime(df_senti_raw['news_time'])
            
    else:
        raise ValueError('Check the parameters!')
        
    return df_senti_raw 
    

In [52]:
unzip_to_dir_path = './final project'
df_senti_raw = load_raw_sentimental_data(load_from_file=True, file_path=unzip_to_dir_path)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

In [82]:
df_senti_raw.head()

Unnamed: 0,news_id,news_time,code,senti_type,senti_weight,relevance,senti_score
0,4088047,2016-01-01,600053.XSHG,1,0.68,0.1458,9.9144
1,4086425,2016-01-01,002202.XSHE,1,0.86,0.0633,5.4438
2,4086320,2016-01-01,000732.XSHE,0,0.99,0.0251,0.0
3,4085488,2016-01-01,601618.XSHG,-1,1.0,0.0609,-6.09
4,4091599,2016-01-01,601933.XSHG,0,0.74,0.0298,0.0


### Data processing
process raw data loaded with function ```load_news_files``` by the following steps:
* 1. change senti_type: 2 --> -1
* 2. calculate senti_score: ```senti_score = senti_type*senti_weight*relevance*100```
* 3. map calendar date to trade date: cut at 15:00

The function ```map_trade_date``` maps the calendar datetime of the news to the trade calendar datetime.

In [98]:
def map_trade_date(df_raw, cut_hour, cut_minute):
    '''map calendar date to trade date
    df_raw: raw sentimental data from 'load_raw_sentimental_data'
    cut_hour & cut_minute: the sentimental data before cut_hour:cut_time(eg. 15:00) will be taken into current day's trading;
    the sentimental data after cut_hour:cut_time will be taken into next day's trading
    '''
    
    start_date = df_raw['news_time'].min()
    start_date = start_date.strftime("%Y-%m-%d")
    
    end_date = df_raw['news_time'].max()
    end_date = end_date.strftime("%Y-%m-%d")
    
    
    dts = pd.read_csv('trade_calendar_zh.csv', index_col=0)
    dts_arr = dts['date'].values
    
    
    st_idx = np.searchsorted(dts_arr, start_date)
    et_idx = np.searchsorted(dts_arr, end_date)
    
    trade_dts = [parse(dt).date() for dt in dts_arr[st_idx-1:et_idx+2]]
    ti = datetime(2019,1,1,hour=cut_hour, minute=cut_minute).time()
    trade_dts = [datetime.combine(dt,ti) for dt in trade_dts]
    ts_trade_dts = pd.Series(trade_dts)
    trade_date = pd.cut(df_raw['news_time'].astype(np.int64)//10**9,
                        bins=ts_trade_dts.astype(np.int64)//10**9)
    trade_date = pd.to_datetime([it.right for it in trade_date], unit='s')
    df_raw['trade_date'] = pd.to_datetime(pd.DatetimeIndex(trade_date).date)

    
    return df_raw


def raw_data_process(df_raw, cut_hour, cut_minute):
    """
    process raw data loaded by function load_news_files:
    1. change senti_type: 2 --> -1
    2. calculate senti_score: senti_score = senti_type*senti_weight*relevance*100
    3. map calendar date to trade date
    """
    df_raw['senti_type'] = df_raw['senti_type'].replace(2,-1)
    df_raw['senti_score'] =  df_raw['senti_type']*df_raw['senti_weight']*df_raw['relevance']*100
    df_raw['code'] = [''.join([i[0:6],'.XSHE']) if i[-5:-3] == 'SZ' else ''.join([i[0:6],'.XSHG']) for i in df_raw['code'] ]
   
    return map_trade_date(df_raw, cut_hour, cut_minute)

In [99]:
cut_hour=15  # 3:00 PM
cut_minute=0
df_senti = raw_data_process(df_senti_raw, cut_hour, cut_minute)

In [100]:
df_senti

Unnamed: 0,news_id,news_time,code,senti_type,senti_weight,relevance,senti_score,trade_date
0,4088047,2016-01-01 00:00:00,600053.XSHG,1,0.68,0.1458,9.9144,2016-01-04
1,4086425,2016-01-01 00:00:00,002202.XSHG,1,0.86,0.0633,5.4438,2016-01-04
2,4086320,2016-01-01 00:00:00,000732.XSHG,0,0.99,0.0251,0.0000,2016-01-04
3,4085488,2016-01-01 00:00:00,601618.XSHG,-1,1.00,0.0609,-6.0900,2016-01-04
4,4091599,2016-01-01 00:00:00,601933.XSHG,0,0.74,0.0298,0.0000,2016-01-04
5,4091599,2016-01-01 00:00:00,002024.XSHG,0,0.66,0.0298,0.0000,2016-01-04
6,4086350,2016-01-01 00:00:00,601933.XSHG,0,0.72,0.0448,0.0000,2016-01-04
7,4086851,2016-01-01 00:00:00,600029.XSHG,-1,0.60,1.0000,-60.0000,2016-01-04
8,4085506,2016-01-01 00:00:00,601088.XSHG,0,0.83,0.0176,0.0000,2016-01-04
9,4091645,2016-01-01 00:00:00,600029.XSHG,-1,0.53,0.5026,-26.6378,2016-01-04


### Sort data for companies in one day

* drop stock of which the listed times less than 20 days

The dayily sentimental score:

$ S_i^{t} = \sum_{n=1}^N Score_{n}/N $

* $Score_n$ is the score of stock ```i``` in news ```n``` at time ```t```    

In [101]:
# Daily sentimental score factor
def func_senti_fac_daily(df_senti):
    fac_senti_1d = df_senti.groupby(['trade_date']).apply(lambda x: x.groupby('code')['senti_score'].mean())
    fac_senti_1d.index.names = ['date','asset']
    return fac_senti_1d

In [106]:
def filter_listed_days(fac_df, days=20):
    '''
    filter the stocks listed less than 'days'.  
    fac_df: multi-index dataframe of the sentimental factor
    days: defalut 20, the stock has been listed less this days will be deleted 
    '''    
    def _combine_list_days(df):
        def _cal_days(code):
            dates = list_dts_dct.get(code)
            if not dates:
                return -1
            elif dates[0]>dt or dates[1]<dt:
                return -1
            else:
                days = (dt - dates[0]).days
                return days
        dt = df.index.get_level_values(level=0).unique()[0]
        if isinstance(dt, str):
            dt = parse(dt).date()
        df = df.reset_index(level=0, drop=True)
        df['list_days'] = [  _cal_days(code) for code in df.index]

        return df
            
    fac_df = fac_df.to_frame()
    fac_df.columns = ['factor']
    all_codes = fac_df.index.get_level_values(level=1).unique()
    list_dts = pd.read_csv('stock_list_dates.csv',index_col=0, parse_dates=['list_date','dlist_date'])
    list_dts_dct = {k[1]['sec_code']:(k[1]['list_date'], k[1]['dlist_date']) for k in list_dts.iterrows()}
    fac_df = fac_df.groupby(level=0).apply(_combine_list_days)
    fac_df = fac_df.query("list_days > @days")
    
    return fac_df['factor']
   

In [107]:
fac_senti_1d = func_senti_fac_daily(df_senti)
fac_senti_1d = filter_listed_days(fac_senti_1d, days=20)

In [246]:
fac_senti_1d

date        asset      
2016-01-04  600000.XSHG    30.666667
            600004.XSHG     0.000000
            600008.XSHG    75.000000
            600015.XSHG     6.380300
            600028.XSHG     7.392900
            600029.XSHG    -5.436829
            600036.XSHG     2.315475
            600037.XSHG     0.000000
            600038.XSHG    26.626600
            600048.XSHG    25.463400
            600052.XSHG    -2.219900
            600053.XSHG     4.957200
            600056.XSHG    12.562275
            600057.XSHG     0.000000
            600058.XSHG   -12.180000
            600060.XSHG     4.462475
            600064.XSHG     0.000000
            600066.XSHG    27.119300
            600077.XSHG     0.000000
            600079.XSHG    21.276000
            600085.XSHG    18.743700
            600088.XSHG     0.000000
            600094.XSHG    70.000000
            600098.XSHG    25.901400
            600100.XSHG     0.000000
            600101.XSHG     4.750200
            60

In [334]:
#fac_senti_1d.to_excel('fac_senti_1d.xlsx')
fac=fac_senti_1d.to_frame()
fac.factor[0]

30.666666666666668

### Get other feature data and labels
* start day: ```2015.12.1```  end day: ```current day```
* index from TaLib: ```MFI SMA5 SMA10 MOM ROC ATR BETA CCI```
* compare the close price between the former day and current day, if the price drops, label=0, else label=1

Read the feature and label data to local excel files.

In [177]:
import talib
import jqdatasdk
import datetime

In [376]:
start_date = datetime.date(2015, 12, 1)
end_date = datetime.date(2016, 4, 3)
x_all = []
y_all = []
for index in range(len(fac)):
    # 得到计算指标的所有数据
    
    stock_data = get_price(fac.index[index][1], start_date=start_date, end_date=fac.index[index][0], 
                           frequency='daily', fields=['open', 'close', 'high', 'low','volume', 'money'])
    close_prices = stock_data['close'].values
    #通过数据计算指标
    mfi_data = talib.MFI(stock_data['high'].values,stock_data['low'].values,stock_data['close'].values,stock_data['volume'].values,5)[-2]
    sma_data5 = talib.SMA(close_prices,5)[-2]#简单移动平均线（Simple Moving Average，SMA），又称“算术移动平均线”，是指对特定期间的收盘价进行简单平均化的意思
    sma_data10 = talib.SMA(close_prices,10)[-2]
    #ema_data = talib.EMA(close_prices,5)[-2]#指数移动平均线
    mom_data = talib.MOM(close_prices)[-2]#动量线，动量可以视为一段期间内，股价涨跌变动的比率。
    roc_data = talib.ROC(close_prices)[-2]#变动率(Rate of change,ROC)，ROC是由当天的股价与一定的天数之前的某一天股价比较，其变动速度的大小,来反映股票市场变动的快慢程度。ROC，也叫做变动速度指标、变动率指标或变化速率指标。
    atr_data = talib.ATR(stock_data['high'].values,stock_data['low'].values,stock_data['close'].values,5)[-2]
    #ATR,均幅指标（Average True Ranger）,取一定时间周期内的股价波动幅度的移动平均值，主要用于研判买卖时机
    beta_data = talib.BETA(stock_data['high'].values,stock_data['low'].values,5)[-2]
    cci_data = talib.CCI(stock_data['high'].values,stock_data['low'].values,stock_data['close'].values,5)[-2]
    # CCI顺势指标(Commodity Channel Index),CCI指标，是由美国股市分析家唐纳德?蓝伯特（Donald Lambert）所创造的，是一种重点研判股价偏离度的股市分析工具。

    
    features = []
    features.append(fac.index[index][0])
    features.append(fac.index[index][1])
    features.append(mfi_data)
    features.append(sma_data5)
    features.append(sma_data10)
    features.append(mom_data)
    features.append(roc_data)
    features.append(atr_data)
    features.append(beta_data)
    features.append(cci_data)
    features.append(fac.factor[index])
    
    label = False
    if close_prices[-2] < close_prices[-1]:
        label = True
    x_all.append(features)
    y_all.append(label)
    

# 准备算法需要用到的数据
x_train = x_all[: -1]
y_train = y_all[: -1]
x_test = x_all[-1]
y_test = y_all[-1]
print('data done')

data done


In [378]:
df_x1=pd.DataFrame(x_all)
df_y1=pd.DataFrame(y_all)
df_x1.rename(columns={'0':'time','1':'asset','2':'MFI','3':'SMA','4':'WMA','5':'EMA','6':'MOM','7':'ROC','8':'MA','9':'CCI','10':'SENTI'})
df_x1.to_excel('feature_new.xlsx')
df_y1.to_excel('label_new.xlsx')
#df_x1.iloc[:,0]
df_x1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,2016-01-04,600000.XSHG,2.620627e+01,11.976,12.050,-0.14,-1.183432,0.352271,0.524071,-99.737533,30.666667
1,2016-01-04,600004.XSHG,4.776406e+01,9.138,9.249,-0.02,-0.217865,0.211334,0.244056,39.855072,0.000000
2,2016-01-04,600008.XSHG,1.862869e+01,4.854,4.908,-0.15,-3.054990,0.118358,0.205416,-77.651515,75.000000
3,2016-01-04,600015.XSHG,8.314698e+01,8.978,8.976,0.65,7.611241,0.338147,0.161472,93.137255,6.380300
4,2016-01-04,600028.XSHG,2.424278e+01,3.958,3.985,-0.05,-1.250000,0.061401,0.473047,-17.156863,7.392900
5,2016-01-04,600029.XSHG,4.382751e+01,8.300,8.433,0.44,5.670103,0.320832,0.010242,-106.060606,-5.436829
6,2016-01-04,600036.XSHG,6.947149e+01,15.834,15.833,0.78,5.206943,0.418683,-0.026633,12.858556,2.315475
7,2016-01-04,600037.XSHG,1.571536e+01,21.018,21.409,-1.28,-5.923184,0.635583,-0.046732,-99.544937,0.000000
8,2016-01-04,600038.XSHG,4.459055e+01,51.352,51.926,-2.48,-4.590892,1.320202,0.345006,94.764745,26.626600
9,2016-01-04,600048.XSHG,2.015128e+01,9.494,9.810,-0.76,-7.600000,0.456775,0.453234,-69.833853,25.463400
