In [1056]:
import math, os, re, time, random
from datetime import datetime, timedelta
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
import statsmodels.api as sm
from statsmodels import regression
from sklearn import linear_model
from sklearn.preprocessing import normalize
        
import numpy as np, pandas as pd, math

## input: path
## output: (feature_file, object_file)
def get_files(path = './gupiao/'):
    ## get paths
    def feature_file(number):
        return path + 'features/' + number + '_20060101-20180725.csv'
    ## get paths
    def object_file(number):
        return path + 'objects/' + number + '_20060101-20180823.csv'
 
    feature_numbers = { x.split('_')[0] for x in os.listdir(path + 'features') }
    object_numbers = { x.split('_')[0] for x in os.listdir(path + 'objects') }

    file_numbers = feature_numbers.intersection(object_numbers)
    file_names = [ (feature_file(x), object_file(x)) for x in file_numbers ]
    
    return file_names

## input: (feature_file, object_file)
## output: pd.Dataframe
def get_data(file_pair):
    
    (feature_file, object_file) = file_pair
    
    feature = pd.read_csv(feature_file)
    
    #### CURRENTLY USING ALL FEATURES 
#     pca_factors = ['LCAP','LFLO','EquityFixedAssetRatio','DAVOL20','Volatility','DAREC','ROA','OperatingRevenueGrowRate']
    pca_factors = ['LCAP', 'LFLO', 'PE','PB', 'PCF', 'PS', 'ROE', 'ROA', 'EPS', 'NetProfitRatio','GrossIncomeRatio', 'OperatingRevenueGrowRate','OperatingProfitGrowRate', 'OperCashGrowRate', 'DebtsAssetRatio','FixAssetRatio', 'CurrentRatio', 'QuickRatio', 'KDJ_K', 'BIAS20','BIAS60', 'PSY', 'VOL20', 'VOL60', 'Volatility', 'AR', 'BR', 'REC','DAREC']
    #### CURRENTLY USING ALL FEATURES 
    
    pca_factors = list(set(list(feature.columns)).intersection(pca_factors))
    feature = feature.get(['tradeDate'] + pca_factors)
    
    feature['month'] = feature['tradeDate'].apply(lambda x: datetime.strftime(datetime.strptime(x, "%Y-%m-%d"), "%Y-%m"))
    
    #### VERY IMPORTANT
    #     feature = feature.fillna(feature.mean())
    feature.dropna(axis=1)
    feature.reset_index(drop=1)
    #### VERY IMPORTANT
    
    Object = pd.read_csv(object_file)
    Object = Object.get(['tradeDate', 'dailyReturnReinv'])
    Object['month'] = Object['tradeDate'].apply(lambda x: datetime.strftime(datetime.strptime(x,"%Y-%m-%d"), "%Y-%m"))

    new_feature = pd.DataFrame(columns=['month']+pca_factors)
    for (i, (month, dat)) in enumerate(feature.groupby('month')):
        new_feature.loc[i, ['month']] = month
        new_feature.loc[i, pca_factors] = dat[pca_factors].apply(np.mean)
    
    new_Object = pd.DataFrame(columns=['month','dailyReturnReinv'])
    for (i, (month, dat)) in enumerate(Object.groupby('month')):
        new_Object.loc[i, ['month']] = month
        new_Object.loc[i, ['dailyReturnReinv']] = dat[['dailyReturnReinv']].apply(np.mean)
    
    new_data = pd.merge(new_feature, new_Object, how='outer', on=['month'])
    new_data.dropna(inplace = True)
    new_data.reset_index(drop=True,inplace=True)
    new_data.loc[new_data.shape[0]-1, new_data.columns[-1]] = math.nan
    
    return new_data

## input: pd.Dataframe
## output: pd.Dataframe (with principle components)
def get_pca_data(data, describe_ratio = 0.99):
    
    df = data[data.columns[1:-1]]
    df = df.values
    ## normalize principle components
    df = normalize(df, norm='l2', axis=1, copy=True, return_norm=False)
    pca = PCA(copy = True, n_components = describe_ratio, svd_solver = 'full')
    
    pca.fit_transform(df)
    low_d = pca.transform(df)
    
    low_d = normalize(low_d, norm='l2', axis=1, copy=True, return_norm=False)

    pr = pd.DataFrame(low_d, dtype='float64')
    pr.columns = ['pr_{}'.format(i) for i in range(len(pr.columns)) ]
    ## get data with only principle components
    data = pd.concat([data[data.columns[0]], pr, data[data.columns[-1]]], axis = 1)

    return data

## input: pd.Dataframe (with last row unpredicted)
## output: prediction of last row
def get_regression_model(data):

    data, sample = data.iloc[:-1,], data.iloc[-1,]
    data = data.values[:,1:]
    
    sample[0] = 1.0; sample = sample[:-1]
    sample = np.array(sample, dtype='float64')
    X, Y = data[:,:-1], data[:,-1]
    X, Y = np.array(X, dtype='float64', copy=True), np.array(Y, dtype='float64')
    X = sm.add_constant(X)
    model = sm.OLS(Y,X)
    results = model.fit()
#     print (results.params)  print (results.summary())
    return results.predict(sample)[0]

if __name__ == '__main__':
    
    selection = {}
    for file_name in get_files():
        try:
            data = get_data(file_name)
            pca_data = get_pca_data(data)
            date = pca_data.iloc[-1,0]
            score = get_regression_model(pca_data)
            name = file_name[0].split('/')[-1].split('_')[0]
            print ('{} {} {}'.format(date, name, score))
            selection[(date,name)] = score
        except Exception as err:
            print("Error {}".format(err))
    
    sorted_selection = sorted(selection.items(), key=lambda x: x[1], reverse=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


2018-07 600331 0.012666025571639336
2018-07 600406 0.002555455841212527


  explained_variance_ = (S ** 2) / (n_samples - 1)


Error zero-size array to reduction operation maximum which has no identity
2018-07 000898 0.005079452538549065
2018-07 000550 0.00010222200021284072
2018-07 600215 0.00727521125687962
2018-07 000601 0.001118757064687332
2018-07 600021 0.003488520733737622
2018-07 000036 -0.00211034740029469
2018-07 600688 -0.001534270723445791
2018-07 600220 0.019671508887693524
2018-07 600271 0.004236772962410087
2017-08 000422 -0.00344597959754669
2018-07 000630 -0.0016299582561855315
2018-07 600597 0.00100500321527705
2018-07 600675 -0.006400168858010747
2018-07 600740 -0.002101656161961829
2018-07 600171 -0.0005008357778660199
Error Input contains NaN, infinity or a value too large for dtype('float64').
2018-07 600256 0.014003136842825548
2012-01 000652 -0.023494686803548718
2018-07 000778 0.002135028619318923
2016-03 000932 0.00681196456888184
2018-07 600320 0.011220445419484762
2018-07 600026 0.001644167545165348
Error zero-size array to reduction operation maximum which has no identity
2018-07 6

2018-04 600143 -0.005313951561604684
2018-04 600057 0.001185448721921193
Error Input contains NaN, infinity or a value too large for dtype('float64').
Error Input contains NaN, infinity or a value too large for dtype('float64').
2018-04 000599 0.002798473436255908
2018-07 000930 0.004049619047619049
2018-07 600639 -8.207237369388669e-05
Error zero-size array to reduction operation maximum which has no identity
2018-07 600820 0.0005636660032945402
2018-07 600600 0.002010226748081036
2018-07 600085 9.436292665116256e-05
2017-04 600100 0.004691314392909564
2018-07 000488 -0.0017857603295561492
2017-04 600096 0.0038168185670544295
2018-07 600637 -0.0012046873751285461
2018-07 600299 -0.0011659981051249171
Error Input contains NaN, infinity or a value too large for dtype('float64').
2018-07 000786 0.0013198454964635834
2018-07 000708 0.0002451483428572809
2018-04 600797 0.0003760371286419023
2018-07 000822 -0.00244430698557386
2018-07 600970 0.0010407325635487276
2016-04 600121 0.0055335973

In [1058]:
for x in sorted_selection:
    print (x)

(('2009-10', '000562'), 0.028299153338554244)
(('2018-07', '600220'), 0.019671508887693524)
(('2018-07', '600256'), 0.014003136842825548)
(('2015-04', '600832'), 0.01391103749951893)
(('2018-07', '600110'), 0.013252909188291771)
(('2018-07', '600331'), 0.012666025571639336)
(('2018-07', '000767'), 0.012060779704017165)
(('2018-07', '600602'), 0.012025665105928455)
(('2018-07', '000636'), 0.011593755206494059)
(('2018-07', '600078'), 0.011394676575987818)
(('2018-07', '600320'), 0.011220445419484762)
(('2015-04', '000024'), 0.010531145632561181)
(('2013-05', '600088'), 0.010280012406346442)
(('2015-01', '600601'), 0.009803330608571609)
(('2018-07', '000400'), 0.00965217297965138)
(('2018-07', '600569'), 0.009580531165035941)
(('2018-07', '600739'), 0.009517717966205402)
(('2018-07', '000607'), 0.009333449346362847)
(('2018-07', '600282'), 0.008839015505083491)
(('2018-07', '000725'), 0.008379377729349862)
(('2018-07', '000528'), 0.008309876718968613)
(('2014-10', '600884'), 0.0081269567

In [1020]:
print(sorted_selection)

[('000562', 0.04783927012543639),
 ('600012', 0.025002701312588447),
 ('600088', 0.021196853023769686),
 ('000539', 0.019889420385489368),
 ('000617', 0.019654301199672253),
 ('600832', 0.01880939951591364),
 ('600739', 0.01817124719142578),
 ('600256', 0.01559546581309939),
 ('000962', 0.014567123546083733),
 ('600132', 0.014341171072755704),
 ('600500', 0.01314580109008682),
 ('600270', 0.01309394600019777),
 ('600020', 0.012721591268809588),
 ('000726', 0.012572862095829163),
 ('000968', 0.01195243999423871),
 ('000400', 0.011835234357805546),
 ('000636', 0.011714175199303184),
 ('000898', 0.011378352832291997),
 ('600410', 0.011362306303562438),
 ('000021', 0.011076730863379663),
 ('000937', 0.011000567364337467),
 ('600549', 0.010880940614604945),
 ('600662', 0.010751231448317359),
 ('600498', 0.010665317949261933),
 ('000920', 0.010132978266991326),
 ('600058', 0.009975756316734988),
 ('600104', 0.00983130259917925),
 ('000009', 0.009608593208964353),
 ('000012', 0.00947220370065