In [178]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import pandas as pd
import numpy as np

In [179]:
MAX_LOOKBACK = 50
##select = RFE(RandomForestRegressor(n_estimators=100, random_state=42),
                ## n_features_to_select=10)

In [180]:
    def read_stock_data():
        '''
        Description:
            Reads in simulated stock data from stock_data.csv
        Returns:
            stock_df (DataFrame): standardized ticker/factor data in pandas df
        Raises:
            AssertionError: ticker_data.csv/factor_data.csv has an invalid format
        '''
        ticker_df = pd.read_csv('stock_data/ticker_data.csv')
        factor_df = pd.read_csv('stock_data/factor_data.csv')
        assert 'timestep' in ticker_df.columns, "ticker_data.csv has an invalid format"
        assert 'ticker' in ticker_df.columns, "ticker_data.csv has an invalid format"
        assert 'returns' in ticker_df.columns, "ticker_data.csv has an invalid format"
        assert 'timestep' in factor_df.columns, "factor_data.csv has an invalid format"
        ticker_df.set_index('timestep', inplace=True)
        factor_df.set_index('timestep', inplace=True)
        stock_df = ticker_df.join(factor_df, how='left')
        return stock_df

In [181]:
stock_df=read_stock_data();

In [182]:
    def build_signal(stock_features):
        enc = LabelBinarizer()
        enc_results = enc.fit_transform(stock_features['industry'])
        
        industry_binary = pd.DataFrame(enc_results, columns=enc.classes_)
        stock_features.drop('industry', axis=1, inplace=True)
        feature_binary = stock_features.join(industry_binary, how='left')
        feature_binary.fillna(0, inplace=True)
        
        returns = feature_binary.set_index('ticker')['returns']
        
        X_train, X_test, y_train, y_test = train_test_split(feature_binary, returns, random_state = 0)
        
        rf = RandomForestRegressor().fit(X_train, y_train)
        
        print("Score on training set for Unscaled RandomForestRegressor:", rf.score(X_train, y_train))
        print("Score on test set for Unscaled RandomForestRegressor:", rf.score(X_test, y_test))
        print("Importance of features:", rf.feature_importances_)
        #feature_importance = pd.DataFrame(rf.feature_importances_,
                                          #index = X_train.columns,
                                          #columns=['importance']).sort_values('importance', ascending=False)
        #return feature_importance.head()

In [183]:
    def simulate_portfolio():
        '''
        Description:
            Simulates performance of the portfolio on historical data
        Return:
            sharpe (int) - sharpe ratio for the portfolio
        '''
        daily_returns = []
        stock_df = read_stock_data()
        for idx in stock_df.index.unique():
            #print("timestep", idx)
            if idx < MAX_LOOKBACK:
                continue
            stock_features = stock_df.loc[idx-MAX_LOOKBACK:idx-1]
            returns = stock_df.loc[idx:idx].set_index('ticker')['returns']
            
            signal = build_signal(stock_features)
            #signal_return = returns * signal
            #daily_returns.append(np.mean(signal_return))
        #sharpe_ratio = np.sqrt(252) * (np.mean(daily_returns) / np.std(daily_returns))
        return signal

In [184]:
simulate_portfolio()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Score on training set for Unscaled RandomForestRegressor: 0.995649447363
Score on test set for Unscaled RandomForestRegressor: 0.995795827895
Importance of features: [  0.00000000e+00   3.72269357e-07   3.00260026e-06   9.99988342e-01
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   5.97802505e-06   3.45040300e-07   4.61762924e-07
   2.04839721e-07   6.41209769e-07   6.52170587e-07   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Score on training set for Unscaled RandomForestRegressor: 0.996120108037
Score on test set for Unscaled RandomForestRegressor: 0.996104222184
Importance of features: [  0.00000000e+00   0.00000000e+00   0.00000000e+00   9.99990663e-01
   0.00000000e+00   7.88420798e-07   0.00000000e+00   0.00000000e+00
   0.00000000e+00   4.02377570e-06   4.52431375e-06   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Score on training set for Unscaled RandomForestRegressor: 0.99578645938
Score on test set for Unscaled RandomForestRegressor: 0.995846885762
Importance of features: [  4.47843422e-07   3.66677447e-07   0.00000000e+00   9.99996801e-01
   0.00000000e+00   0.00000000e+00   5.31169918e-07   2.00279450e-07
   0.00000000e+00   6.24151869e-07   6.30020763e-07   3.98934709e-07
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Score on training set for Unscaled RandomForestRegressor: 0.995864065932
Score on test set for Unscaled RandomForestRegressor: 0.995725757385
Importance of features: [  0.00000000e+00   2.36416926e-07   0.00000000e+00   9.99997592e-01
   1.66035974e-07   5.00702991e-07   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   2.89729623e-07   3.22864355e-07
   0.00000000e+00   4.09157424e-07   4.82661824e-07   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Score on training set for Unscaled RandomForestRegressor: 0.995781737698
Score on test set for Unscaled RandomForestRegressor: 0.99569560356
Importance of features: [  2.59877727e-07   0.00000000e+00   0.00000000e+00   9.99997664e-01
   0.00000000e+00   3.46278364e-07   1.72106003e-07   3.50045311e-07
   0.00000000e+00   1.67073146e-07   0.00000000e+00   1.04109704e-06
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Score on training set for Unscaled RandomForestRegressor: 0.995766776617
Score on test set for Unscaled RandomForestRegressor: 0.995635311157
Importance of features: [  1.43363410e-06   0.00000000e+00   0.00000000e+00   9.99998337e-01
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   2.28895272e-07   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Score on training set for Unscaled RandomForestRegressor: 0.995937611988
Score on test set for Unscaled RandomForestRegressor: 0.995936073521
Importance of features: [  0.00000000e+00   0.00000000e+00   0.00000000e+00   9.99993025e-01
   0.00000000e+00   3.30663177e-06   0.00000000e+00   4.53277674e-07
   0.00000000e+00   1.06511974e-06   1.30157646e-06   8.48841671e-07
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


KeyboardInterrupt: 