In [2]:
"""
This is a template for the API of models to be used in the stacking framework.
"""
from time import time, ctime
from sklearn.linear_model import LinearRegression
import pandas as pd

class model_example():
    """Template model exposing the API used by the stacking framework.

    An instance wraps one model (which can also be a combination of
    bagged models). The commonality of the bagged models is that they
    share the feature generation done in ``_generate_features``.

    Instance attributes (set in ``__init__``):
        name:  label used in the log messages.
        model: fitted estimator; ``None`` until ``train`` has been called.
        type:  estimator class, ``LinearRegression``.
    """

    def __init__(self, name):
        """Store the model name and start in the untrained state.

        Args:
            name: label used in the log messages
        """
        self.name  = name
        self.model = None
        self.type  = LinearRegression
        print("\ninit model {}".format(self.name))

    def _check_is_trained(self):
        """Raise RuntimeError when ``train`` has not populated ``self.model``."""
        if self.model is None:
            # A bare string cannot be raised (that is itself a TypeError),
            # so the message is wrapped in a real exception class.
            raise RuntimeError("Error: model is not trained!")

    def _generate_features(self, market_data, news_data, verbose=False):
        """
        Given the original market_data and news_data, generate new
        features without changing the original data.
        NOTE: data cleaning and preprocessing is not here,
        here is only feature engineering.

        Args:
            market_data: pandas.DataFrame, must contain the columns
                'open', 'close', 'time', 'assetCode' and 'assetName'
            news_data: pandas.DataFrame, accepted for API symmetry but
                not used by this template
            verbose: (bool) print start/finish messages with timing
        Returns:
            complete_features: pandas.DataFrame with an added
                'open+close' column, without 'time', 'assetCode' and
                'assetName', and with missing values replaced by 0
        Raises:
            KeyError: if one of the required columns is missing
        """
        start_time = time()
        if verbose:
            print("Starting features generation for model {}, {}".format(self.name, ctime()))

        # Work on a copy so the caller's frame is never modified.
        complete_features = market_data.copy()
        complete_features['open+close'] = complete_features['open'] + complete_features['close']
        # Non-inplace calls: each step returns a new frame.
        complete_features = complete_features.drop(['time', 'assetCode', 'assetName'], axis=1)
        complete_features = complete_features.fillna(0)

        if verbose:
            print("Finished features generation for model {}, TIME {}".format(self.name, time() - start_time))
        return complete_features

    def train(self, X, Y, verbose=False):
        """
        Basic method to train a model with given data;
        the fitted model is stored in self.model after training.

        Args:
            X: [market_train_df, news_train_df]
            Y: [target]
            verbose: (bool) print progress messages, also for the
                feature generation step
        Returns:
            None
        """
        start_time = time()
        if verbose:
            print("Starting training for model {}, {}".format(self.name, ctime()))

        X_train = self._generate_features(X[0], X[1], verbose=verbose)
        if verbose:
            print("X_train shape {}".format(X_train.shape))
        self.model = LinearRegression()
        self.model.fit(X_train, Y)

        if verbose:
            print("Finished training for model {}, TIME {}".format(self.name, time() - start_time))

    def predict(self, X, verbose=False):
        """
        Given a block of X features, give a prediction for every row.

        Args:
            X: [market_train_df, news_train_df]
            verbose: (bool) print progress messages, also for the
                feature generation step
        Returns:
            y: whatever self.model.predict returns for the generated
                features (one prediction per row)
        Raises:
            RuntimeError: if the model has not been trained
        """
        start_time = time()
        if verbose:
            print("Starting prediction for model {}, {}".format(self.name, ctime()))
        self._check_is_trained()

        X_test = self._generate_features(X[0], X[1], verbose=verbose)
        if verbose:
            print("X_test shape {}".format(X_test.shape))
        y_test = self.model.predict(X_test)

        if verbose:
            print("Finished prediction for model {}, TIME {}".format(self.name, time() - start_time))
        return y_test

    def predict_rolling(self, historical_df, prediction_length, verbose=False):
        """
        Predict on the most recent rows of historical_df; the whole
        history is used for (lagged) feature generation. To be used with
        the rolling prediction structure from the competition.

        Args:
            historical_df: [market_train_df, news_train_df]
            prediction_length: (int >= 0) number of trailing rows to
                predict on; features are generated on all of
                historical_df. Values larger than the history select
                every row; 0 selects no rows, and the empty frame is
                passed to self.model.predict as-is.
            verbose: (bool) print progress messages, also for the
                feature generation step
        Returns:
            y: whatever self.model.predict returns for the selected rows
        Raises:
            RuntimeError: if the model has not been trained
            ValueError: if prediction_length is negative
        """
        start_time = time()
        if verbose:
            print("Starting rolled prediction for model {}, {}".format(self.name, ctime()))
        self._check_is_trained()
        if prediction_length < 0:
            raise ValueError(
                "prediction_length must be >= 0, got {}".format(prediction_length))

        processed_historical_df = self._generate_features(
            historical_df[0], historical_df[1], verbose=verbose)
        # Slice from an explicit start position: iloc[-n:] with n == 0
        # is iloc[0:], which would select every row instead of none.
        first_row = max(len(processed_historical_df) - prediction_length, 0)
        X_test = processed_historical_df.iloc[first_row:]
        if verbose:
            print("X_test shape {}".format(X_test.shape))
        y_test = self.model.predict(X_test)

        if verbose:
            print("Finished rolled prediction for model {}, TIME {}".format(self.name, time() - start_time))
        return y_test

In [18]:
df

Unnamed: 0,time,assetCode,assetName,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,returnsClosePrevRaw10,returnsOpenPrevRaw10,returnsClosePrevMktres10,returnsOpenPrevMktres10,returnsOpenNextMktres10,universe,lag_10_open_max
0,2007-02-01 22:00:00+00:00,A.N,Agilent Technologies Inc,2606900.0,32.19,32.17,0.005938,0.005312,,,-0.001860,0.000622,,,0.034672,1.0,
1,2007-02-01 22:00:00+00:00,AAI.N,AirTran Holdings Inc,2051600.0,11.12,11.08,0.004517,-0.007168,,,-0.078708,-0.088066,,,0.027803,0.0,
2,2007-02-01 22:00:00+00:00,AAP.N,Advance Auto Parts Inc,1164800.0,37.51,37.99,-0.011594,0.025648,,,0.014332,0.045405,,,0.024433,1.0,
3,2007-02-01 22:00:00+00:00,AAPL.O,Apple Inc,23747329.0,84.74,86.23,-0.011548,0.016324,,,-0.048613,-0.037182,,,-0.007425,1.0,
4,2007-02-01 22:00:00+00:00,ABB.N,ABB Ltd,1208600.0,18.02,18.01,0.011791,0.025043,,,0.012929,0.020397,,,-0.017994,1.0,
5,2007-02-01 22:00:00+00:00,ABC.N,AmerisourceBergen Corp,1657300.0,52.37,52.40,-0.000191,0.008468,,,0.089000,0.077746,,,0.058680,1.0,
6,2007-02-01 22:00:00+00:00,ABD.N,ACCO Brands Corp,1186200.0,23.63,24.13,-0.020721,-0.007404,,,0.005104,0.026809,,,-0.044285,0.0,
7,2007-02-01 22:00:00+00:00,ABM.N,ABM Industries Inc,301200.0,26.19,25.90,0.013545,0.014890,,,0.068980,0.047311,,,0.016578,0.0,
8,2007-02-01 22:00:00+00:00,ABT.N,Abbott Laboratories,5692300.0,52.87,52.50,-0.002453,-0.004739,,,0.001515,-0.004928,,,0.009861,1.0,
9,2007-02-01 22:00:00+00:00,ABV.N,Companhia de Bebidas das Americas Ambev,401800.0,52.46,52.10,0.014112,0.024784,,,0.042321,0.027411,,,0.012917,1.0,86.23


In [19]:
locals()

{'In': ['',
  u'df=pd.read_csv("data/market_train_df_head.csv").drop(\'Unnamed: 0\', axis=1)',
  u'"""\nThis is a template for the APIs of models to be used into the stacking framework.\n"""\nfrom time import time, ctime\nfrom sklearn.linear_model import LinearRegression\nimport pandas as pd\n\nclass model_example():\n    """base class for the model\n\n    this class is for a model (that can also be\n    a combination of bagged models)\n    The commonality of the bagged models is that\n    they share the feature generation\n    """\n\n    def __init__(self, name):\n        self.name  = name\n        self.model = None\n        self.type  = LinearRegression\n        print("\\ninit model {}".format(self.name))\n\n    def _generate_features(self, market_data, news_data, verbose=False):\n        """\n        given the original market_data and news_data\n        generate new features, doesn\'t change original data.\n        NOTE: data cleaning and preprocessing is not here,\n        here is 