In [2]:
"""
This is a template for the API of models to be used in the stacking framework.
"""
from time import time, ctime
from sklearn.linear_model import LinearRegression
import pandas as pd

class model_example():
    """Template model exposing the API used by the stacking framework.

    An instance wraps one model (which can also be a combination of
    bagged models). The commonality of the bagged models is that they
    share the feature generation done in ``_generate_features``.

    Attributes (all set in ``__init__``):
        name:  label used in the log messages
        model: the fitted estimator; ``None`` until ``train`` has run
        type:  the estimator class (``LinearRegression``)
    """

    def __init__(self, name):
        """Store the model name and start in the untrained state."""
        self.name  = name
        self.model = None
        self.type  = LinearRegression
        print("\ninit model {}".format(self.name))

    def _check_is_trained(self):
        """Raise ``RuntimeError`` when ``train`` has not populated ``self.model``."""
        if self.model is None:
            # A plain string cannot be raised in Python 3 (it triggers a
            # TypeError instead), so a real exception type is required here.
            raise RuntimeError("Error: model is not trained!")

    def _generate_features(self, market_data, news_data, verbose=False):
        """
        Given the original market_data and news_data, generate new
        features without changing the original data.
        NOTE: data cleaning and preprocessing are not done here,
        this is feature engineering only.

        Args:
            market_data: pandas.DataFrame; must contain the columns
                'open', 'close', 'time', 'assetCode' and 'assetName'
            news_data: pandas.DataFrame (currently unused)
            verbose: (bool) print timing information
        Returns:
            complete_features: pandas.DataFrame
        """
        start_time = time()
        if verbose: print("Starting features generation for model {}, {}".format(self.name, ctime()))

        # Work on a copy so the caller's frame is never modified.
        complete_features = market_data.copy()
        complete_features['open+close'] = complete_features['open'] + complete_features['close']
        # Drop the identifier columns, then replace missing values with 0.
        complete_features = (
            complete_features
            .drop(columns=['time', 'assetCode', 'assetName'])
            .fillna(0)
        )

        if verbose: print("Finished features generation for model {}, TIME {}".format(self.name, time()-start_time))
        return complete_features

    def train(self, X, Y, verbose=False):
        """
        Basic method to train a model with the given data.
        The fitted model is stored in self.model after training.

        Args:
            X: [market_train_df, news_train_df]
            Y: [target]
            verbose: (bool)
        Returns:
            None
        """
        start_time = time()
        if verbose: print("Starting training for model {}, {}".format(self.name, ctime()))

        X_train = self._generate_features(X[0], X[1], verbose=verbose)
        if verbose: print("X_train shape {}".format(X_train.shape))
        self.model = LinearRegression()
        self.model.fit(X_train, Y)
        del X_train

        if verbose: print("Finished training for model {}, TIME {}".format(self.name, time()-start_time))


    def predict(self, X, verbose=False):
        """
        Given a block of X features, give a prediction for every row.

        Args:
            X: [market_train_df, news_train_df]
            verbose: (bool)
        Returns:
            y: the output of self.model.predict on the generated features
        Raises:
            RuntimeError: if the model has not been trained
        """
        start_time = time()
        if verbose: print("Starting prediction for model {}, {}".format(self.name, ctime()))
        self._check_is_trained()

        X_test = self._generate_features(X[0], X[1], verbose=verbose)
        if verbose: print("X_test shape {}".format(X_test.shape))
        y_test = self.model.predict(X_test)

        if verbose: print("Finished prediction for model {}, TIME {}".format(self.name, time()-start_time))
        return y_test


    def predict_rolling(self, historical_df, prediction_length, verbose=False):
        """
        Predict on the most recent rows, using the whole history for
        (lagged) feature generation. To be used with the rolling
        prediction structure from the competition.

        Args:
            historical_df: [market_train_df, news_train_df]
            prediction_length: features are generated on all of
                historical_df, predictions are made only for the last
                prediction_length rows; must be >= 1
            verbose: (bool)
        Returns:
            y: the output of self.model.predict on the selected rows
        Raises:
            RuntimeError: if the model has not been trained
            ValueError: if prediction_length is smaller than 1
        """
        start_time = time()
        if verbose: print("Starting rolled prediction for model {}, {}".format(self.name, ctime()))
        self._check_is_trained()
        if prediction_length < 1:
            # iloc[-0:] is iloc[0:], i.e. ALL rows, so 0 must be rejected
            # explicitly rather than silently predicting on the full history.
            raise ValueError("prediction_length must be >= 1, got {}".format(prediction_length))

        processed_historical_df = self._generate_features(historical_df[0], historical_df[1], verbose=verbose)
        X_test = processed_historical_df.iloc[-prediction_length:]
        if verbose: print("X_test shape {}".format(X_test.shape))
        y_test = self.model.predict(X_test)

        if verbose: print("Finished rolled prediction for model {}, TIME {}".format(self.name, time()-start_time))
        return y_test

In [3]:
# Load the market-data sample and discard the 'Unnamed: 0' column
# (presumably an index written out when the CSV was saved — confirm).
df = pd.read_csv("data/market_train_df_head.csv").drop(columns='Unnamed: 0')

In [16]:
# Rolling maximum of 'open' over a window of up to 10 rows; min_periods=1
# means a value is produced from the very first row onwards.
df['open'].rolling(window=10, min_periods=1).max()

0      32.17
1      32.17
2      37.99
3      86.23
4      86.23
5      86.23
6      86.23
7      86.23
8      86.23
9      86.23
10     86.23
11     86.23
12     86.23
13    117.77
14    117.77
15    117.77
16    117.77
17    117.77
18    117.77
19    117.77
20    117.77
21    117.77
22    117.77
23     68.00
24     68.00
25     68.00
26     68.00
27     68.00
28     68.00
29     68.00
       ...  
70     60.40
71     60.40
72     60.40
73     60.40
74     60.40
75    114.00
76    114.00
77    114.00
78    114.00
79    114.00
80    114.00
81    114.00
82    114.00
83    114.00
84    114.00
85     70.19
86     58.96
87     58.96
88     79.62
89     79.62
90     79.62
91     79.62
92     79.62
93     79.62
94     79.62
95     79.62
96     79.62
97     79.62
98     74.62
99     74.62
Name: open, Length: 100, dtype: float64

In [12]:
# Show 'open' next to the lagged rolling-max feature.
# NOTE(review): 'lag_10_open_max' is not created in any cell visible here, and
# this cell's execution count (12) precedes the previous cell's (16) — confirm
# the notebook survives Restart Kernel -> Run All.
df.loc[:, ['open', 'lag_10_open_max']]

Unnamed: 0,open,lag_10_open_max
0,32.17,
1,11.08,
2,37.99,
3,86.23,
4,18.01,
5,52.40,
6,24.13,
7,25.90,
8,52.50,
9,52.10,86.23
