In [46]:
"""
This is a template for the APIs of models to be used into the stacking framework.
run with Python 3.x
"""
from time import time, ctime
import lightgbm as lgb
import pandas as pd
from datetime import datetime


def sigma_score(preds, valid_data):
    """
    Custom evaluation metric intended for training model_lgbm_baseline.

    The per-row products ``prediction * label`` are summed within each time
    group and the score is ``mean(sums) / std(sums)`` over those groups.

    Args:
        preds: sequence of predictions, one per row of ``valid_data``.
        valid_data: object exposing ``get_label()`` and a ``params`` dict that
            contains an ``'extra_time'`` entry (one group key per row). The
            entry is not set here; it has to be injected by the caller.
    Returns:
        tuple ``('sigma_score', score, True)`` where the last element marks the
        metric as "higher is better". ``score`` is 0.0 when the spread of the
        per-group sums is zero or undefined (e.g. a single time group).
    """
    df_time = valid_data.params['extra_time']  # injected by the caller after Dataset creation
    labels = valid_data.get_label()

    # The 'universe' term is left out of the product because the caller already
    # keeps only the rows whose universe flag equals 1.
    # Work on bare positional values: `get_label()` may hand back either a
    # NumPy array or a pandas Series, and only the row order matters here.
    x_t = pd.Series(pd.Series(preds).values * pd.Series(labels).values)

    # One sum per time group; the group keys are matched to rows by position.
    x_t_sum = x_t.groupby(pd.Series(df_time).values).sum()

    spread = x_t_sum.std()
    if pd.isnull(spread) or spread == 0:
        # A NaN/inf metric would poison early stopping; report a neutral score.
        return 'sigma_score', 0.0, True

    return 'sigma_score', float(x_t_sum.mean() / spread), True

class model_lgbm_baseline():
    """Baseline LightGBM model built on simple engineered market features.

    The class wraps one model (which could also be a combination of bagged
    models). What bagged models would have in common is the shared feature
    generation implemented in `_generate_features`.

    Attributes:
        name: label used in log messages.
        model: the trained booster, or None until `train` has been called.
        type: the class of the wrapped model (lgb.Booster).
        max_lag: longest rolling window (in rows) used by the features.
    """

    def __init__(self, name):
        self.name  = name
        self.model = None
        self.type  = lgb.Booster
        # Defined up front so the attribute exists before any features are generated.
        self.max_lag = 200
        print("\ninit model {}".format(self.name))

    def _check_is_trained(self):
        """Raise RuntimeError when no trained model is available yet."""
        if self.model is None:
            raise RuntimeError("Error: model is not trained!")

    def _generate_features(self, market_data, news_data, verbose=False):
        """
        GENERAL:
        given the original market_data and news_data
        generate new features, doesn't change original data.
        NOTE: data cleaning and preprocessing is not here,
        here is only feats engineering

        MODEL SPECIFIC:
        as a baseline for decision trees model we add
        features that are the most popular among public
        kernels on Kaggle:

        - [36] short-term lagged features on returns
        - [6]  long-term moving averages
        - [1]  day of the week

        Args:
            market_data: pandas.DataFrame with at least the columns
                'time', 'assetCode', 'assetName', 'open', 'close' and the four
                '...Prev...10' return columns used below.
            news_data: pandas.DataFrame, currently unused by this model.
            verbose: (bool) print timing information.
        Returns:
            complete_features: pandas.DataFrame (a modified copy of market_data
            without 'time', 'assetCode' and 'assetName', NaNs replaced by 0)
        """
        start_time = time()
        if verbose: print("Starting features generation for model {}, {}".format(self.name, ctime()))

        complete_features = market_data.copy()

        # NOTE(review): the rolling windows below run over consecutive rows of
        # the whole frame, not per assetCode, so a window can mix rows of
        # different assets -- confirm this is intended.

        # [36] short-term lagged features on returns
        for feature in ['returnsClosePrevRaw10','returnsOpenPrevRaw10','returnsClosePrevMktres10','returnsOpenPrevMktres10']:
            for lag in [3,7,14]:
                window = complete_features[feature].rolling(lag, min_periods=1)
                complete_features['lag_{}_{}_max'.format(lag, feature)]  = window.max()
                complete_features['lag_{}_{}_min'.format(lag, feature)]  = window.min()
                complete_features['lag_{}_{}_mean'.format(lag, feature)] = window.mean()

        # [6]  long-term moving averages
        for feature in ['open','close']:
            for lag in [50, 100, 200]:
                complete_features['lag_{}_{}_mean'.format(lag, feature)]  = complete_features[feature].rolling(lag, min_periods=1).mean()

        self.max_lag = 200

        # [1]  day of the week (Monday == 0 ... Sunday == 6)
        # The previous "%Y-%M-%d" format read the month digits as minutes, so
        # every date was evaluated in January. pd.to_datetime parses the real
        # date and accepts both string and datetime columns; timestamps are
        # normalised to UTC before the weekday is taken.
        complete_features['weekday'] = pd.to_datetime(complete_features['time'], utc=True).dt.weekday

        # NOTE(review): only these three columns are removed; any other column
        # of market_data (e.g. 'universe', or a target column if present) stays
        # in the feature matrix -- verify against the callers.
        complete_features.drop(['time','assetCode','assetName'],axis=1,inplace=True)
        complete_features.fillna(0, inplace=True) # TODO: for next models control this fillna with EDA

        if verbose: print("Finished features generation for model {}, TIME {}".format(self.name, time()-start_time))
        return complete_features

    def train(self, X, Y, verbose=False):
        """
        GENERAL:
        basic method to train a model with given data
        model will be inside self.model after training

        MODEL SPECIFIC:

        - split 0.8 train validation (by row order, no shuffling)
        - universe filter on validation
        - the custom metric (sigma_score) is prepared for (the time groups are
            attached to both datasets) but the `feval` hook is currently
            commented out; enabling it would need 'metric':'None' in parameters
        - one single lgbm with params_1 from script 67

        Args:
            X: [market_train_df, news_train_df]; the market frame must contain
                the 'time' and 'universe' columns.
            Y: [target]
            verbose: (bool)
        Returns:
            (optional) training_results: dict filled by lgb.train with the
            evaluation history of the 'valid' and 'train' sets.
        """
        start_time = time()
        if verbose: print("Starting training for model {}, {}".format(self.name, ctime()))

        time_reference = X[0]['time'] #time is dropped in preprocessing, but is needed later for metrics eval

        X = self._generate_features(X[0], X[1], verbose=verbose)

        # split X in X_train and X_val
        split = int(len(X) * 0.8)
        train_val_gap = 0  # number of rows left out between the end of train and the start of validation
        X_train, X_val = X[:split - train_val_gap], X[split:]
        Y_train, Y_val = Y[:split - train_val_gap], Y[split:]

        if verbose: print("X_train shape {}".format(X_train.shape))
        if verbose: print("X_val shape {}".format(X_val.shape))

        # universe filtering on validation set
        universe_filter = X['universe'][split:] == 1.0
        X_val = X_val[universe_filter]
        Y_val = Y_val[universe_filter]

        # this is a time_val series used to calc the sigma_score later, applied split and universe filter
        time_val = time_reference[split:][universe_filter]
        assert len(time_val) == len(X_val)
        time_train = time_reference[:split - train_val_gap]
        assert len(time_train) == len(X_train)

        # train parameters preparation
        train_cols = X.columns.tolist()
        lgb_train = lgb.Dataset(X_train.values, Y_train, feature_name=train_cols, free_raw_data=False)
        lgb_val = lgb.Dataset(X_val.values, Y_val, feature_name=train_cols, free_raw_data=False)

        # Integer time-group codes, read back by sigma_score through `params['extra_time']`.
        lgb_train.params = {
            'extra_time' : time_train.factorize()[0]
        }
        lgb_val.params = {
            'extra_time' : time_val.factorize()[0]
        }

        x_1 = [0.19000424246380565, 2452, 212, 328, 202]
        params_1 = {
            # from script 67
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'binary',
    #         'objective': 'regression',
            'learning_rate': x_1[0],
            'num_leaves': x_1[1],
            'min_data_in_leaf': x_1[2],
    #         'num_iteration': x_1[3],
            # NOTE(review): presumably this overrides num_boost_round=1000
            # passed to lgb.train below -- confirm against the LightGBM docs.
            'num_iteration': 239,
            'max_bin': x_1[4],
            'verbose': 1
        }

        # start training
        training_results = {}
        self.model = lgb.train(
                params_1,
                lgb_train,
                num_boost_round=1000,
                valid_sets=(lgb_val,lgb_train),
                valid_names=('valid','train'),
                verbose_eval=25,
                early_stopping_rounds=100,
                #feval=sigma_score,
                evals_result=training_results)
        del X, X_train, X_val

        if verbose: print("Finished training for model {}, TIME {}".format(self.name, time()-start_time))
        return training_results


    def predict(self, X, verbose=False):
        """
        given a block of X features gives prediction for every row

        Args:
            X: [market_train_df, news_train_df]
            verbose: (bool)
        Returns:
            y: the output of self.model.predict on the generated features,
            one prediction per row of the market frame
        Raises:
            RuntimeError: if the model has not been trained yet
        """
        start_time = time()
        if verbose: print("Starting prediction for model {}, {}".format(self.name, ctime()))
        self._check_is_trained()

        X_test = self._generate_features(X[0], X[1], verbose=verbose)
        if verbose: print("X_test shape {}".format(X_test.shape))
        y_test = self.model.predict(X_test)

        if verbose: print("Finished prediction for model {}, TIME {}".format(self.name, time()-start_time))
        return y_test


    def predict_rolling(self, historical_df, prediction_length, verbose=False):
        """
        predict features from X, uses historical for (lagged) feature generation
        to be used with rolling prediction structure from competition

        Args:
            historical_df: [market_train_df, news_train_df]
            prediction_length: generate features on historical_df, predict only
                on the last `prediction_length` rows. Must be positive: a value
                of 0 selects every row, because the slice `[-0:]` is `[0:]`.
            verbose: (bool)
        Returns:
            the output of self.model.predict for the selected rows
        Raises:
            RuntimeError: if the model has not been trained yet
        """
        start_time = time()
        if verbose: print("Starting rolled prediction for model {}, {}".format(self.name, ctime()))
        self._check_is_trained()

        processed_historical_df = self._generate_features(historical_df[0], historical_df[1], verbose=verbose)
        X_test = processed_historical_df.iloc[-prediction_length:]
        if verbose: print("X_test shape {}".format(X_test.shape))
        y_test = self.model.predict(X_test)

        if verbose: print("Finished rolled prediction for model {}, TIME {}".format(self.name, time()-start_time))
        return y_test

In [2]:
# Load the market-data sample and discard the 'Unnamed: 0' column
# (presumably an index that was written out together with the CSV).
df = pd.read_csv("data/market_train_df_head.csv").drop(columns=['Unnamed: 0'])

In [3]:
df

Unnamed: 0,time,assetCode,assetName,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,returnsClosePrevRaw10,returnsOpenPrevRaw10,returnsClosePrevMktres10,returnsOpenPrevMktres10,returnsOpenNextMktres10,universe
0,2007-02-01 22:00:00+00:00,A.N,Agilent Technologies Inc,2606900.0,32.19,32.17,0.005938,0.005312,,,-0.001860,0.000622,,,0.034672,1.0
1,2007-02-01 22:00:00+00:00,AAI.N,AirTran Holdings Inc,2051600.0,11.12,11.08,0.004517,-0.007168,,,-0.078708,-0.088066,,,0.027803,0.0
2,2007-02-01 22:00:00+00:00,AAP.N,Advance Auto Parts Inc,1164800.0,37.51,37.99,-0.011594,0.025648,,,0.014332,0.045405,,,0.024433,1.0
3,2007-02-01 22:00:00+00:00,AAPL.O,Apple Inc,23747329.0,84.74,86.23,-0.011548,0.016324,,,-0.048613,-0.037182,,,-0.007425,1.0
4,2007-02-01 22:00:00+00:00,ABB.N,ABB Ltd,1208600.0,18.02,18.01,0.011791,0.025043,,,0.012929,0.020397,,,-0.017994,1.0
5,2007-02-01 22:00:00+00:00,ABC.N,AmerisourceBergen Corp,1657300.0,52.37,52.40,-0.000191,0.008468,,,0.089000,0.077746,,,0.058680,1.0
6,2007-02-01 22:00:00+00:00,ABD.N,ACCO Brands Corp,1186200.0,23.63,24.13,-0.020721,-0.007404,,,0.005104,0.026809,,,-0.044285,0.0
7,2007-02-01 22:00:00+00:00,ABM.N,ABM Industries Inc,301200.0,26.19,25.90,0.013545,0.014890,,,0.068980,0.047311,,,0.016578,0.0
8,2007-02-01 22:00:00+00:00,ABT.N,Abbott Laboratories,5692300.0,52.87,52.50,-0.002453,-0.004739,,,0.001515,-0.004928,,,0.009861,1.0
9,2007-02-01 22:00:00+00:00,ABV.N,Companhia de Bebidas das Americas Ambev,401800.0,52.46,52.10,0.014112,0.024784,,,0.042321,0.027411,,,0.012917,1.0


In [5]:
import lightgbm as lgb


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [10]:
type(df.time[0])

str

In [16]:
from datetime import datetime

In [28]:
t=df.time[3]

In [42]:
t=t[:9]+'7'+t[10:]

In [48]:
# NOTE(review): "%M" is the strptime directive for minutes, not months ("%m").
# With this format the month is never parsed and falls back to January, so a
# date such as "2007-02-01" is read as 2007-01-01 (a Monday). That explains why
# the result recorded below is 0 for every row; the intended format is "%Y-%m-%d".
df['time'].apply(lambda x: datetime.strptime(x.split()[0], "%Y-%M-%d").weekday())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
     ..
70    0
71    0
72    0
73    0
74    0
75    0
76    0
77    0
78    0
79    0
80    0
81    0
82    0
83    0
84    0
85    0
86    0
87    0
88    0
89    0
90    0
91    0
92    0
93    0
94    0
95    0
96    0
97    0
98    0
99    0
Name: time, Length: 100, dtype: int64

In [44]:
# Show the documentation of datetime.weekday() to confirm its numbering.
# "%m" (month) replaces the previous "%M" (minute) so the sample timestamp is
# parsed to its real date; the help text itself does not depend on the date.
help(datetime.strptime(t.split()[0], "%Y-%m-%d").weekday)

Help on built-in function weekday:

weekday(...) method of datetime.datetime instance
    Return the day of the week represented by the date.
    Monday == 0 ... Sunday == 6



In [49]:
help(lgb.train)

Help on function train in module lightgbm.engine:

train(params, train_set, num_boost_round=100, valid_sets=None, valid_names=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, evals_result=None, verbose_eval=True, learning_rates=None, keep_training_booster=False, callbacks=None)
    Perform the training with given parameters.
    
    Parameters
    ----------
    params : dict
        Parameters for training.
    train_set : Dataset
        Data to be trained on.
    num_boost_round : int, optional (default=100)
        Number of boosting iterations.
    valid_sets : list of Datasets or None, optional (default=None)
        List of data to be evaluated on during training.
    valid_names : list of strings or None, optional (default=None)
        Names of ``valid_sets``.
    fobj : callable or None, optional (default=None)
        Customized objective function.
    feval : callable or None, optional (default=None)

In [50]:
lgb.Booster

lightgbm.basic.Booster